refactor(docker): improve server architecture and configuration
Complete overhaul of Docker deployment setup with improved architecture: - Add Redis integration for task management - Implement rate limiting and security middleware - Add Prometheus metrics and health checks - Improve error handling and logging - Add support for streaming responses - Implement proper configuration management - Add platform-specific optimizations for ARM64/AMD64 BREAKING CHANGE: Docker deployment now requires Redis and new config.yml structure
This commit is contained in:
@@ -5,7 +5,7 @@ from typing import List, Tuple, Dict, Optional
|
||||
from rank_bm25 import BM25Okapi
|
||||
from collections import deque
|
||||
from bs4 import NavigableString, Comment
|
||||
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
|
||||
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks
|
||||
from abc import ABC, abstractmethod
|
||||
import math
|
||||
from snowballstemmer import stemmer
|
||||
@@ -23,7 +23,14 @@ from colorama import Fore, Style
|
||||
class RelevantContentFilter(ABC):
|
||||
"""Abstract base class for content filtering strategies"""
|
||||
|
||||
def __init__(self, user_query: str = None):
|
||||
def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None):
|
||||
"""
|
||||
Initializes the RelevantContentFilter class with optional user query.
|
||||
|
||||
Args:
|
||||
user_query (str): User query for filtering (optional).
|
||||
verbose (bool): Enable verbose logging (default: False).
|
||||
"""
|
||||
self.user_query = user_query
|
||||
self.included_tags = {
|
||||
# Primary structure
|
||||
@@ -92,6 +99,8 @@ class RelevantContentFilter(ABC):
|
||||
r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
|
||||
)
|
||||
self.min_word_count = 2
|
||||
self.verbose = False
|
||||
self.logger = logger
|
||||
|
||||
@abstractmethod
|
||||
def filter_content(self, html: str) -> List[str]:
|
||||
@@ -755,8 +764,11 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
base_url: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
extra_args: Dict = None,
|
||||
# char_token_rate: float = WORD_TOKEN_RATE * 5,
|
||||
# chunk_mode: str = "char",
|
||||
verbose: bool = False,
|
||||
logger: Optional[AsyncLogger] = None,
|
||||
ignore_cache: bool = False,
|
||||
):
|
||||
super().__init__(None)
|
||||
self.provider = provider
|
||||
@@ -768,10 +780,15 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
self.instruction = instruction
|
||||
self.chunk_token_threshold = chunk_token_threshold
|
||||
self.overlap_rate = overlap_rate
|
||||
self.word_token_rate = word_token_rate
|
||||
self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
|
||||
# self.chunk_mode: str = chunk_mode
|
||||
# self.char_token_rate = char_token_rate or word_token_rate / 5
|
||||
# self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
|
||||
self.token_rate = word_token_rate or WORD_TOKEN_RATE
|
||||
self.base_url = base_url
|
||||
self.api_base = api_base or base_url
|
||||
self.extra_args = extra_args or {}
|
||||
self.ignore_cache = ignore_cache
|
||||
self.verbose = verbose
|
||||
|
||||
# Setup logger with custom styling for LLM operations
|
||||
@@ -779,7 +796,7 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
self.logger = logger
|
||||
elif verbose:
|
||||
self.logger = AsyncLogger(
|
||||
verbose=True,
|
||||
verbose=verbose,
|
||||
icons={
|
||||
**AsyncLogger.DEFAULT_ICONS,
|
||||
"LLM": "★", # Star for LLM operations
|
||||
@@ -803,45 +820,25 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
return hashlib.md5(content.encode()).hexdigest()
|
||||
|
||||
def _merge_chunks(self, text: str) -> List[str]:
|
||||
"""Split text into chunks with overlap"""
|
||||
# Calculate tokens and sections
|
||||
total_tokens = len(text.split()) * self.word_token_rate
|
||||
num_sections = max(1, math.floor(total_tokens / self.chunk_token_threshold))
|
||||
adjusted_chunk_threshold = total_tokens / num_sections
|
||||
"""Split text into chunks with overlap using char or word mode."""
|
||||
ov = int(self.chunk_token_threshold * self.overlap_rate)
|
||||
sections = merge_chunks(
|
||||
docs = [text],
|
||||
target_size= self.chunk_token_threshold,
|
||||
overlap=ov,
|
||||
word_token_ratio=self.word_token_rate
|
||||
)
|
||||
return sections
|
||||
|
||||
|
||||
|
||||
# Split into words
|
||||
words = text.split()
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_token_count = 0
|
||||
|
||||
for word in words:
|
||||
word_tokens = len(word) * self.word_token_rate
|
||||
if current_token_count + word_tokens <= adjusted_chunk_threshold:
|
||||
current_chunk.append(word)
|
||||
current_token_count += word_tokens
|
||||
else:
|
||||
# Add overlap if not the last chunk
|
||||
if chunks and self.overlap_rate > 0:
|
||||
overlap_size = int(len(current_chunk) * self.overlap_rate)
|
||||
current_chunk.extend(current_chunk[-overlap_size:])
|
||||
|
||||
chunks.append(" ".join(current_chunk))
|
||||
current_chunk = [word]
|
||||
current_token_count = word_tokens
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
|
||||
return chunks
|
||||
|
||||
def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
|
||||
def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
if self.logger:
|
||||
self.logger.info(
|
||||
"Starting LLM content filtering process",
|
||||
"Starting LLM markdown content filtering process",
|
||||
tag="LLM",
|
||||
params={"provider": self.provider},
|
||||
colors={"provider": Fore.CYAN}
|
||||
@@ -853,9 +850,12 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
cache_key = self._get_cache_key(html, self.instruction or "")
|
||||
cache_file = cache_dir / f"{cache_key}.json"
|
||||
|
||||
# if ignore_cache == None:
|
||||
ignore_cache = self.ignore_cache
|
||||
|
||||
if not ignore_cache and cache_file.exists():
|
||||
if self.logger:
|
||||
self.logger.info("Found cached result", tag="CACHE")
|
||||
self.logger.info("Found cached markdown result", tag="CACHE")
|
||||
try:
|
||||
with cache_file.open('r') as f:
|
||||
cached_data = json.load(f)
|
||||
@@ -867,13 +867,13 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
return cached_data['blocks']
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
|
||||
self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE")
|
||||
|
||||
# Split into chunks
|
||||
html_chunks = self._merge_chunks(html)
|
||||
if self.logger:
|
||||
self.logger.info(
|
||||
"Split content into {chunk_count} chunks",
|
||||
"LLM markdown: Split content into {chunk_count} chunks",
|
||||
tag="CHUNK",
|
||||
params={"chunk_count": len(html_chunks)},
|
||||
colors={"chunk_count": Fore.YELLOW}
|
||||
@@ -887,7 +887,7 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
for i, chunk in enumerate(html_chunks):
|
||||
if self.logger:
|
||||
self.logger.debug(
|
||||
"Processing chunk {chunk_num}/{total_chunks}",
|
||||
"LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
|
||||
tag="CHUNK",
|
||||
params={
|
||||
"chunk_num": i + 1,
|
||||
@@ -904,16 +904,38 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
for var, value in prompt_variables.items():
|
||||
prompt = prompt.replace("{" + var + "}", value)
|
||||
|
||||
def _proceed_with_chunk(
|
||||
provider: str,
|
||||
prompt: str,
|
||||
api_token: str,
|
||||
base_url: Optional[str] = None,
|
||||
extra_args: Dict = {}
|
||||
) -> List[str]:
|
||||
if self.logger:
|
||||
self.logger.info(
|
||||
"LLM Markdown: Processing chunk {chunk_num}",
|
||||
tag="CHUNK",
|
||||
params={"chunk_num": i + 1}
|
||||
)
|
||||
return perform_completion_with_backoff(
|
||||
provider,
|
||||
prompt,
|
||||
api_token,
|
||||
base_url=base_url,
|
||||
extra_args=extra_args
|
||||
)
|
||||
|
||||
future = executor.submit(
|
||||
perform_completion_with_backoff,
|
||||
_proceed_with_chunk,
|
||||
self.provider,
|
||||
prompt,
|
||||
self.api_token,
|
||||
base_url=self.api_base,
|
||||
extra_args=self.extra_args
|
||||
self.api_base,
|
||||
self.extra_args
|
||||
)
|
||||
futures.append((i, future))
|
||||
|
||||
|
||||
# Collect results in order
|
||||
ordered_results = []
|
||||
for i, future in sorted(futures):
|
||||
@@ -940,14 +962,14 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
ordered_results.append(blocks)
|
||||
if self.logger:
|
||||
self.logger.success(
|
||||
"Successfully processed chunk {chunk_num}",
|
||||
"LLM markdown: Successfully processed chunk {chunk_num}",
|
||||
tag="CHUNK",
|
||||
params={"chunk_num": i + 1}
|
||||
)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
"Error processing chunk {chunk_num}: {error}",
|
||||
"LLM markdown: Error processing chunk {chunk_num}: {error}",
|
||||
tag="CHUNK",
|
||||
params={
|
||||
"chunk_num": i + 1,
|
||||
@@ -958,7 +980,7 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
end_time = time.time()
|
||||
if self.logger:
|
||||
self.logger.success(
|
||||
"Completed processing in {time:.2f}s",
|
||||
"LLM markdown: Completed processing in {time:.2f}s",
|
||||
tag="LLM",
|
||||
params={"time": end_time - start_time},
|
||||
colors={"time": Fore.YELLOW}
|
||||
|
||||
@@ -21,6 +21,9 @@ from .utils import (
|
||||
extract_xml_data,
|
||||
split_and_parse_json_objects,
|
||||
sanitize_input_encode,
|
||||
chunk_documents,
|
||||
merge_chunks,
|
||||
advanced_split,
|
||||
)
|
||||
from .models import * # noqa: F403
|
||||
|
||||
@@ -501,6 +504,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
instruction: str = None,
|
||||
schema: Dict = None,
|
||||
extraction_type="block",
|
||||
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
|
||||
overlap_rate=OVERLAP_RATE,
|
||||
word_token_rate=WORD_TOKEN_RATE,
|
||||
apply_chunking=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@@ -652,53 +659,16 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
)
|
||||
return blocks
|
||||
|
||||
def _merge(self, documents, chunk_token_threshold, overlap):
|
||||
def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
|
||||
"""
|
||||
Merge documents into sections based on chunk_token_threshold and overlap.
|
||||
"""
|
||||
# chunks = []
|
||||
sections = []
|
||||
total_tokens = 0
|
||||
|
||||
# Calculate the total tokens across all documents
|
||||
for document in documents:
|
||||
total_tokens += len(document.split(" ")) * self.word_token_rate
|
||||
|
||||
# Calculate the number of sections needed
|
||||
num_sections = math.floor(total_tokens / chunk_token_threshold)
|
||||
if num_sections < 1:
|
||||
num_sections = 1 # Ensure there is at least one section
|
||||
adjusted_chunk_threshold = total_tokens / num_sections
|
||||
|
||||
total_token_so_far = 0
|
||||
current_chunk = []
|
||||
|
||||
for document in documents:
|
||||
tokens = document.split(" ")
|
||||
token_count = len(tokens) * self.word_token_rate
|
||||
|
||||
if total_token_so_far + token_count <= adjusted_chunk_threshold:
|
||||
current_chunk.extend(tokens)
|
||||
total_token_so_far += token_count
|
||||
else:
|
||||
# Ensure to handle the last section properly
|
||||
if len(sections) == num_sections - 1:
|
||||
current_chunk.extend(tokens)
|
||||
continue
|
||||
|
||||
# Add overlap if specified
|
||||
if overlap > 0 and current_chunk:
|
||||
overlap_tokens = current_chunk[-overlap:]
|
||||
current_chunk.extend(overlap_tokens)
|
||||
|
||||
sections.append(" ".join(current_chunk))
|
||||
current_chunk = tokens
|
||||
total_token_so_far = token_count
|
||||
|
||||
# Add the last chunk
|
||||
if current_chunk:
|
||||
sections.append(" ".join(current_chunk))
|
||||
|
||||
sections = merge_chunks(
|
||||
docs = documents,
|
||||
target_size= chunk_token_threshold,
|
||||
overlap=overlap,
|
||||
word_token_ratio=self.word_token_rate
|
||||
)
|
||||
return sections
|
||||
|
||||
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from tabnanny import verbose
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from .models import MarkdownGenerationResult
|
||||
from .html2text import CustomHTML2Text
|
||||
@@ -29,9 +30,11 @@ class MarkdownGenerationStrategy(ABC):
|
||||
self,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
verbose: bool = False,
|
||||
):
|
||||
self.content_filter = content_filter
|
||||
self.options = options or {}
|
||||
self.verbose = verbose
|
||||
|
||||
@abstractmethod
|
||||
def generate_markdown(
|
||||
|
||||
@@ -206,17 +206,6 @@ Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags.
|
||||
|
||||
PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.
|
||||
|
||||
INPUT HTML:
|
||||
<|HTML_CONTENT_START|>
|
||||
{HTML}
|
||||
<|HTML_CONTENT_END|>
|
||||
|
||||
|
||||
SPECIFIC INSTRUCTION:
|
||||
<|USER_INSTRUCTION_START|>
|
||||
{REQUEST}
|
||||
<|USER_INSTRUCTION_END|>
|
||||
|
||||
TASK DETAILS:
|
||||
1. Content Selection
|
||||
- DO: Keep essential information, main content, key details
|
||||
@@ -240,15 +229,7 @@ TASK DETAILS:
|
||||
- DON'T: Fragment related content
|
||||
- DON'T: Duplicate information
|
||||
|
||||
Example Input:
|
||||
<div class="main-content"><h1>Setup Guide</h1><p>Follow these steps...</p></div>
|
||||
<div class="sidebar">Related articles...</div>
|
||||
|
||||
Example Output:
|
||||
# Setup Guide
|
||||
Follow these steps...
|
||||
|
||||
IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.
|
||||
IMPORTANT: If user specific instruction is provided, ignore above guideline and prioritize those requirements over these general guidelines.
|
||||
|
||||
OUTPUT FORMAT:
|
||||
Wrap your response in <content> tags. Use proper markdown throughout.
|
||||
@@ -256,7 +237,18 @@ Wrap your response in <content> tags. Use proper markdown throughout.
|
||||
[Your markdown content here]
|
||||
</content>
|
||||
|
||||
Begin filtering now."""
|
||||
Begin filtering now.
|
||||
|
||||
--------------------------------------------
|
||||
|
||||
<|HTML_CONTENT_START|>
|
||||
{HTML}
|
||||
<|HTML_CONTENT_END|>
|
||||
|
||||
<|USER_INSTRUCTION_START|>
|
||||
{REQUEST}
|
||||
<|USER_INSTRUCTION_END|>
|
||||
"""
|
||||
|
||||
JSON_SCHEMA_BUILDER= """
|
||||
# HTML Schema Generation Instructions
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from ast import Call
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
@@ -8,9 +9,10 @@ import re
|
||||
import os
|
||||
import platform
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from array import array
|
||||
from .config import *
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from typing import Dict, Any, List, Tuple, Union, Optional, Callable
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
from requests.exceptions import InvalidSchema
|
||||
@@ -31,6 +33,154 @@ import aiohttp
|
||||
from pathlib import Path
|
||||
from packaging import version
|
||||
from . import __version__
|
||||
from typing import Sequence, List
|
||||
from array import array
|
||||
from itertools import chain
|
||||
from collections import deque
|
||||
from typing import Callable, Generator, Iterable, List, Optional
|
||||
|
||||
def chunk_documents(
|
||||
documents: Iterable[str],
|
||||
chunk_token_threshold: int,
|
||||
overlap: int,
|
||||
word_token_rate: float = 0.75,
|
||||
tokenizer: Optional[Callable[[str], List[str]]] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
"""
|
||||
Efficiently chunks documents into token-limited sections with overlap between chunks.
|
||||
|
||||
Args:
|
||||
documents: Iterable of document strings
|
||||
chunk_token_threshold: Maximum tokens per chunk
|
||||
overlap: Number of tokens to overlap between chunks
|
||||
word_token_rate: Token estimate per word when not using a tokenizer
|
||||
tokenizer: Function that splits text into tokens (if available)
|
||||
|
||||
Yields:
|
||||
Text chunks as strings
|
||||
"""
|
||||
token_queue = deque()
|
||||
contribution_queue = deque()
|
||||
current_token_count = 0.0
|
||||
|
||||
for doc in documents:
|
||||
# Tokenize document
|
||||
if tokenizer:
|
||||
tokens = tokenizer(doc)
|
||||
contributions = [1.0] * len(tokens)
|
||||
else:
|
||||
tokens = doc.split()
|
||||
contributions = [word_token_rate] * len(tokens)
|
||||
|
||||
# Add to processing queues
|
||||
token_queue.extend(tokens)
|
||||
contribution_queue.extend(contributions)
|
||||
current_token_count += sum(contributions)
|
||||
|
||||
# Process full chunks
|
||||
while current_token_count >= chunk_token_threshold:
|
||||
# Find chunk split point
|
||||
chunk_tokens = []
|
||||
chunk_contrib = []
|
||||
chunk_total = 0.0
|
||||
|
||||
# Build chunk up to threshold
|
||||
while contribution_queue:
|
||||
next_contrib = contribution_queue[0]
|
||||
if chunk_total + next_contrib > chunk_token_threshold:
|
||||
break
|
||||
|
||||
chunk_total += next_contrib
|
||||
chunk_contrib.append(contribution_queue.popleft())
|
||||
chunk_tokens.append(token_queue.popleft())
|
||||
|
||||
# Handle edge case where first token exceeds threshold
|
||||
if not chunk_contrib: # Single token exceeds threshold
|
||||
chunk_contrib.append(contribution_queue.popleft())
|
||||
chunk_tokens.append(token_queue.popleft())
|
||||
|
||||
# Calculate overlap
|
||||
overlap_total = 0.0
|
||||
overlap_idx = 0
|
||||
for contrib in reversed(chunk_contrib):
|
||||
if overlap_total + contrib > overlap:
|
||||
break
|
||||
overlap_total += contrib
|
||||
overlap_idx += 1
|
||||
|
||||
# Prepend overlap to queues
|
||||
if overlap_idx > 0:
|
||||
overlap_tokens = chunk_tokens[-overlap_idx:]
|
||||
overlap_contrib = chunk_contrib[-overlap_idx:]
|
||||
|
||||
token_queue.extendleft(reversed(overlap_tokens))
|
||||
contribution_queue.extendleft(reversed(overlap_contrib))
|
||||
current_token_count += overlap_total
|
||||
|
||||
# Update current token count and yield chunk
|
||||
current_token_count -= sum(chunk_contrib)
|
||||
yield " ".join(chunk_tokens[:len(chunk_tokens)-overlap_idx] if overlap_idx else chunk_tokens)
|
||||
|
||||
# Yield remaining tokens
|
||||
if token_queue:
|
||||
yield " ".join(token_queue)
|
||||
|
||||
def merge_chunks(
|
||||
docs: Sequence[str],
|
||||
target_size: int,
|
||||
overlap: int = 0,
|
||||
word_token_ratio: float = 1.0,
|
||||
splitter: Callable = None
|
||||
) -> List[str]:
|
||||
"""Merges documents into chunks of specified token size.
|
||||
|
||||
Args:
|
||||
docs: Input documents
|
||||
target_size: Desired token count per chunk
|
||||
overlap: Number of tokens to overlap between chunks
|
||||
word_token_ratio: Multiplier for word->token conversion
|
||||
"""
|
||||
# Pre-tokenize all docs and store token counts
|
||||
splitter = splitter or str.split
|
||||
token_counts = array('I')
|
||||
all_tokens: List[List[str]] = []
|
||||
total_tokens = 0
|
||||
|
||||
for doc in docs:
|
||||
tokens = doc.split()
|
||||
count = int(len(tokens) * word_token_ratio)
|
||||
if count: # Skip empty docs
|
||||
token_counts.append(count)
|
||||
all_tokens.append(tokens)
|
||||
total_tokens += count
|
||||
|
||||
if not total_tokens:
|
||||
return []
|
||||
|
||||
# Pre-allocate chunks
|
||||
num_chunks = max(1, (total_tokens + target_size - 1) // target_size)
|
||||
chunks: List[List[str]] = [[] for _ in range(num_chunks)]
|
||||
|
||||
curr_chunk = 0
|
||||
curr_size = 0
|
||||
|
||||
# Distribute tokens
|
||||
for tokens in chain.from_iterable(all_tokens):
|
||||
if curr_size >= target_size and curr_chunk < num_chunks - 1:
|
||||
if overlap > 0:
|
||||
overlap_tokens = chunks[curr_chunk][-overlap:]
|
||||
curr_chunk += 1
|
||||
chunks[curr_chunk].extend(overlap_tokens)
|
||||
curr_size = len(overlap_tokens)
|
||||
else:
|
||||
curr_chunk += 1
|
||||
curr_size = 0
|
||||
|
||||
chunks[curr_chunk].append(tokens)
|
||||
curr_size += 1
|
||||
|
||||
# Return only non-empty chunks
|
||||
return [' '.join(chunk) for chunk in chunks if chunk]
|
||||
|
||||
|
||||
class VersionManager:
|
||||
@@ -189,6 +339,77 @@ class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
SPLITS = bytearray([
|
||||
# Control chars (0-31) + space (32)
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
# Special chars (33-47): ! " # $ % & ' ( ) * + , - . /
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
# Numbers (48-57): Treat as non-splits
|
||||
0,0,0,0,0,0,0,0,0,0,
|
||||
# More special chars (58-64): : ; < = > ? @
|
||||
1,1,1,1,1,1,1,
|
||||
# Uppercase (65-90): Keep
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
# More special chars (91-96): [ \ ] ^ _ `
|
||||
1,1,1,1,1,1,
|
||||
# Lowercase (97-122): Keep
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
# Special chars (123-126): { | } ~
|
||||
1,1,1,1,
|
||||
# Extended ASCII
|
||||
*([1] * 128)
|
||||
])
|
||||
|
||||
# Additional split chars for HTML/code
|
||||
HTML_CODE_CHARS = {
|
||||
# HTML specific
|
||||
'•', '►', '▼', '©', '®', '™', '→', '⇒', '≈', '≤', '≥',
|
||||
# Programming symbols
|
||||
'+=', '-=', '*=', '/=', '=>', '<=>', '!=', '==', '===',
|
||||
'++', '--', '<<', '>>', '&&', '||', '??', '?:', '?.',
|
||||
# Common Unicode
|
||||
'…', '"', '"', ''', ''', '«', '»', '—', '–',
|
||||
# Additional splits
|
||||
'+', '=', '~', '@', '#', '$', '%', '^', '&', '*',
|
||||
'(', ')', '{', '}', '[', ']', '|', '\\', '/', '`',
|
||||
'<', '>', ',', '.', '?', '!', ':', ';', '-', '_'
|
||||
}
|
||||
|
||||
def advanced_split(text: str) -> list[str]:
|
||||
result = []
|
||||
word = array('u')
|
||||
|
||||
i = 0
|
||||
text_len = len(text)
|
||||
|
||||
while i < text_len:
|
||||
char = text[i]
|
||||
o = ord(char)
|
||||
|
||||
# Fast path for ASCII
|
||||
if o < 256 and SPLITS[o]:
|
||||
if word:
|
||||
result.append(word.tounicode())
|
||||
word = array('u')
|
||||
# Check for multi-char symbols
|
||||
elif i < text_len - 1:
|
||||
two_chars = char + text[i + 1]
|
||||
if two_chars in HTML_CODE_CHARS:
|
||||
if word:
|
||||
result.append(word.tounicode())
|
||||
word = array('u')
|
||||
i += 1 # Skip next char since we used it
|
||||
else:
|
||||
word.append(char)
|
||||
else:
|
||||
word.append(char)
|
||||
i += 1
|
||||
|
||||
if word:
|
||||
result.append(word.tounicode())
|
||||
|
||||
return result
|
||||
|
||||
def create_box_message(
|
||||
message: str,
|
||||
type: str = "info",
|
||||
|
||||
Reference in New Issue
Block a user