refactor(docker): improve server architecture and configuration

Complete overhaul of Docker deployment setup with improved architecture:
- Add Redis integration for task management
- Implement rate limiting and security middleware
- Add Prometheus metrics and health checks
- Improve error handling and logging
- Add support for streaming responses
- Implement proper configuration management
- Add platform-specific optimizations for ARM64/AMD64

BREAKING CHANGE: Docker deployment now requires Redis and new config.yml structure
This commit is contained in:
UncleCode
2025-02-02 20:19:51 +08:00
parent 7b1ef07c41
commit 33a21d6a7a
16 changed files with 1918 additions and 344 deletions

View File

@@ -5,7 +5,7 @@ from typing import List, Tuple, Dict, Optional
from rank_bm25 import BM25Okapi
from collections import deque
from bs4 import NavigableString, Comment
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
@@ -23,7 +23,14 @@ from colorama import Fore, Style
class RelevantContentFilter(ABC):
"""Abstract base class for content filtering strategies"""
def __init__(self, user_query: str = None):
def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None):
"""
Initializes the RelevantContentFilter class with optional user query.
Args:
user_query (str): User query for filtering (optional).
verbose (bool): Enable verbose logging (default: False).
"""
self.user_query = user_query
self.included_tags = {
# Primary structure
@@ -92,6 +99,8 @@ class RelevantContentFilter(ABC):
r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
)
self.min_word_count = 2
self.verbose = False
self.logger = logger
@abstractmethod
def filter_content(self, html: str) -> List[str]:
@@ -755,8 +764,11 @@ class LLMContentFilter(RelevantContentFilter):
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
# char_token_rate: float = WORD_TOKEN_RATE * 5,
# chunk_mode: str = "char",
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
ignore_cache: bool = False,
):
super().__init__(None)
self.provider = provider
@@ -768,10 +780,15 @@ class LLMContentFilter(RelevantContentFilter):
self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate
self.word_token_rate = word_token_rate
self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
# self.chunk_mode: str = chunk_mode
# self.char_token_rate = char_token_rate or word_token_rate / 5
# self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
self.token_rate = word_token_rate or WORD_TOKEN_RATE
self.base_url = base_url
self.api_base = api_base or base_url
self.extra_args = extra_args or {}
self.ignore_cache = ignore_cache
self.verbose = verbose
# Setup logger with custom styling for LLM operations
@@ -779,7 +796,7 @@ class LLMContentFilter(RelevantContentFilter):
self.logger = logger
elif verbose:
self.logger = AsyncLogger(
verbose=True,
verbose=verbose,
icons={
**AsyncLogger.DEFAULT_ICONS,
"LLM": "", # Star for LLM operations
@@ -803,45 +820,25 @@ class LLMContentFilter(RelevantContentFilter):
return hashlib.md5(content.encode()).hexdigest()
def _merge_chunks(self, text: str) -> List[str]:
"""Split text into chunks with overlap"""
# Calculate tokens and sections
total_tokens = len(text.split()) * self.word_token_rate
num_sections = max(1, math.floor(total_tokens / self.chunk_token_threshold))
adjusted_chunk_threshold = total_tokens / num_sections
"""Split text into chunks with overlap using char or word mode."""
ov = int(self.chunk_token_threshold * self.overlap_rate)
sections = merge_chunks(
docs = [text],
target_size= self.chunk_token_threshold,
overlap=ov,
word_token_ratio=self.word_token_rate
)
return sections
# Split into words
words = text.split()
chunks = []
current_chunk = []
current_token_count = 0
for word in words:
word_tokens = len(word) * self.word_token_rate
if current_token_count + word_tokens <= adjusted_chunk_threshold:
current_chunk.append(word)
current_token_count += word_tokens
else:
# Add overlap if not the last chunk
if chunks and self.overlap_rate > 0:
overlap_size = int(len(current_chunk) * self.overlap_rate)
current_chunk.extend(current_chunk[-overlap_size:])
chunks.append(" ".join(current_chunk))
current_chunk = [word]
current_token_count = word_tokens
if current_chunk:
chunks.append(" ".join(current_chunk))
return chunks
def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
if not html or not isinstance(html, str):
return []
if self.logger:
self.logger.info(
"Starting LLM content filtering process",
"Starting LLM markdown content filtering process",
tag="LLM",
params={"provider": self.provider},
colors={"provider": Fore.CYAN}
@@ -853,9 +850,12 @@ class LLMContentFilter(RelevantContentFilter):
cache_key = self._get_cache_key(html, self.instruction or "")
cache_file = cache_dir / f"{cache_key}.json"
# if ignore_cache == None:
ignore_cache = self.ignore_cache
if not ignore_cache and cache_file.exists():
if self.logger:
self.logger.info("Found cached result", tag="CACHE")
self.logger.info("Found cached markdown result", tag="CACHE")
try:
with cache_file.open('r') as f:
cached_data = json.load(f)
@@ -867,13 +867,13 @@ class LLMContentFilter(RelevantContentFilter):
return cached_data['blocks']
except Exception as e:
if self.logger:
self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE")
# Split into chunks
html_chunks = self._merge_chunks(html)
if self.logger:
self.logger.info(
"Split content into {chunk_count} chunks",
"LLM markdown: Split content into {chunk_count} chunks",
tag="CHUNK",
params={"chunk_count": len(html_chunks)},
colors={"chunk_count": Fore.YELLOW}
@@ -887,7 +887,7 @@ class LLMContentFilter(RelevantContentFilter):
for i, chunk in enumerate(html_chunks):
if self.logger:
self.logger.debug(
"Processing chunk {chunk_num}/{total_chunks}",
"LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
tag="CHUNK",
params={
"chunk_num": i + 1,
@@ -904,16 +904,38 @@ class LLMContentFilter(RelevantContentFilter):
for var, value in prompt_variables.items():
prompt = prompt.replace("{" + var + "}", value)
def _proceed_with_chunk(
provider: str,
prompt: str,
api_token: str,
base_url: Optional[str] = None,
extra_args: Dict = {}
) -> List[str]:
if self.logger:
self.logger.info(
"LLM Markdown: Processing chunk {chunk_num}",
tag="CHUNK",
params={"chunk_num": i + 1}
)
return perform_completion_with_backoff(
provider,
prompt,
api_token,
base_url=base_url,
extra_args=extra_args
)
future = executor.submit(
perform_completion_with_backoff,
_proceed_with_chunk,
self.provider,
prompt,
self.api_token,
base_url=self.api_base,
extra_args=self.extra_args
self.api_base,
self.extra_args
)
futures.append((i, future))
# Collect results in order
ordered_results = []
for i, future in sorted(futures):
@@ -940,14 +962,14 @@ class LLMContentFilter(RelevantContentFilter):
ordered_results.append(blocks)
if self.logger:
self.logger.success(
"Successfully processed chunk {chunk_num}",
"LLM markdown: Successfully processed chunk {chunk_num}",
tag="CHUNK",
params={"chunk_num": i + 1}
)
except Exception as e:
if self.logger:
self.logger.error(
"Error processing chunk {chunk_num}: {error}",
"LLM markdown: Error processing chunk {chunk_num}: {error}",
tag="CHUNK",
params={
"chunk_num": i + 1,
@@ -958,7 +980,7 @@ class LLMContentFilter(RelevantContentFilter):
end_time = time.time()
if self.logger:
self.logger.success(
"Completed processing in {time:.2f}s",
"LLM markdown: Completed processing in {time:.2f}s",
tag="LLM",
params={"time": end_time - start_time},
colors={"time": Fore.YELLOW}

View File

@@ -21,6 +21,9 @@ from .utils import (
extract_xml_data,
split_and_parse_json_objects,
sanitize_input_encode,
chunk_documents,
merge_chunks,
advanced_split,
)
from .models import * # noqa: F403
@@ -501,6 +504,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
instruction: str = None,
schema: Dict = None,
extraction_type="block",
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
overlap_rate=OVERLAP_RATE,
word_token_rate=WORD_TOKEN_RATE,
apply_chunking=True,
**kwargs,
):
"""
@@ -652,53 +659,16 @@ class LLMExtractionStrategy(ExtractionStrategy):
)
return blocks
def _merge(self, documents, chunk_token_threshold, overlap):
def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
"""
Merge documents into sections based on chunk_token_threshold and overlap.
"""
# chunks = []
sections = []
total_tokens = 0
# Calculate the total tokens across all documents
for document in documents:
total_tokens += len(document.split(" ")) * self.word_token_rate
# Calculate the number of sections needed
num_sections = math.floor(total_tokens / chunk_token_threshold)
if num_sections < 1:
num_sections = 1 # Ensure there is at least one section
adjusted_chunk_threshold = total_tokens / num_sections
total_token_so_far = 0
current_chunk = []
for document in documents:
tokens = document.split(" ")
token_count = len(tokens) * self.word_token_rate
if total_token_so_far + token_count <= adjusted_chunk_threshold:
current_chunk.extend(tokens)
total_token_so_far += token_count
else:
# Ensure to handle the last section properly
if len(sections) == num_sections - 1:
current_chunk.extend(tokens)
continue
# Add overlap if specified
if overlap > 0 and current_chunk:
overlap_tokens = current_chunk[-overlap:]
current_chunk.extend(overlap_tokens)
sections.append(" ".join(current_chunk))
current_chunk = tokens
total_token_so_far = token_count
# Add the last chunk
if current_chunk:
sections.append(" ".join(current_chunk))
sections = merge_chunks(
docs = documents,
target_size= chunk_token_threshold,
overlap=overlap,
word_token_ratio=self.word_token_rate
)
return sections
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:

View File

@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from tabnanny import verbose
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
@@ -29,9 +30,11 @@ class MarkdownGenerationStrategy(ABC):
self,
content_filter: Optional[RelevantContentFilter] = None,
options: Optional[Dict[str, Any]] = None,
verbose: bool = False,
):
self.content_filter = content_filter
self.options = options or {}
self.verbose = verbose
@abstractmethod
def generate_markdown(

View File

@@ -206,17 +206,6 @@ Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags.
PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.
INPUT HTML:
<|HTML_CONTENT_START|>
{HTML}
<|HTML_CONTENT_END|>
SPECIFIC INSTRUCTION:
<|USER_INSTRUCTION_START|>
{REQUEST}
<|USER_INSTRUCTION_END|>
TASK DETAILS:
1. Content Selection
- DO: Keep essential information, main content, key details
@@ -240,15 +229,7 @@ TASK DETAILS:
- DON'T: Fragment related content
- DON'T: Duplicate information
Example Input:
<div class="main-content"><h1>Setup Guide</h1><p>Follow these steps...</p></div>
<div class="sidebar">Related articles...</div>
Example Output:
# Setup Guide
Follow these steps...
IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.
IMPORTANT: If user specific instruction is provided, ignore above guideline and prioritize those requirements over these general guidelines.
OUTPUT FORMAT:
Wrap your response in <content> tags. Use proper markdown throughout.
@@ -256,7 +237,18 @@ Wrap your response in <content> tags. Use proper markdown throughout.
[Your markdown content here]
</content>
Begin filtering now."""
Begin filtering now.
--------------------------------------------
<|HTML_CONTENT_START|>
{HTML}
<|HTML_CONTENT_END|>
<|USER_INSTRUCTION_START|>
{REQUEST}
<|USER_INSTRUCTION_END|>
"""
JSON_SCHEMA_BUILDER= """
# HTML Schema Generation Instructions

View File

@@ -1,3 +1,4 @@
from ast import Call
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -8,9 +9,10 @@ import re
import os
import platform
from .prompts import PROMPT_EXTRACT_BLOCKS
from array import array
from .config import *
from pathlib import Path
from typing import Dict, Any
from typing import Dict, Any, List, Tuple, Union, Optional, Callable
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
@@ -31,6 +33,154 @@ import aiohttp
from pathlib import Path
from packaging import version
from . import __version__
from typing import Sequence, List
from array import array
from itertools import chain
from collections import deque
from typing import Callable, Generator, Iterable, List, Optional
def chunk_documents(
documents: Iterable[str],
chunk_token_threshold: int,
overlap: int,
word_token_rate: float = 0.75,
tokenizer: Optional[Callable[[str], List[str]]] = None,
) -> Generator[str, None, None]:
"""
Efficiently chunks documents into token-limited sections with overlap between chunks.
Args:
documents: Iterable of document strings
chunk_token_threshold: Maximum tokens per chunk
overlap: Number of tokens to overlap between chunks
word_token_rate: Token estimate per word when not using a tokenizer
tokenizer: Function that splits text into tokens (if available)
Yields:
Text chunks as strings
"""
token_queue = deque()
contribution_queue = deque()
current_token_count = 0.0
for doc in documents:
# Tokenize document
if tokenizer:
tokens = tokenizer(doc)
contributions = [1.0] * len(tokens)
else:
tokens = doc.split()
contributions = [word_token_rate] * len(tokens)
# Add to processing queues
token_queue.extend(tokens)
contribution_queue.extend(contributions)
current_token_count += sum(contributions)
# Process full chunks
while current_token_count >= chunk_token_threshold:
# Find chunk split point
chunk_tokens = []
chunk_contrib = []
chunk_total = 0.0
# Build chunk up to threshold
while contribution_queue:
next_contrib = contribution_queue[0]
if chunk_total + next_contrib > chunk_token_threshold:
break
chunk_total += next_contrib
chunk_contrib.append(contribution_queue.popleft())
chunk_tokens.append(token_queue.popleft())
# Handle edge case where first token exceeds threshold
if not chunk_contrib: # Single token exceeds threshold
chunk_contrib.append(contribution_queue.popleft())
chunk_tokens.append(token_queue.popleft())
# Calculate overlap
overlap_total = 0.0
overlap_idx = 0
for contrib in reversed(chunk_contrib):
if overlap_total + contrib > overlap:
break
overlap_total += contrib
overlap_idx += 1
# Prepend overlap to queues
if overlap_idx > 0:
overlap_tokens = chunk_tokens[-overlap_idx:]
overlap_contrib = chunk_contrib[-overlap_idx:]
token_queue.extendleft(reversed(overlap_tokens))
contribution_queue.extendleft(reversed(overlap_contrib))
current_token_count += overlap_total
# Update current token count and yield chunk
current_token_count -= sum(chunk_contrib)
yield " ".join(chunk_tokens[:len(chunk_tokens)-overlap_idx] if overlap_idx else chunk_tokens)
# Yield remaining tokens
if token_queue:
yield " ".join(token_queue)
def merge_chunks(
docs: Sequence[str],
target_size: int,
overlap: int = 0,
word_token_ratio: float = 1.0,
splitter: Callable = None
) -> List[str]:
"""Merges documents into chunks of specified token size.
Args:
docs: Input documents
target_size: Desired token count per chunk
overlap: Number of tokens to overlap between chunks
word_token_ratio: Multiplier for word->token conversion
"""
# Pre-tokenize all docs and store token counts
splitter = splitter or str.split
token_counts = array('I')
all_tokens: List[List[str]] = []
total_tokens = 0
for doc in docs:
tokens = doc.split()
count = int(len(tokens) * word_token_ratio)
if count: # Skip empty docs
token_counts.append(count)
all_tokens.append(tokens)
total_tokens += count
if not total_tokens:
return []
# Pre-allocate chunks
num_chunks = max(1, (total_tokens + target_size - 1) // target_size)
chunks: List[List[str]] = [[] for _ in range(num_chunks)]
curr_chunk = 0
curr_size = 0
# Distribute tokens
for tokens in chain.from_iterable(all_tokens):
if curr_size >= target_size and curr_chunk < num_chunks - 1:
if overlap > 0:
overlap_tokens = chunks[curr_chunk][-overlap:]
curr_chunk += 1
chunks[curr_chunk].extend(overlap_tokens)
curr_size = len(overlap_tokens)
else:
curr_chunk += 1
curr_size = 0
chunks[curr_chunk].append(tokens)
curr_size += 1
# Return only non-empty chunks
return [' '.join(chunk) for chunk in chunks if chunk]
class VersionManager:
@@ -189,6 +339,77 @@ class InvalidCSSSelectorError(Exception):
pass
SPLITS = bytearray([
# Control chars (0-31) + space (32)
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# Special chars (33-47): ! " # $ % & ' ( ) * + , - . /
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# Numbers (48-57): Treat as non-splits
0,0,0,0,0,0,0,0,0,0,
# More special chars (58-64): : ; < = > ? @
1,1,1,1,1,1,1,
# Uppercase (65-90): Keep
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# More special chars (91-96): [ \ ] ^ _ `
1,1,1,1,1,1,
# Lowercase (97-122): Keep
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# Special chars (123-126): { | } ~
1,1,1,1,
# Extended ASCII
*([1] * 128)
])
# Additional split chars for HTML/code
HTML_CODE_CHARS = {
# HTML specific
'', '', '', '©', '®', '', '', '', '', '', '',
# Programming symbols
'+=', '-=', '*=', '/=', '=>', '<=>', '!=', '==', '===',
'++', '--', '<<', '>>', '&&', '||', '??', '?:', '?.',
# Common Unicode
'', '"', '"', ''', ''', '«', '»', '', '',
# Additional splits
'+', '=', '~', '@', '#', '$', '%', '^', '&', '*',
'(', ')', '{', '}', '[', ']', '|', '\\', '/', '`',
'<', '>', ',', '.', '?', '!', ':', ';', '-', '_'
}
def advanced_split(text: str) -> list[str]:
result = []
word = array('u')
i = 0
text_len = len(text)
while i < text_len:
char = text[i]
o = ord(char)
# Fast path for ASCII
if o < 256 and SPLITS[o]:
if word:
result.append(word.tounicode())
word = array('u')
# Check for multi-char symbols
elif i < text_len - 1:
two_chars = char + text[i + 1]
if two_chars in HTML_CODE_CHARS:
if word:
result.append(word.tounicode())
word = array('u')
i += 1 # Skip next char since we used it
else:
word.append(char)
else:
word.append(char)
i += 1
if word:
result.append(word.tounicode())
return result
def create_box_message(
message: str,
type: str = "info",