refactor(docker): improve server architecture and configuration

Complete overhaul of Docker deployment setup with improved architecture: - Add Redis integration for task management - Implement rate limiting and security middleware - Add Prometheus metrics and health checks - Improve error handling and logging - Add support for streaming responses - Implement proper configuration management - Add platform-specific optimizations for ARM64/AMD64 BREAKING CHANGE: Docker deployment now requires Redis and new config.yml structure
2025-02-02 20:19:51 +08:00
parent 7b1ef07c41
commit 33a21d6a7a
16 changed files with 1918 additions and 344 deletions
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -5,7 +5,7 @@ from typing import List, Tuple, Dict, Optional
 from rank_bm25 import BM25Okapi
 from collections import deque
 from bs4 import NavigableString, Comment
-from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
+from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks
 from abc import ABC, abstractmethod
 import math
 from snowballstemmer import stemmer
@@ -23,7 +23,14 @@ from colorama import Fore, Style
 class RelevantContentFilter(ABC):
    """Abstract base class for content filtering strategies"""

-    def __init__(self, user_query: str = None):
+    def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None):
+        """
+        Initializes the RelevantContentFilter class with optional user query.
+
+        Args:
+            user_query (str): User query for filtering (optional).
+            verbose (bool): Enable verbose logging (default: False).
+        """
        self.user_query = user_query
        self.included_tags = {
            # Primary structure
@@ -92,6 +99,8 @@ class RelevantContentFilter(ABC):
            r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
        )
        self.min_word_count = 2
+        self.verbose = False
+        self.logger = logger

    @abstractmethod
    def filter_content(self, html: str) -> List[str]:
@@ -755,8 +764,11 @@ class LLMContentFilter(RelevantContentFilter):
        base_url: Optional[str] = None,
        api_base: Optional[str] = None,
        extra_args: Dict = None,
+        # char_token_rate: float = WORD_TOKEN_RATE * 5,
+        # chunk_mode: str = "char",
        verbose: bool = False,
        logger: Optional[AsyncLogger] = None,
+        ignore_cache: bool = False,
    ):
        super().__init__(None)
        self.provider = provider
@@ -768,10 +780,15 @@ class LLMContentFilter(RelevantContentFilter):
        self.instruction = instruction
        self.chunk_token_threshold = chunk_token_threshold
        self.overlap_rate = overlap_rate
-        self.word_token_rate = word_token_rate
+        self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
+        # self.chunk_mode: str = chunk_mode
+        # self.char_token_rate = char_token_rate or word_token_rate / 5
+        # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
+        self.token_rate = word_token_rate or WORD_TOKEN_RATE
        self.base_url = base_url
        self.api_base = api_base or base_url
        self.extra_args = extra_args or {}
+        self.ignore_cache = ignore_cache
        self.verbose = verbose
        
        # Setup logger with custom styling for LLM operations
@@ -779,7 +796,7 @@ class LLMContentFilter(RelevantContentFilter):
            self.logger = logger
        elif verbose:
            self.logger = AsyncLogger(
-                verbose=True,
+                verbose=verbose,
                icons={
                    **AsyncLogger.DEFAULT_ICONS,
                    "LLM": "★",  # Star for LLM operations
@@ -803,45 +820,25 @@ class LLMContentFilter(RelevantContentFilter):
        return hashlib.md5(content.encode()).hexdigest()

    def _merge_chunks(self, text: str) -> List[str]:
-        """Split text into chunks with overlap"""
-        # Calculate tokens and sections
-        total_tokens = len(text.split()) * self.word_token_rate
-        num_sections = max(1, math.floor(total_tokens / self.chunk_token_threshold))
-        adjusted_chunk_threshold = total_tokens / num_sections
+        """Split text into chunks with overlap using char or word mode."""
+        ov = int(self.chunk_token_threshold * self.overlap_rate)
+        sections = merge_chunks(
+            docs = [text],
+            target_size= self.chunk_token_threshold,
+            overlap=ov,
+            word_token_ratio=self.word_token_rate
+        )
+        return sections
+    
+        

-        # Split into words
-        words = text.split()
-        chunks = []
-        current_chunk = []
-        current_token_count = 0
-
-        for word in words:
-            word_tokens = len(word) * self.word_token_rate
-            if current_token_count + word_tokens <= adjusted_chunk_threshold:
-                current_chunk.append(word)
-                current_token_count += word_tokens
-            else:
-                # Add overlap if not the last chunk
-                if chunks and self.overlap_rate > 0:
-                    overlap_size = int(len(current_chunk) * self.overlap_rate)
-                    current_chunk.extend(current_chunk[-overlap_size:])
-                
-                chunks.append(" ".join(current_chunk))
-                current_chunk = [word]
-                current_token_count = word_tokens
-
-        if current_chunk:
-            chunks.append(" ".join(current_chunk))
-
-        return chunks
-
-    def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
+    def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
        if not html or not isinstance(html, str):
            return []

        if self.logger:
            self.logger.info(
-                "Starting LLM content filtering process", 
+                "Starting LLM markdown content filtering process", 
                tag="LLM",
                params={"provider": self.provider},
                colors={"provider": Fore.CYAN}
@@ -853,9 +850,12 @@ class LLMContentFilter(RelevantContentFilter):
        cache_key = self._get_cache_key(html, self.instruction or "")
        cache_file = cache_dir / f"{cache_key}.json"

+        # if ignore_cache == None:
+        ignore_cache = self.ignore_cache
+
        if not ignore_cache and cache_file.exists():
            if self.logger:
-                self.logger.info("Found cached result", tag="CACHE")
+                self.logger.info("Found  cached markdown result", tag="CACHE")
            try:
                with cache_file.open('r') as f:
                    cached_data = json.load(f)
@@ -867,13 +867,13 @@ class LLMContentFilter(RelevantContentFilter):
                    return cached_data['blocks']
            except Exception as e:
                if self.logger:
-                    self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
+                    self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE")

        # Split into chunks
        html_chunks = self._merge_chunks(html)
        if self.logger:
            self.logger.info(
-                "Split content into {chunk_count} chunks", 
+                "LLM markdown: Split content into {chunk_count} chunks", 
                tag="CHUNK",
                params={"chunk_count": len(html_chunks)},
                colors={"chunk_count": Fore.YELLOW}
@@ -887,7 +887,7 @@ class LLMContentFilter(RelevantContentFilter):
            for i, chunk in enumerate(html_chunks):
                if self.logger:
                    self.logger.debug(
-                        "Processing chunk {chunk_num}/{total_chunks}", 
+                        "LLM markdown: Processing chunk {chunk_num}/{total_chunks}", 
                        tag="CHUNK",
                        params={
                            "chunk_num": i + 1,
@@ -904,16 +904,38 @@ class LLMContentFilter(RelevantContentFilter):
                for var, value in prompt_variables.items():
                    prompt = prompt.replace("{" + var + "}", value)

+                def _proceed_with_chunk(
+                        provider: str,
+                        prompt: str,
+                        api_token: str,
+                        base_url: Optional[str] = None,
+                        extra_args: Dict = {}
+                    ) -> List[str]:
+                    if self.logger:
+                        self.logger.info(
+                            "LLM Markdown: Processing chunk {chunk_num}", 
+                            tag="CHUNK",
+                            params={"chunk_num": i + 1}
+                        )
+                    return perform_completion_with_backoff(
+                        provider,
+                        prompt,
+                        api_token,
+                        base_url=base_url,
+                        extra_args=extra_args
+                    )
+
                future = executor.submit(
-                    perform_completion_with_backoff,
+                    _proceed_with_chunk,
                    self.provider,
                    prompt,
                    self.api_token,
-                    base_url=self.api_base,
-                    extra_args=self.extra_args
+                    self.api_base,
+                    self.extra_args
                )
                futures.append((i, future))

+
            # Collect results in order
            ordered_results = []
            for i, future in sorted(futures):
@@ -940,14 +962,14 @@ class LLMContentFilter(RelevantContentFilter):
                        ordered_results.append(blocks)
                        if self.logger:
                            self.logger.success(
-                                "Successfully processed chunk {chunk_num}", 
+                                "LLM markdown: Successfully processed chunk {chunk_num}", 
                                tag="CHUNK",
                                params={"chunk_num": i + 1}
                            )
                except Exception as e:
                    if self.logger:
                        self.logger.error(
-                            "Error processing chunk {chunk_num}: {error}", 
+                            "LLM markdown: Error processing chunk {chunk_num}: {error}", 
                            tag="CHUNK",
                            params={
                                "chunk_num": i + 1,
@@ -958,7 +980,7 @@ class LLMContentFilter(RelevantContentFilter):
        end_time = time.time()
        if self.logger:
            self.logger.success(
-                "Completed processing in {time:.2f}s", 
+                "LLM markdown: Completed processing in {time:.2f}s", 
                tag="LLM",
                params={"time": end_time - start_time},
                colors={"time": Fore.YELLOW}
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -21,6 +21,9 @@ from .utils import (
    extract_xml_data,
    split_and_parse_json_objects,
    sanitize_input_encode,
+    chunk_documents,
+    merge_chunks,
+    advanced_split,
 )
 from .models import * # noqa: F403

@@ -501,6 +504,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
        instruction: str = None,
        schema: Dict = None,
        extraction_type="block",
+        chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
+        overlap_rate=OVERLAP_RATE,
+        word_token_rate=WORD_TOKEN_RATE,
+        apply_chunking=True,
        **kwargs,
    ):
        """
@@ -652,53 +659,16 @@ class LLMExtractionStrategy(ExtractionStrategy):
            )
        return blocks

-    def _merge(self, documents, chunk_token_threshold, overlap):
+    def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
        """
        Merge documents into sections based on chunk_token_threshold and overlap.
        """
-        # chunks = []
-        sections = []
-        total_tokens = 0
-
-        # Calculate the total tokens across all documents
-        for document in documents:
-            total_tokens += len(document.split(" ")) * self.word_token_rate
-
-        # Calculate the number of sections needed
-        num_sections = math.floor(total_tokens / chunk_token_threshold)
-        if num_sections < 1:
-            num_sections = 1  # Ensure there is at least one section
-        adjusted_chunk_threshold = total_tokens / num_sections
-
-        total_token_so_far = 0
-        current_chunk = []
-
-        for document in documents:
-            tokens = document.split(" ")
-            token_count = len(tokens) * self.word_token_rate
-
-            if total_token_so_far + token_count <= adjusted_chunk_threshold:
-                current_chunk.extend(tokens)
-                total_token_so_far += token_count
-            else:
-                # Ensure to handle the last section properly
-                if len(sections) == num_sections - 1:
-                    current_chunk.extend(tokens)
-                    continue
-
-                # Add overlap if specified
-                if overlap > 0 and current_chunk:
-                    overlap_tokens = current_chunk[-overlap:]
-                    current_chunk.extend(overlap_tokens)
-
-                sections.append(" ".join(current_chunk))
-                current_chunk = tokens
-                total_token_so_far = token_count
-
-        # Add the last chunk
-        if current_chunk:
-            sections.append(" ".join(current_chunk))
-
+        sections =  merge_chunks(
+            docs = documents,
+            target_size= chunk_token_threshold,
+            overlap=overlap,
+            word_token_ratio=self.word_token_rate
+        )
        return sections

    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from tabnanny import verbose
 from typing import Optional, Dict, Any, Tuple
 from .models import MarkdownGenerationResult
 from .html2text import CustomHTML2Text
@@ -29,9 +30,11 @@ class MarkdownGenerationStrategy(ABC):
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
+        verbose: bool = False,
    ):
        self.content_filter = content_filter
        self.options = options or {}
+        self.verbose = verbose

    @abstractmethod
    def generate_markdown(
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -206,17 +206,6 @@ Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags.

 PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.

-INPUT HTML: 
-<|HTML_CONTENT_START|>
-{HTML}
-<|HTML_CONTENT_END|>
-
-
-SPECIFIC INSTRUCTION: 
-<|USER_INSTRUCTION_START|>
-{REQUEST}
-<|USER_INSTRUCTION_END|>
-
 TASK DETAILS:
 1. Content Selection
 - DO: Keep essential information, main content, key details
@@ -240,15 +229,7 @@ TASK DETAILS:
 - DON'T: Fragment related content
 - DON'T: Duplicate information

-Example Input:
-<div class="main-content"><h1>Setup Guide</h1><p>Follow these steps...</p></div>
-<div class="sidebar">Related articles...</div>
-
-Example Output:
-# Setup Guide
-Follow these steps...
-
-IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.
+IMPORTANT: If user specific instruction is provided, ignore above guideline and prioritize those requirements over these general guidelines.

 OUTPUT FORMAT: 
 Wrap your response in <content> tags. Use proper markdown throughout.
@@ -256,7 +237,18 @@ Wrap your response in <content> tags. Use proper markdown throughout.
 [Your markdown content here]
 </content>

-Begin filtering now."""
+Begin filtering now.
+
+--------------------------------------------
+
+<|HTML_CONTENT_START|>
+{HTML}
+<|HTML_CONTENT_END|>
+
+<|USER_INSTRUCTION_START|>
+{REQUEST}
+<|USER_INSTRUCTION_END|>
+"""

 JSON_SCHEMA_BUILDER= """
 # HTML Schema Generation Instructions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1,3 +1,4 @@
+from ast import Call
 import time
 from urllib.parse import urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -8,9 +9,10 @@ import re
 import os
 import platform
 from .prompts import PROMPT_EXTRACT_BLOCKS
+from array import array
 from .config import *
 from pathlib import Path
-from typing import Dict, Any
+from typing import Dict, Any, List, Tuple, Union, Optional, Callable
 from urllib.parse import urljoin
 import requests
 from requests.exceptions import InvalidSchema
@@ -31,6 +33,154 @@ import aiohttp
 from pathlib import Path
 from packaging import version
 from . import __version__
+from typing import Sequence, List
+from array import array
+from itertools import chain
+from collections import deque
+from typing import Callable, Generator, Iterable, List, Optional
+
+def chunk_documents(
+    documents: Iterable[str],
+    chunk_token_threshold: int,
+    overlap: int,
+    word_token_rate: float = 0.75,
+    tokenizer: Optional[Callable[[str], List[str]]] = None,
+) -> Generator[str, None, None]:
+    """
+    Efficiently chunks documents into token-limited sections with overlap between chunks.
+
+    Args:
+        documents: Iterable of document strings
+        chunk_token_threshold: Maximum tokens per chunk
+        overlap: Number of tokens to overlap between chunks
+        word_token_rate: Token estimate per word when not using a tokenizer
+        tokenizer: Function that splits text into tokens (if available)
+
+    Yields:
+        Text chunks as strings
+    """
+    token_queue = deque()
+    contribution_queue = deque()
+    current_token_count = 0.0
+
+    for doc in documents:
+        # Tokenize document
+        if tokenizer:
+            tokens = tokenizer(doc)
+            contributions = [1.0] * len(tokens)
+        else:
+            tokens = doc.split()
+            contributions = [word_token_rate] * len(tokens)
+
+        # Add to processing queues
+        token_queue.extend(tokens)
+        contribution_queue.extend(contributions)
+        current_token_count += sum(contributions)
+
+        # Process full chunks
+        while current_token_count >= chunk_token_threshold:
+            # Find chunk split point
+            chunk_tokens = []
+            chunk_contrib = []
+            chunk_total = 0.0
+            
+            # Build chunk up to threshold
+            while contribution_queue:
+                next_contrib = contribution_queue[0]
+                if chunk_total + next_contrib > chunk_token_threshold:
+                    break
+                
+                chunk_total += next_contrib
+                chunk_contrib.append(contribution_queue.popleft())
+                chunk_tokens.append(token_queue.popleft())
+
+            # Handle edge case where first token exceeds threshold
+            if not chunk_contrib:  # Single token exceeds threshold
+                chunk_contrib.append(contribution_queue.popleft())
+                chunk_tokens.append(token_queue.popleft())
+
+            # Calculate overlap
+            overlap_total = 0.0
+            overlap_idx = 0
+            for contrib in reversed(chunk_contrib):
+                if overlap_total + contrib > overlap:
+                    break
+                overlap_total += contrib
+                overlap_idx += 1
+
+            # Prepend overlap to queues
+            if overlap_idx > 0:
+                overlap_tokens = chunk_tokens[-overlap_idx:]
+                overlap_contrib = chunk_contrib[-overlap_idx:]
+                
+                token_queue.extendleft(reversed(overlap_tokens))
+                contribution_queue.extendleft(reversed(overlap_contrib))
+                current_token_count += overlap_total
+
+            # Update current token count and yield chunk
+            current_token_count -= sum(chunk_contrib)
+            yield " ".join(chunk_tokens[:len(chunk_tokens)-overlap_idx] if overlap_idx else chunk_tokens)
+
+    # Yield remaining tokens
+    if token_queue:
+        yield " ".join(token_queue)
+
+def merge_chunks(
+    docs: Sequence[str], 
+    target_size: int,
+    overlap: int = 0,
+    word_token_ratio: float = 1.0,
+    splitter: Callable = None
+) -> List[str]:
+    """Merges documents into chunks of specified token size.
+    
+    Args:
+        docs: Input documents
+        target_size: Desired token count per chunk
+        overlap: Number of tokens to overlap between chunks
+        word_token_ratio: Multiplier for word->token conversion
+    """
+    # Pre-tokenize all docs and store token counts
+    splitter = splitter or str.split
+    token_counts = array('I')
+    all_tokens: List[List[str]] = []
+    total_tokens = 0
+    
+    for doc in docs:
+        tokens = doc.split()
+        count = int(len(tokens) * word_token_ratio)
+        if count:  # Skip empty docs
+            token_counts.append(count)
+            all_tokens.append(tokens)
+            total_tokens += count
+    
+    if not total_tokens:
+        return []
+
+    # Pre-allocate chunks
+    num_chunks = max(1, (total_tokens + target_size - 1) // target_size)
+    chunks: List[List[str]] = [[] for _ in range(num_chunks)]
+    
+    curr_chunk = 0
+    curr_size = 0
+    
+    # Distribute tokens
+    for tokens in chain.from_iterable(all_tokens):
+        if curr_size >= target_size and curr_chunk < num_chunks - 1:
+            if overlap > 0:
+                overlap_tokens = chunks[curr_chunk][-overlap:]
+                curr_chunk += 1
+                chunks[curr_chunk].extend(overlap_tokens)
+                curr_size = len(overlap_tokens)
+            else:
+                curr_chunk += 1
+                curr_size = 0
+                
+        chunks[curr_chunk].append(tokens)
+        curr_size += 1
+
+    # Return only non-empty chunks
+    return [' '.join(chunk) for chunk in chunks if chunk]


 class VersionManager:
@@ -189,6 +339,77 @@ class InvalidCSSSelectorError(Exception):
    pass


+SPLITS = bytearray([
+    # Control chars (0-31) + space (32)
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    # Special chars (33-47): ! " # $ % & ' ( ) * + , - . /
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    # Numbers (48-57): Treat as non-splits
+    0,0,0,0,0,0,0,0,0,0,
+    # More special chars (58-64): : ; < = > ? @
+    1,1,1,1,1,1,1,
+    # Uppercase (65-90): Keep
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    # More special chars (91-96): [ \ ] ^ _ `
+    1,1,1,1,1,1,
+    # Lowercase (97-122): Keep
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    # Special chars (123-126): { | } ~
+    1,1,1,1,
+    # Extended ASCII
+    *([1] * 128)
+])
+
+# Additional split chars for HTML/code
+HTML_CODE_CHARS = {
+    # HTML specific
+    '•', '►', '▼', '©', '®', '™', '→', '⇒', '≈', '≤', '≥',
+    # Programming symbols  
+    '+=', '-=', '*=', '/=', '=>', '<=>', '!=', '==', '===',
+    '++', '--', '<<', '>>', '&&', '||', '??', '?:', '?.', 
+    # Common Unicode
+    '…', '"', '"', ''', ''', '«', '»', '—', '–',
+    # Additional splits
+    '+', '=', '~', '@', '#', '$', '%', '^', '&', '*',
+    '(', ')', '{', '}', '[', ']', '|', '\\', '/', '`',
+    '<', '>', ',', '.', '?', '!', ':', ';', '-', '_'
+}
+
+def advanced_split(text: str) -> list[str]:
+    result = []
+    word = array('u')
+    
+    i = 0
+    text_len = len(text)
+    
+    while i < text_len:
+        char = text[i]
+        o = ord(char)
+        
+        # Fast path for ASCII
+        if o < 256 and SPLITS[o]:
+            if word:
+                result.append(word.tounicode())
+                word = array('u')
+        # Check for multi-char symbols
+        elif i < text_len - 1:
+            two_chars = char + text[i + 1]
+            if two_chars in HTML_CODE_CHARS:
+                if word:
+                    result.append(word.tounicode())
+                    word = array('u')
+                i += 1  # Skip next char since we used it
+            else:
+                word.append(char)
+        else:
+            word.append(char)
+        i += 1
+            
+    if word:
+        result.append(word.tounicode())
+        
+    return result
+
 def create_box_message(
    message: str,
    type: str = "info",