feat(content-filter): add LLMContentFilter for intelligent markdown generation

Add new LLMContentFilter class that uses LLMs to generate high-quality markdown content:
- Implement intelligent content filtering with customizable instructions
- Add chunk processing for handling large documents
- Support parallel processing of content chunks
- Include caching mechanism for filtered results
- Add usage tracking and statistics
- Update documentation with examples and use cases

Also includes minor changes:
- Disable Pydantic warnings in __init__.py
- Add new prompt template for content filtering
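A minimal usage sketch of the new filter (provider, token, and `html` are placeholders):

```python
from crawl4ai.content_filter_strategy import LLMContentFilter

llm_filter = LLMContentFilter(
    provider="openai/gpt-4o",      # placeholder provider string
    api_token="your-api-token",    # or rely on the OPENAI_API_KEY env var
    instruction="Keep the main article content; drop navigation and ads.",
    chunk_token_threshold=4096,    # smaller values enable parallel chunking
    verbose=True,
)
blocks = llm_filter.filter_content(html)  # html: page source fetched elsewhere
llm_filter.show_usage()                   # print token usage statistics
```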
Author: UncleCode
Date: 2025-01-18 19:31:07 +08:00
Parent: 2d6b19e1a2
Commit: 3d09b6a221
5 changed files with 495 additions and 5 deletions

View File: crawl4ai/__init__.py

@@ -76,3 +76,10 @@ else:
WebCrawler = None
# import warnings
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
import warnings
from pydantic import warnings as pydantic_warnings
# Disable all Pydantic warnings
warnings.filterwarnings("ignore", module="pydantic")
# pydantic_warnings.filter_warnings()
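# Note: module="pydantic" silences every warning raised from the pydantic
# module for this process, not just deprecation notices.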

View File: crawl4ai/content_filter_strategy.py

@@ -1,14 +1,24 @@
import re
import time
from bs4 import BeautifulSoup, Tag
from typing import List, Tuple, Dict, Optional
from rank_bm25 import BM25Okapi
from collections import deque
from bs4 import NavigableString, Comment
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
from .config import DEFAULT_PROVIDER, PROVIDER_MODELS, OVERLAP_RATE, WORD_TOKEN_RATE
from .models import TokenUsage
from .prompts import PROMPT_FILTER_CONTENT
import os
import json
import hashlib
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from .async_logger import AsyncLogger, LogLevel
from colorama import Fore, Style, init
class RelevantContentFilter(ABC):
"""Abstract base class for content filtering strategies"""
@@ -343,7 +353,6 @@ class RelevantContentFilter(ABC):
except Exception:
return str(tag) # Fallback to original if anything fails
class BM25ContentFilter(RelevantContentFilter):
"""
Content filtering using BM25 algorithm with priority tag handling.
@@ -486,7 +495,6 @@ class BM25ContentFilter(RelevantContentFilter):
return [self.clean_element(tag) for _, _, tag in selected_candidates]
class PruningContentFilter(RelevantContentFilter):
"""
Content filtering using pruning algorithm with dynamic threshold.
@@ -732,3 +740,260 @@ class PruningContentFilter(RelevantContentFilter):
if self.negative_patterns.match(element_id):
class_id_score -= 0.5
return class_id_score
class LLMContentFilter(RelevantContentFilter):
"""Content filtering using LLMs to generate relevant markdown."""
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
        instruction: Optional[str] = None,
chunk_token_threshold: int = int(1e9),
overlap_rate: float = OVERLAP_RATE,
word_token_rate: float = WORD_TOKEN_RATE,
base_url: Optional[str] = None,
api_base: Optional[str] = None,
        extra_args: Optional[Dict] = None,
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
):
super().__init__(None)
self.provider = provider
        # Resolve token: explicit arg, provider default, env var, then sentinel
        self.api_token = (
            api_token
            or PROVIDER_MODELS.get(provider)
            or os.getenv("OPENAI_API_KEY")
            or "no-token"
        )
self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate
self.word_token_rate = word_token_rate
self.base_url = base_url
self.api_base = api_base or base_url
self.extra_args = extra_args or {}
self.verbose = verbose
# Setup logger with custom styling for LLM operations
if logger:
self.logger = logger
elif verbose:
self.logger = AsyncLogger(
verbose=True,
icons={
**AsyncLogger.DEFAULT_ICONS,
"LLM": "", # Star for LLM operations
"CHUNK": "", # Diamond for chunks
"CACHE": "", # Lightning for cache operations
},
colors={
**AsyncLogger.DEFAULT_COLORS,
LogLevel.INFO: Fore.MAGENTA + Style.DIM, # Dimmed purple for LLM ops
}
)
else:
self.logger = None
self.usages = []
self.total_usage = TokenUsage()
def _get_cache_key(self, html: str, instruction: str) -> str:
"""Generate a unique cache key based on HTML and instruction"""
content = f"{html}{instruction}"
return hashlib.md5(content.encode()).hexdigest()
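    # Keys name files in the on-disk cache used by filter_content():
    # <home>/llm_cache/content_filter/<key>.json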
def _merge_chunks(self, text: str) -> List[str]:
"""Split text into chunks with overlap"""
# Calculate tokens and sections
total_tokens = len(text.split()) * self.word_token_rate
        # ceil keeps every section at or below the configured threshold
        num_sections = max(1, math.ceil(total_tokens / self.chunk_token_threshold))
adjusted_chunk_threshold = total_tokens / num_sections
# Split into words
words = text.split()
chunks = []
current_chunk = []
current_token_count = 0
for word in words:
            word_tokens = self.word_token_rate  # per-word estimate, matching the total above
if current_token_count + word_tokens <= adjusted_chunk_threshold:
current_chunk.append(word)
current_token_count += word_tokens
            else:
                chunks.append(" ".join(current_chunk))
                # Seed the next chunk with the tail of this one as overlap
                overlap_size = int(len(current_chunk) * self.overlap_rate)
                if overlap_size > 0:
                    current_chunk = current_chunk[-overlap_size:] + [word]
                    current_token_count = (overlap_size + 1) * self.word_token_rate
                else:
                    current_chunk = [word]
                    current_token_count = word_tokens
if current_chunk:
chunks.append(" ".join(current_chunk))
return chunks
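    # Worked example (assumed rates): with word_token_rate = 0.75 and
    # chunk_token_threshold = 2048, a 10,000-word document is estimated at
    # 7,500 tokens and split into ceil(7500 / 2048) = 4 chunks of roughly
    # 1,875 tokens each, plus the configured overlap between neighbours.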
def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
if not html or not isinstance(html, str):
return []
if self.logger:
self.logger.info(
"Starting LLM content filtering process",
tag="LLM",
params={"provider": self.provider},
colors={"provider": Fore.CYAN}
)
# Cache handling
cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter"
cache_dir.mkdir(parents=True, exist_ok=True)
cache_key = self._get_cache_key(html, self.instruction or "")
cache_file = cache_dir / f"{cache_key}.json"
if not ignore_cache and cache_file.exists():
if self.logger:
self.logger.info("Found cached result", tag="CACHE")
try:
with cache_file.open('r') as f:
cached_data = json.load(f)
usage = TokenUsage(**cached_data['usage'])
self.usages.append(usage)
self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens
return cached_data['blocks']
except Exception as e:
if self.logger:
self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
# Split into chunks
html_chunks = self._merge_chunks(html)
if self.logger:
self.logger.info(
"Split content into {chunk_count} chunks",
tag="CHUNK",
params={"chunk_count": len(html_chunks)},
colors={"chunk_count": Fore.YELLOW}
)
extracted_content = []
start_time = time.time()
# Process chunks in parallel
with ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for i, chunk in enumerate(html_chunks):
if self.logger:
self.logger.debug(
"Processing chunk {chunk_num}/{total_chunks}",
tag="CHUNK",
params={
"chunk_num": i + 1,
"total_chunks": len(html_chunks)
}
)
prompt_variables = {
"HTML": escape_json_string(sanitize_html(chunk)),
"REQUEST": self.instruction or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content."
}
prompt = PROMPT_FILTER_CONTENT
for var, value in prompt_variables.items():
prompt = prompt.replace("{" + var + "}", value)
future = executor.submit(
perform_completion_with_backoff,
self.provider,
prompt,
self.api_token,
base_url=self.api_base,
extra_args=self.extra_args
)
futures.append((i, future))
# Collect results in order
ordered_results = []
for i, future in sorted(futures):
try:
response = future.result()
# Track usage
usage = TokenUsage(
completion_tokens=response.usage.completion_tokens,
prompt_tokens=response.usage.prompt_tokens,
total_tokens=response.usage.total_tokens,
completion_tokens_details=response.usage.completion_tokens_details.__dict__
if response.usage.completion_tokens_details else {},
prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
if response.usage.prompt_tokens_details else {},
)
self.usages.append(usage)
self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens
blocks = extract_xml_data(["content"], response.choices[0].message.content)["content"]
if blocks:
ordered_results.append(blocks)
if self.logger:
self.logger.success(
"Successfully processed chunk {chunk_num}",
tag="CHUNK",
params={"chunk_num": i + 1}
)
except Exception as e:
if self.logger:
self.logger.error(
"Error processing chunk {chunk_num}: {error}",
tag="CHUNK",
params={
"chunk_num": i + 1,
"error": str(e)
}
)
end_time = time.time()
if self.logger:
self.logger.success(
"Completed processing in {time:.2f}s",
tag="LLM",
params={"time": end_time - start_time},
colors={"time": Fore.YELLOW}
)
result = ordered_results if ordered_results else []
# Cache the final result
cache_data = {
'blocks': result,
'usage': self.total_usage.__dict__
}
with cache_file.open('w') as f:
json.dump(cache_data, f)
if self.logger:
self.logger.info("Cached results for future use", tag="CACHE")
return result
def show_usage(self) -> None:
"""Print usage statistics"""
print("\n=== Token Usage Summary ===")
print(f"{'Type':<15} {'Count':>12}")
print("-" * 30)
print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
if self.usages:
print("\n=== Usage History ===")
print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
print("-" * 48)
for i, usage in enumerate(self.usages, 1):
print(
f"{i:<10} {usage.completion_tokens:>12,} "
f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
)

View File: crawl4ai/prompts.py

@@ -202,3 +202,58 @@ Avoid Common Mistakes:
Result
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.
INPUT HTML:
<|HTML_CONTENT_START|>
{HTML}
<|HTML_CONTENT_END|>
SPECIFIC INSTRUCTION:
<|USER_INSTRUCTION_START|>
{REQUEST}
<|USER_INSTRUCTION_END|>
TASK DETAILS:
1. Content Selection
- DO: Keep essential information, main content, key details
- DO: Preserve hierarchical structure using markdown headers
- DO: Keep code blocks, tables, key lists
- DON'T: Include navigation menus, ads, footers, cookie notices
- DON'T: Keep social media widgets, sidebars, related content
2. Content Transformation
- DO: Use proper markdown syntax (#, ##, **, `, etc)
- DO: Convert tables to markdown tables
- DO: Preserve code formatting with ```language blocks
- DO: Maintain link texts but remove tracking parameters
- DON'T: Include HTML tags in output
- DON'T: Keep class names, ids, or other HTML attributes
3. Content Organization
- DO: Maintain logical flow of information
- DO: Group related content under appropriate headers
- DO: Use consistent header levels
- DON'T: Fragment related content
- DON'T: Duplicate information
Example Input:
<div class="main-content"><h1>Setup Guide</h1><p>Follow these steps...</p></div>
<div class="sidebar">Related articles...</div>
Example Output:
# Setup Guide
Follow these steps...
IMPORTANT: If a specific instruction is provided above, prioritize its requirements over these general guidelines.
OUTPUT FORMAT:
Wrap your response in <content> tags. Use proper markdown throughout.
<content>
[Your markdown content here]
</content>
Begin filtering now."""

View File: documentation (Fit Markdown guide)

@@ -170,6 +170,82 @@ prune_filter = PruningContentFilter(
- You want a broad cleanup without a user query.
- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
### 4.3 LLMContentFilter
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
# Initialize LLM filter with specific instruction
filter = LLMContentFilter(
provider="openai/gpt-4", # or your preferred provider
api_token="your-api-token", # or use environment variable
instruction="""
Focus on extracting the core educational content.
Include:
- Key concepts and explanations
- Important code examples
- Essential technical details
Exclude:
- Navigation elements
- Sidebars
- Footer content
Format the output as clean markdown with proper code blocks and headers.
""",
chunk_token_threshold=4096, # Adjust based on your needs
verbose=True
)
config = CrawlerRunConfig(
content_filter=filter
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com", config=config)
print(result.fit_markdown) # Filtered markdown content
```
**Key Features:**
- **Intelligent Filtering**: Uses LLMs to understand and extract relevant content while maintaining context
- **Customizable Instructions**: Tailor the filtering process with specific instructions
- **Chunk Processing**: Handles large documents by processing them in chunks (controlled by `chunk_token_threshold`)
- **Parallel Processing**: For better performance, use a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks (see the sketch below)
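A minimal sketch of a parallel-friendly configuration (provider and token are placeholders; `html` is page source fetched elsewhere):

```python
from crawl4ai.content_filter_strategy import LLMContentFilter

llm_filter = LLMContentFilter(
    provider="openai/gpt-4o",     # placeholder provider string
    api_token="your-api-token",
    chunk_token_threshold=2048,   # small chunks are processed on a thread pool
    verbose=True,
)
blocks = llm_filter.filter_content(html)  # markdown blocks, in document order
print("\n".join(blocks))
```

Internally the filter fans chunks out to a small thread pool and reassembles the results in their original order.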
**Two Common Use Cases:**
1. **Exact Content Preservation**:
```python
filter = LLMContentFilter(
instruction="""
Extract the main educational content while preserving its original wording and substance completely.
1. Maintain the exact language and terminology
2. Keep all technical explanations and examples intact
3. Preserve the original flow and structure
4. Remove only clearly irrelevant elements like navigation menus and ads
""",
chunk_token_threshold=4096
)
```
2. **Focused Content Extraction**:
```python
filter = LLMContentFilter(
instruction="""
Focus on extracting specific types of content:
- Technical documentation
- Code examples
- API references
Reformat the content into clear, well-structured markdown
""",
chunk_token_threshold=4096
)
```
> **Performance Tip**: Set a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks. The default is effectively unbounded (`int(1e9)` tokens), so the entire document is processed as a single chunk.
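Results are also cached on disk, keyed by an MD5 hash of the HTML plus the instruction, so repeating a run on the same page reuses the earlier result. A short sketch of bypassing the cache and inspecting token usage (`filter` and `html` as in the examples above):

```python
# Force a fresh LLM pass even if a cached result exists
blocks = filter.filter_content(html, ignore_cache=True)

# Print aggregate and per-request token statistics
filter.show_usage()
```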
---
## 5. Using Fit Markdown

View File: new example script (test_llm_filter)

@@ -0,0 +1,87 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
# Create an HTML source that needs intelligent filtering
url = "https://docs.python.org/3/tutorial/classes.html"
browser_config = BrowserConfig(
headless=True,
verbose=True
)
# run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
async with AsyncWebCrawler(config=browser_config) as crawler:
# First get the raw HTML
result = await crawler.arun(url, config=run_config)
html = result.cleaned_html
        # First LLM filter configuration: focused extraction instruction
        # (note: it is immediately overridden by the second configuration below)
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
- Key concepts and their explanations
- Important code examples
- Essential technical details
Exclude:
- Navigation elements
- Sidebars
- Footer content
- Version information
- Any non-essential UI elements
Format the output as clean markdown with proper code blocks and headers.
""",
verbose=True
)
        # Second configuration (the one actually used): preserve original
        # wording, with chunking enabled to exercise parallel processing
        filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192 tokens
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
1. Maintain the exact language and terminology used in the main content
2. Keep all technical explanations, examples, and educational content intact
3. Preserve the original flow and structure of the core content
4. Remove only clearly irrelevant elements like:
- Navigation menus
- Advertisement sections
- Cookie notices
- Footers with site information
- Sidebars with external links
- Any UI elements that don't contribute to learning
The goal is to create a clean markdown version that reads exactly like the original article,
keeping all valuable content but free from distracting elements. Imagine you're creating
a perfect reading experience where nothing valuable is lost, but all noise is removed.
""",
verbose=True
)
# Apply filtering
        filtered_content = filter.filter_content(html, ignore_cache=True)
# Show results
print("\nFiltered Content Length:", len(filtered_content))
print("\nFirst 500 chars of filtered content:")
if filtered_content:
print(filtered_content[0][:500])
# Save on disc the markdown version
with open("filtered_content.md", "w", encoding="utf-8") as f:
f.write("\n".join(filtered_content))
# Show token usage
filter.show_usage()
if __name__ == "__main__":
asyncio.run(test_llm_filter())