feat(content-filter): add LLMContentFilter for intelligent markdown generation
Add new LLMContentFilter class that uses LLMs to generate high-quality markdown content: - Implement intelligent content filtering with customizable instructions - Add chunk processing for handling large documents - Support parallel processing of content chunks - Include caching mechanism for filtered results - Add usage tracking and statistics - Update documentation with examples and use cases Also includes minor changes: - Disable Pydantic warnings in __init__.py - Add new prompt template for content filtering
This commit is contained in:
@@ -76,3 +76,10 @@ else:
|
|||||||
WebCrawler = None
|
WebCrawler = None
|
||||||
# import warnings
|
# import warnings
|
||||||
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
||||||
|
|
||||||
|
import warnings

# Kept as an alias for pydantic's own warnings module; not used directly
# here, but removing it could break later references outside this view.
from pydantic import warnings as pydantic_warnings

# Disable all Pydantic warnings at the source module so import-time
# deprecation noise does not reach library users.
# (Commented-out dead code from earlier experiments has been removed.)
warnings.filterwarnings("ignore", module="pydantic")
|
||||||
@@ -1,14 +1,24 @@
|
|||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple, Dict, Optional
|
||||||
from rank_bm25 import BM25Okapi
|
from rank_bm25 import BM25Okapi
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from bs4 import NavigableString, Comment
|
from bs4 import NavigableString, Comment
|
||||||
from .utils import clean_tokens
|
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
import math
|
import math
|
||||||
from snowballstemmer import stemmer
|
from snowballstemmer import stemmer
|
||||||
|
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
|
||||||
|
from .models import TokenUsage
|
||||||
|
from .prompts import PROMPT_FILTER_CONTENT
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import hashlib
|
||||||
|
from pathlib import Path
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
|
||||||
class RelevantContentFilter(ABC):
|
class RelevantContentFilter(ABC):
|
||||||
"""Abstract base class for content filtering strategies"""
|
"""Abstract base class for content filtering strategies"""
|
||||||
@@ -343,7 +353,6 @@ class RelevantContentFilter(ABC):
|
|||||||
except Exception:
|
except Exception:
|
||||||
return str(tag) # Fallback to original if anything fails
|
return str(tag) # Fallback to original if anything fails
|
||||||
|
|
||||||
|
|
||||||
class BM25ContentFilter(RelevantContentFilter):
|
class BM25ContentFilter(RelevantContentFilter):
|
||||||
"""
|
"""
|
||||||
Content filtering using BM25 algorithm with priority tag handling.
|
Content filtering using BM25 algorithm with priority tag handling.
|
||||||
@@ -486,7 +495,6 @@ class BM25ContentFilter(RelevantContentFilter):
|
|||||||
|
|
||||||
return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
||||||
|
|
||||||
|
|
||||||
class PruningContentFilter(RelevantContentFilter):
|
class PruningContentFilter(RelevantContentFilter):
|
||||||
"""
|
"""
|
||||||
Content filtering using pruning algorithm with dynamic threshold.
|
Content filtering using pruning algorithm with dynamic threshold.
|
||||||
@@ -732,3 +740,260 @@ class PruningContentFilter(RelevantContentFilter):
|
|||||||
if self.negative_patterns.match(element_id):
|
if self.negative_patterns.match(element_id):
|
||||||
class_id_score -= 0.5
|
class_id_score -= 0.5
|
||||||
return class_id_score
|
return class_id_score
|
||||||
|
|
||||||
|
class LLMContentFilter(RelevantContentFilter):
    """Content filtering using LLMs to generate relevant markdown.

    The input HTML is split into token-bounded chunks
    (``chunk_token_threshold``), each chunk is sent to the configured LLM
    provider in parallel, and the returned markdown blocks are collected in
    their original chunk order.  Results are cached on disk, keyed by an MD5
    of (html, instruction), so repeated calls with identical input are served
    without any LLM traffic.
    """

    def __init__(
        self,
        provider: str = DEFAULT_PROVIDER,
        api_token: Optional[str] = None,
        instruction: Optional[str] = None,
        chunk_token_threshold: int = int(1e9),
        overlap_rate: float = OVERLAP_RATE,
        word_token_rate: float = WORD_TOKEN_RATE,
        base_url: Optional[str] = None,
        api_base: Optional[str] = None,
        extra_args: Optional[Dict] = None,
        verbose: bool = False,
        logger: Optional[AsyncLogger] = None,
    ):
        """Create a new LLM-backed content filter.

        Args:
            provider: Provider/model identifier passed to the completion call.
            api_token: Explicit API token.  Falls back to the provider's
                configured token, then the ``OPENAI_API_KEY`` env var, then
                the literal ``"no-token"``.
            instruction: Custom filtering instruction injected into the prompt.
            chunk_token_threshold: Approximate maximum tokens per chunk; the
                default (1e9) effectively disables chunking.
            overlap_rate: Fraction of a finished chunk carried into the next.
            word_token_rate: Estimated tokens per word used to size chunks.
            base_url: Optional custom API endpoint.
            api_base: Alternate spelling of the endpoint; wins over base_url.
            extra_args: Extra keyword arguments for the completion call.
            verbose: Build a styled AsyncLogger when no logger is supplied.
            logger: Pre-configured logger used instead of building one.
        """
        super().__init__(None)
        self.provider = provider
        # BUGFIX: the original used PROVIDER_MODELS.get(provider, "no-token"),
        # whose truthy "no-token" default made the OPENAI_API_KEY fallback
        # unreachable.  The default now sits at the end of the chain.
        self.api_token = (
            api_token
            or PROVIDER_MODELS.get(provider)
            or os.getenv("OPENAI_API_KEY")
            or "no-token"
        )
        self.instruction = instruction
        self.chunk_token_threshold = chunk_token_threshold
        self.overlap_rate = overlap_rate
        self.word_token_rate = word_token_rate
        self.base_url = base_url
        self.api_base = api_base or base_url
        self.extra_args = extra_args or {}
        self.verbose = verbose

        # Setup logger with custom styling for LLM operations.
        if logger:
            self.logger = logger
        elif verbose:
            self.logger = AsyncLogger(
                verbose=True,
                icons={
                    **AsyncLogger.DEFAULT_ICONS,
                    "LLM": "★",    # Star for LLM operations
                    "CHUNK": "◈",  # Diamond for chunks
                    "CACHE": "⚡",  # Lightning for cache operations
                },
                colors={
                    **AsyncLogger.DEFAULT_COLORS,
                    # Dimmed purple for LLM ops
                    LogLevel.INFO: Fore.MAGENTA + Style.DIM,
                }
            )
        else:
            self.logger = None

        # Per-request usage records and running totals.
        self.usages = []
        self.total_usage = TokenUsage()

    def _get_cache_key(self, html: str, instruction: str) -> str:
        """Generate a unique cache key based on HTML and instruction."""
        content = f"{html}{instruction}"
        return hashlib.md5(content.encode()).hexdigest()

    def _merge_chunks(self, text: str) -> List[str]:
        """Split *text* into roughly equal chunks with optional overlap.

        Token counts are estimated as ``word_count * word_token_rate``.  The
        per-chunk budget is rebalanced so all chunks come out about the same
        size instead of leaving one tiny trailing chunk.
        """
        words = text.split()
        total_tokens = len(words) * self.word_token_rate
        # BUGFIX: ceil, not floor — floor produced chunks LARGER than the
        # requested threshold (e.g. 2.5x threshold -> 2 sections of 1.25x).
        num_sections = max(1, math.ceil(total_tokens / self.chunk_token_threshold))
        adjusted_chunk_threshold = total_tokens / num_sections

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_token_count = 0.0

        for word in words:
            # BUGFIX: each word costs word_token_rate tokens.  The original
            # used len(word) * word_token_rate (characters, not words), which
            # was inconsistent with the total_tokens estimate above and made
            # chunks several times smaller than requested.
            word_tokens = self.word_token_rate
            if current_token_count + word_tokens <= adjusted_chunk_threshold:
                current_chunk.append(word)
                current_token_count += word_tokens
            else:
                chunks.append(" ".join(current_chunk))
                # BUGFIX: overlap now carries the tail of the finished chunk
                # into the NEXT chunk.  The original appended the tail back
                # onto the same chunk (duplicating text in place), and its
                # current_chunk[-0:] slice duplicated the whole chunk whenever
                # the computed overlap rounded down to zero.
                overlap_size = int(len(current_chunk) * self.overlap_rate)
                carried = current_chunk[-overlap_size:] if overlap_size > 0 else []
                current_chunk = carried + [word]
                current_token_count = len(current_chunk) * self.word_token_rate

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
        """Filter *html* into a list of markdown blocks via the LLM.

        Args:
            html: Raw HTML string to convert.
            ignore_cache: When True the disk cache is not read (the fresh
                result is still written back for next time).

        Returns:
            Markdown strings, one per successfully processed chunk, in the
            original chunk order; empty list for empty/non-string input.
        """
        if not html or not isinstance(html, str):
            return []

        if self.logger:
            self.logger.info(
                "Starting LLM content filtering process",
                tag="LLM",
                params={"provider": self.provider},
                colors={"provider": Fore.CYAN}
            )

        # Cache handling: results keyed by MD5 of (html, instruction).
        cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter"
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_key = self._get_cache_key(html, self.instruction or "")
        cache_file = cache_dir / f"{cache_key}.json"

        if not ignore_cache and cache_file.exists():
            if self.logger:
                self.logger.info("Found cached result", tag="CACHE")
            try:
                with cache_file.open('r') as f:
                    cached_data = json.load(f)
                usage = TokenUsage(**cached_data['usage'])
                self.usages.append(usage)
                self.total_usage.completion_tokens += usage.completion_tokens
                self.total_usage.prompt_tokens += usage.prompt_tokens
                self.total_usage.total_tokens += usage.total_tokens
                return cached_data['blocks']
            except Exception as e:
                # Corrupt/partial cache entries fall through to a fresh run.
                if self.logger:
                    self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")

        # Split into chunks sized for the provider.
        html_chunks = self._merge_chunks(html)
        if self.logger:
            self.logger.info(
                "Split content into {chunk_count} chunks",
                tag="CHUNK",
                params={"chunk_count": len(html_chunks)},
                colors={"chunk_count": Fore.YELLOW}
            )

        start_time = time.time()

        # Process chunks in parallel; order is restored when collecting.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for i, chunk in enumerate(html_chunks):
                if self.logger:
                    self.logger.debug(
                        "Processing chunk {chunk_num}/{total_chunks}",
                        tag="CHUNK",
                        params={
                            "chunk_num": i + 1,
                            "total_chunks": len(html_chunks)
                        }
                    )

                prompt_variables = {
                    "HTML": escape_json_string(sanitize_html(chunk)),
                    "REQUEST": self.instruction or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content."
                }

                prompt = PROMPT_FILTER_CONTENT
                for var, value in prompt_variables.items():
                    prompt = prompt.replace("{" + var + "}", value)

                future = executor.submit(
                    perform_completion_with_backoff,
                    self.provider,
                    prompt,
                    self.api_token,
                    base_url=self.api_base,
                    extra_args=self.extra_args
                )
                futures.append((i, future))

            # Collect results in submission order.  BUGFIX: sort by the chunk
            # index only — plain sorted() on (int, Future) tuples could try to
            # compare Future objects and raise TypeError on equal indices.
            ordered_results = []
            for i, future in sorted(futures, key=lambda pair: pair[0]):
                try:
                    response = future.result()

                    # Track token usage for this request.
                    usage = TokenUsage(
                        completion_tokens=response.usage.completion_tokens,
                        prompt_tokens=response.usage.prompt_tokens,
                        total_tokens=response.usage.total_tokens,
                        completion_tokens_details=response.usage.completion_tokens_details.__dict__
                        if response.usage.completion_tokens_details else {},
                        prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
                        if response.usage.prompt_tokens_details else {},
                    )
                    self.usages.append(usage)
                    self.total_usage.completion_tokens += usage.completion_tokens
                    self.total_usage.prompt_tokens += usage.prompt_tokens
                    self.total_usage.total_tokens += usage.total_tokens

                    blocks = extract_xml_data(["content"], response.choices[0].message.content)["content"]
                    if blocks:
                        ordered_results.append(blocks)
                        if self.logger:
                            self.logger.success(
                                "Successfully processed chunk {chunk_num}",
                                tag="CHUNK",
                                params={"chunk_num": i + 1}
                            )
                except Exception as e:
                    # A failed chunk is logged and skipped; remaining chunks
                    # still contribute to the final result.
                    if self.logger:
                        self.logger.error(
                            "Error processing chunk {chunk_num}: {error}",
                            tag="CHUNK",
                            params={
                                "chunk_num": i + 1,
                                "error": str(e)
                            }
                        )

        end_time = time.time()
        if self.logger:
            self.logger.success(
                "Completed processing in {time:.2f}s",
                tag="LLM",
                params={"time": end_time - start_time},
                colors={"time": Fore.YELLOW}
            )

        result = ordered_results if ordered_results else []

        # Cache the final result together with the accumulated usage.
        cache_data = {
            'blocks': result,
            'usage': self.total_usage.__dict__
        }
        with cache_file.open('w') as f:
            json.dump(cache_data, f)
        if self.logger:
            self.logger.info("Cached results for future use", tag="CACHE")

        return result

    def show_usage(self) -> None:
        """Print usage statistics: running totals, then per-request history."""
        print("\n=== Token Usage Summary ===")
        print(f"{'Type':<15} {'Count':>12}")
        print("-" * 30)
        print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
        print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
        print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")

        if self.usages:
            print("\n=== Usage History ===")
            print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
            print("-" * 48)
            for i, usage in enumerate(self.usages, 1):
                print(
                    f"{i:<10} {usage.completion_tokens:>12,} "
                    f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
                )
||||||
@@ -202,3 +202,58 @@ Avoid Common Mistakes:
|
|||||||
|
|
||||||
Result
|
Result
|
||||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||||
|
|
||||||
|
|
||||||
|
# Prompt template for LLMContentFilter.  Placeholders {HTML} and {REQUEST}
# are substituted via str.replace before the completion call; the model must
# wrap its markdown answer in <content>...</content> tags.
PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.

INPUT HTML:
<|HTML_CONTENT_START|>
{HTML}
<|HTML_CONTENT_END|>

SPECIFIC INSTRUCTION:
<|USER_INSTRUCTION_START|>
{REQUEST}
<|USER_INSTRUCTION_END|>

TASK DETAILS:
1. Content Selection
- DO: Keep essential information, main content, key details
- DO: Preserve hierarchical structure using markdown headers
- DO: Keep code blocks, tables, key lists
- DON'T: Include navigation menus, ads, footers, cookie notices
- DON'T: Keep social media widgets, sidebars, related content

2. Content Transformation
- DO: Use proper markdown syntax (#, ##, **, `, etc)
- DO: Convert tables to markdown tables
- DO: Preserve code formatting with ```language blocks
- DO: Maintain link texts but remove tracking parameters
- DON'T: Include HTML tags in output
- DON'T: Keep class names, ids, or other HTML attributes

3. Content Organization
- DO: Maintain logical flow of information
- DO: Group related content under appropriate headers
- DO: Use consistent header levels
- DON'T: Fragment related content
- DON'T: Duplicate information

Example Input:
<div class="main-content"><h1>Setup Guide</h1><p>Follow these steps...</p></div>
<div class="sidebar">Related articles...</div>

Example Output:
# Setup Guide
Follow these steps...

IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.

OUTPUT FORMAT:
Wrap your response in <content> tags. Use proper markdown throughout.
<content>
[Your markdown content here]
</content>

Begin filtering now."""
||||||
@@ -170,6 +170,82 @@ prune_filter = PruningContentFilter(
|
|||||||
- You want a broad cleanup without a user query.
|
- You want a broad cleanup without a user query.
|
||||||
- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
|
- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
|
||||||
|
|
||||||
|
### 4.3 LLMContentFilter
|
||||||
|
|
||||||
|
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
# Initialize LLM filter with specific instruction
|
||||||
|
filter = LLMContentFilter(
|
||||||
|
provider="openai/gpt-4", # or your preferred provider
|
||||||
|
api_token="your-api-token", # or use environment variable
|
||||||
|
instruction="""
|
||||||
|
Focus on extracting the core educational content.
|
||||||
|
Include:
|
||||||
|
- Key concepts and explanations
|
||||||
|
- Important code examples
|
||||||
|
- Essential technical details
|
||||||
|
Exclude:
|
||||||
|
- Navigation elements
|
||||||
|
- Sidebars
|
||||||
|
- Footer content
|
||||||
|
Format the output as clean markdown with proper code blocks and headers.
|
||||||
|
""",
|
||||||
|
chunk_token_threshold=4096, # Adjust based on your needs
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
content_filter=filter
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun("https://example.com", config=config)
|
||||||
|
print(result.fit_markdown) # Filtered markdown content
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Features:**
|
||||||
|
- **Intelligent Filtering**: Uses LLMs to understand and extract relevant content while maintaining context
|
||||||
|
- **Customizable Instructions**: Tailor the filtering process with specific instructions
|
||||||
|
- **Chunk Processing**: Handles large documents by processing them in chunks (controlled by `chunk_token_threshold`)
|
||||||
|
- **Parallel Processing**: For better performance, use smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks
|
||||||
|
|
||||||
|
**Two Common Use Cases:**
|
||||||
|
|
||||||
|
1. **Exact Content Preservation**:
|
||||||
|
```python
|
||||||
|
filter = LLMContentFilter(
|
||||||
|
instruction="""
|
||||||
|
Extract the main educational content while preserving its original wording and substance completely.
|
||||||
|
1. Maintain the exact language and terminology
|
||||||
|
2. Keep all technical explanations and examples intact
|
||||||
|
3. Preserve the original flow and structure
|
||||||
|
4. Remove only clearly irrelevant elements like navigation menus and ads
|
||||||
|
""",
|
||||||
|
chunk_token_threshold=4096
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Focused Content Extraction**:
|
||||||
|
```python
|
||||||
|
filter = LLMContentFilter(
|
||||||
|
instruction="""
|
||||||
|
Focus on extracting specific types of content:
|
||||||
|
- Technical documentation
|
||||||
|
- Code examples
|
||||||
|
- API references
|
||||||
|
Reformat the content into clear, well-structured markdown
|
||||||
|
""",
|
||||||
|
chunk_token_threshold=4096
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Performance Tip**: Set a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks. The default value is effectively unbounded (about 10^9 tokens), which processes the entire content as a single chunk.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 5. Using Fit Markdown
|
## 5. Using Fit Markdown
|
||||||
|
|||||||
87
tests/20241401/test_llm_filter.py
Normal file
87
tests/20241401/test_llm_filter.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
import os
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import LLMContentFilter


async def test_llm_filter():
    """Crawl a documentation page and reduce it to clean markdown via LLM.

    Exercises LLMContentFilter end-to-end: fetch HTML with AsyncWebCrawler,
    filter it with a content-preservation instruction, print a preview,
    persist the markdown, and report token usage.
    """
    # A page with plenty of navigation/sidebar noise around real content.
    url = "https://docs.python.org/3/tutorial/classes.html"

    browser_config = BrowserConfig(
        headless=True,
        verbose=True
    )

    # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First get the raw (cleaned) HTML.
        result = await crawler.arun(url, config=run_config)
        html = result.cleaned_html

        # BUGFIX: the original constructed a first LLMContentFilter and then
        # immediately overwrote it with this one — the dead instance has been
        # removed.  Also renamed the local from `filter` so it no longer
        # shadows the builtin.
        content_filter = LLMContentFilter(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            # BUGFIX: comment corrected — 2 ** 12 * 2 is 4096 * 2 = 8192,
            # not "2048 * 2" as the original comment claimed.
            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192 tokens
            instruction="""
            Extract the main educational content while preserving its original wording and substance completely. Your task is to:

            1. Maintain the exact language and terminology used in the main content
            2. Keep all technical explanations, examples, and educational content intact
            3. Preserve the original flow and structure of the core content
            4. Remove only clearly irrelevant elements like:
               - Navigation menus
               - Advertisement sections
               - Cookie notices
               - Footers with site information
               - Sidebars with external links
               - Any UI elements that don't contribute to learning

            The goal is to create a clean markdown version that reads exactly like the original article,
            keeping all valuable content but free from distracting elements. Imagine you're creating
            a perfect reading experience where nothing valuable is lost, but all noise is removed.
            """,
            verbose=True,
        )

        # Apply filtering; bypass the disk cache so the LLM is actually hit.
        filtered_content = content_filter.filter_content(html, ignore_cache=True)

        # Show results.
        print("\nFiltered Content Length:", len(filtered_content))
        print("\nFirst 500 chars of filtered content:")
        if filtered_content:
            print(filtered_content[0][:500])

        # Save the markdown version to disk for inspection.
        with open("filtered_content.md", "w", encoding="utf-8") as f:
            f.write("\n".join(filtered_content))

        # Show token usage.
        content_filter.show_usage()


if __name__ == "__main__":
    asyncio.run(test_llm_filter())
||||||
Reference in New Issue
Block a user