Rename LlmConfig to LLMConfig across the codebase to follow consistent naming conventions. Update all imports and usages to use the new name. Update documentation and examples to reflect the change. BREAKING CHANGE: LlmConfig has been renamed to LLMConfig. Users need to update their imports and usage.
87 lines
3.4 KiB
Python
87 lines
3.4 KiB
Python
import os
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
from crawl4ai.types import LLMConfig
|
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
|
|
|
async def test_llm_filter():
    """Crawl the Python classes tutorial and run it through ``LLMContentFilter``.

    Steps, in order:

    1. Fetch the page with ``AsyncWebCrawler`` (cache enabled) and keep
       ``result.cleaned_html``.
    2. Build an ``LLMContentFilter`` backed by ``openai/gpt-4o`` whose
       instruction asks for the main educational content with its wording
       preserved and only page chrome removed.
    3. Filter the HTML, print the number of returned items and the first
       500 characters of the first one.
    4. Write all returned items, newline-joined, to ``filtered_content.md``
       in the current working directory.
    5. Print the filter's token usage via ``show_usage()``.

    The API token is read from the ``OPENAI_API_KEY`` environment variable;
    ``os.getenv`` yields ``None`` when it is unset.
    """
    # Create an HTML source that needs intelligent filtering
    url = "https://docs.python.org/3/tutorial/classes.html"

    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
    )

    # Swap in CacheMode.BYPASS here to skip the crawl cache
    # (presumably forces a fresh fetch -- confirm against crawl4ai docs).
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    # The crawler is only needed for the fetch; everything afterwards works
    # on the plain HTML string, so the browser is released right away.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First get the raw HTML
        result = await crawler.arun(url, config=run_config)
        html = result.cleaned_html

    # Initialize the LLM filter. Named ``content_filter`` so the builtin
    # ``filter`` is not shadowed.
    content_filter = LLMContentFilter(
        llm_config=LLMConfig(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
        ),
        chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192
        ignore_cache=True,
        instruction="""
        Extract the main educational content while preserving its original wording and substance completely. Your task is to:

        1. Maintain the exact language and terminology used in the main content
        2. Keep all technical explanations, examples, and educational content intact
        3. Preserve the original flow and structure of the core content
        4. Remove only clearly irrelevant elements like:
        - Navigation menus
        - Advertisement sections
        - Cookie notices
        - Footers with site information
        - Sidebars with external links
        - Any UI elements that don't contribute to learning

        The goal is to create a clean markdown version that reads exactly like the original article,
        keeping all valuable content but free from distracting elements. Imagine you're creating
        a perfect reading experience where nothing valuable is lost, but all noise is removed.
        """,
        verbose=True,
    )

    # Apply filtering
    filtered_content = content_filter.filter_content(html)

    # Show results
    print("\nFiltered Content Length:", len(filtered_content))
    print("\nFirst 500 chars of filtered content:")
    if filtered_content:
        # Only the first returned item is previewed.
        print(filtered_content[0][:500])

    # Save on disc the markdown version
    with open("filtered_content.md", "w", encoding="utf-8") as f:
        f.write("\n".join(filtered_content))

    # Show token usage
    content_filter.show_usage()
|
|
if __name__ == "__main__":
    # Script entry point: drive the async demo to completion.
    asyncio.run(test_llm_filter())