feat(release): prepare v0.4.3 beta release

Prepare the v0.4.3 beta release with major feature additions and improvements:
- Add JsonXPathExtractionStrategy and LLMContentFilter to exports
- Update version to 0.4.3b1
- Improve documentation for dispatchers and markdown generation
- Update development status to Beta
- Reorganize changelog format

BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit
This commit is contained in:
UncleCode
2025-01-21 21:03:11 +08:00
parent d09c611d15
commit 16b8d4945b
12 changed files with 885 additions and 287 deletions

View File

@@ -12,6 +12,7 @@ from crawl4ai import (
CrawlerMonitor,
DisplayMode,
CacheMode,
LXMLWebScrapingStrategy,
)
@@ -113,7 +114,7 @@ def create_performance_table(results):
async def main():
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy())
results = {
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),

View File

@@ -0,0 +1,87 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
# Create an HTML source that needs intelligent filtering
url = "https://docs.python.org/3/tutorial/classes.html"
browser_config = BrowserConfig(
headless=True,
verbose=True
)
# run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
async with AsyncWebCrawler(config=browser_config) as crawler:
# First get the raw HTML
result = await crawler.arun(url, config=run_config)
html = result.cleaned_html
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
- Key concepts and their explanations
- Important code examples
- Essential technical details
Exclude:
- Navigation elements
- Sidebars
- Footer content
- Version information
- Any non-essential UI elements
Format the output as clean markdown with proper code blocks and headers.
""",
verbose=True
)
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
1. Maintain the exact language and terminology used in the main content
2. Keep all technical explanations, examples, and educational content intact
3. Preserve the original flow and structure of the core content
4. Remove only clearly irrelevant elements like:
- Navigation menus
- Advertisement sections
- Cookie notices
- Footers with site information
- Sidebars with external links
- Any UI elements that don't contribute to learning
The goal is to create a clean markdown version that reads exactly like the original article,
keeping all valuable content but free from distracting elements. Imagine you're creating
a perfect reading experience where nothing valuable is lost, but all noise is removed.
""",
verbose=True
)
# Apply filtering
filtered_content = filter.filter_content(html, ignore_cache = True)
# Show results
print("\nFiltered Content Length:", len(filtered_content))
print("\nFirst 500 chars of filtered content:")
if filtered_content:
print(filtered_content[0][:500])
# Save on disc the markdown version
with open("filtered_content.md", "w", encoding="utf-8") as f:
f.write("\n".join(filtered_content))
# Show token usage
filter.show_usage()
if __name__ == "__main__":
asyncio.run(test_llm_filter())

View File

@@ -0,0 +1,135 @@
import time, re
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
import time
import functools
from collections import defaultdict
class TimingStats:
def __init__(self):
self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
def add(self, strategy_name, func_name, elapsed):
self.stats[strategy_name][func_name]["calls"] += 1
self.stats[strategy_name][func_name]["total_time"] += elapsed
def report(self):
for strategy_name, funcs in self.stats.items():
print(f"\n{strategy_name} Timing Breakdown:")
print("-" * 60)
print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
print("-" * 60)
for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True):
avg_ms = (data["total_time"] / data["calls"]) * 1000
print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
timing_stats = TimingStats()
# Modify timing decorator
def timing_decorator(strategy_name):
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
start = time.time()
result = func(*args, **kwargs)
elapsed = time.time() - start
timing_stats.add(strategy_name, func.__name__, elapsed)
return result
return wrapper
return decorator
# Modified decorator application
def apply_decorators(cls, method_name, strategy_name):
try:
original_method = getattr(cls, method_name)
decorated_method = timing_decorator(strategy_name)(original_method)
setattr(cls, method_name, decorated_method)
except AttributeError:
print(f"Method {method_name} not found in class {cls.__name__}.")
# Apply to key methods
methods_to_profile = [
'_scrap',
# 'process_element',
'_process_element',
'process_image',
]
# Apply decorators to both strategies
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
for method in methods_to_profile:
apply_decorators(strategy, method, name)
def generate_large_html(n_elements=1000):
html = ['<!DOCTYPE html><html><head></head><body>']
for i in range(n_elements):
html.append(f'''
<div class="article">
<h2>Heading {i}</h2>
<div>
<div>
<p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
</div>
</div>
<img src="image{i}.jpg" alt="Image {i}">
<ul>
<li>List item {i}.1</li>
<li>List item {i}.2</li>
</ul>
</div>
''')
html.append('</body></html>')
return ''.join(html)
def test_scraping():
# Initialize both scrapers
original_scraper = WebScrapingStrategy()
selected_scraper = LXMLWebScrapingStrategy()
# Generate test HTML
print("Generating HTML...")
html = generate_large_html(5000)
print(f"HTML Size: {len(html)/1024:.2f} KB")
# Time the scraping
print("\nStarting scrape...")
start_time = time.time()
kwargs = {
"url": "http://example.com",
"html": html,
"word_count_threshold": 5,
"keep_data_attributes": True
}
t1 = time.perf_counter()
result_selected = selected_scraper.scrap(**kwargs)
t2 = time.perf_counter()
result_original = original_scraper.scrap(**kwargs)
t3 = time.perf_counter()
elapsed = t3 - start_time
print(f"\nScraping completed in {elapsed:.2f} seconds")
timing_stats.report()
# Print stats of LXML output
print("\nLXML Output:")
print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}")
print(f"Extracted images: {len(result_selected['media']['images'])}")
print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB")
print(f"Scraping time: {t2 - t1:.2f} seconds")
# Print stats of original output
print("\nOriginal Output:")
print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}")
print(f"Extracted images: {len(result_original['media']['images'])}")
print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB")
print(f"Scraping time: {t3 - t1:.2f} seconds")
if __name__ == "__main__":
test_scraping()

View File

@@ -0,0 +1,252 @@
"""
Crawl4ai v0.4.3 Features Demo
============================
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
Each section showcases a specific feature with practical examples and explanations.
"""
import asyncio
import os
from crawl4ai import *
async def demo_memory_dispatcher():
"""
1. Memory Dispatcher System Demo
===============================
Shows how to use the new memory dispatcher with monitoring
"""
print("\n=== 1. Memory Dispatcher System Demo ===")
# Configure crawler
browser_config = BrowserConfig(headless=True, verbose=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator()
)
# Test URLs
urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
async with AsyncWebCrawler(config=browser_config) as crawler:
# Initialize dispatcher with monitoring
monitor = CrawlerMonitor(
max_visible_rows=10,
display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED
)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=80.0, # Memory usage threshold
check_interval=0.5, # How often to check memory
max_session_permit=5, # Max concurrent crawls
monitor=monitor, # Pass the monitor
)
# Run with memory monitoring
print("Starting batch crawl with memory monitoring...")
results = await dispatcher.run_urls(
urls=urls,
crawler=crawler,
config=crawler_config,
)
print(f"Completed {len(results)} URLs")
async def demo_streaming_support():
"""
2. Streaming Support Demo
======================
Shows how to process URLs as they complete using streaming
"""
print("\n=== 2. Streaming Support Demo ===")
browser_config = BrowserConfig(headless=True, verbose=True)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
# Test URLs
urls = ["http://example.com", "http://example.org", "http://example.net"] * 2
async with AsyncWebCrawler(config=browser_config) as crawler:
# Initialize dispatcher for streaming
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)
print("Starting streaming crawl...")
async for result in dispatcher.run_urls_stream(
urls=urls, crawler=crawler, config=crawler_config
):
# Process each result as it arrives
print(
f"Received result for {result.url} - Success: {result.result.success}"
)
if result.result.success:
print(f"Content length: {len(result.result.markdown)}")
async def demo_content_scraping():
"""
3. Content Scraping Strategy Demo
==============================
Demonstrates the new LXMLWebScrapingStrategy for faster content scraping.
"""
print("\n=== 3. Content Scraping Strategy Demo ===")
crawler = AsyncWebCrawler()
url = "https://example.com/article"
# Configure with the new LXML strategy
config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True)
print("Scraping content with LXML strategy...")
async with crawler:
result = await crawler.arun(url, config=config)
if result.success:
print("Successfully scraped content using LXML strategy")
async def demo_llm_markdown():
"""
4. LLM-Powered Markdown Generation Demo
===================================
Shows how to use the new LLM-powered content filtering and markdown generation.
"""
print("\n=== 4. LLM-Powered Markdown Generation Demo ===")
crawler = AsyncWebCrawler()
url = "https://docs.python.org/3/tutorial/classes.html"
content_filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv("OPENAI_API_KEY"),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
- Key concepts and their explanations
- Important code examples
- Essential technical details
Exclude:
- Navigation elements
- Sidebars
- Footer content
- Version information
- Any non-essential UI elements
Format the output as clean markdown with proper code blocks and headers.
""",
verbose=True,
)
# Configure LLM-powered markdown generation
config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=content_filter
),
cache_mode = CacheMode.BYPASS,
verbose=True
)
print("Generating focused markdown with LLM...")
async with crawler:
result = await crawler.arun(url, config=config)
if result.success and result.markdown_v2:
print("Successfully generated LLM-filtered markdown")
print("First 500 chars of filtered content:")
print(result.markdown_v2.fit_markdown[:500])
print("Successfully generated LLM-filtered markdown")
async def demo_robots_compliance():
"""
5. Robots.txt Compliance Demo
==========================
Demonstrates the new robots.txt compliance feature with SQLite caching.
"""
print("\n=== 5. Robots.txt Compliance Demo ===")
crawler = AsyncWebCrawler()
urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]
# Enable robots.txt checking
config = CrawlerRunConfig(check_robots_txt=True, verbose=True)
print("Crawling with robots.txt compliance...")
async with crawler:
results = await crawler.arun_many(urls, config=config)
for result in results:
if result.status_code == 403:
print(f"Access blocked by robots.txt: {result.url}")
elif result.success:
print(f"Successfully crawled: {result.url}")
async def demo_llm_schema_generation():
"""
7. LLM-Powered Schema Generation Demo
=================================
Demonstrates automatic CSS and XPath schema generation using LLM models.
"""
print("\n=== 7. LLM-Powered Schema Generation Demo ===")
# Example HTML content for a job listing
html_content = """
<div class="job-listing">
<h1 class="job-title">Senior Software Engineer</h1>
<div class="job-details">
<span class="location">San Francisco, CA</span>
<span class="salary">$150,000 - $200,000</span>
<div class="requirements">
<h2>Requirements</h2>
<ul>
<li>5+ years Python experience</li>
<li>Strong background in web crawling</li>
</ul>
</div>
</div>
</div>
"""
print("Generating CSS selectors schema...")
# Generate CSS selectors with a specific query
css_schema = JsonCssExtractionStrategy.generate_schema(
html_content,
schema_type="CSS",
query="Extract job title, location, and salary information",
provider="openai/gpt-4o", # or use other providers like "ollama"
)
print("\nGenerated CSS Schema:")
print(css_schema)
# Example of using the generated schema with crawler
crawler = AsyncWebCrawler()
url = "https://example.com/job-listing"
# Create an extraction strategy with the generated schema
extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)
config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)
print("\nTesting generated schema with crawler...")
async with crawler:
result = await crawler.arun(url, config=config)
if result.success:
print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None)
print("Successfully used generated schema for crawling")
async def main():
"""Run all feature demonstrations."""
demo_memory_dispatcher(),
print("\n" + "=" * 50 + "\n")
demo_streaming_support(),
print("\n" + "=" * 50 + "\n")
demo_content_scraping(),
print("\n" + "=" * 50 + "\n")
demo_llm_schema_generation(),
print("\n" + "=" * 50 + "\n")
demo_llm_markdown(),
print("\n" + "=" * 50 + "\n")
demo_robots_compliance(),
print("\n" + "=" * 50 + "\n")
if __name__ == "__main__":
asyncio.run(main())