feat(release): prepare v0.4.3 beta release
Prepare the v0.4.3 beta release with major feature additions and improvements:

- Add JsonXPathExtractionStrategy and LLMContentFilter to exports
- Update version to 0.4.3b1
- Improve documentation for dispatchers and markdown generation
- Update development status to Beta
- Reorganize changelog format

BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit
This commit is contained in:
@@ -12,6 +12,7 @@ from crawl4ai import (
|
||||
CrawlerMonitor,
|
||||
DisplayMode,
|
||||
CacheMode,
|
||||
LXMLWebScrapingStrategy,
|
||||
)
|
||||
|
||||
|
||||
@@ -113,7 +114,7 @@ def create_performance_table(results):
|
||||
async def main():
|
||||
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy())
|
||||
|
||||
results = {
|
||||
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
|
||||
|
||||
87
docs/examples/llm_markdown_generator.py
Normal file
87
docs/examples/llm_markdown_generator.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
    """Crawl the Python classes tutorial and clean its HTML with an LLM filter.

    Steps:
      1. Fetch the page and take ``result.cleaned_html``.
      2. Run it through an ``LLMContentFilter`` (bypassing the filter cache).
      3. Print the number of returned chunks and a 500-character preview.
      4. Write the joined chunks to ``filtered_content.md``.
      5. Print the filter's usage report via ``show_usage()``.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable.
    """
    # Create an HTML source that needs intelligent filtering
    url = "https://docs.python.org/3/tutorial/classes.html"

    browser_config = BrowserConfig(headless=True, verbose=True)

    # Swap in CacheMode.BYPASS to avoid the crawl cache (presumably forces a
    # fresh fetch -- confirm against the CacheMode documentation).
    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First get the raw HTML
        result = await crawler.arun(url, config=run_config)
        html = result.cleaned_html

        # Named ``content_filter`` rather than ``filter`` so the builtin is
        # not shadowed. Only one filter is built: it is the one actually used.
        content_filter = LLMContentFilter(
            provider="openai/gpt-4o",
            api_token=os.getenv("OPENAI_API_KEY"),
            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192
            instruction="""
            Extract the main educational content while preserving its original wording and substance completely. Your task is to:

            1. Maintain the exact language and terminology used in the main content
            2. Keep all technical explanations, examples, and educational content intact
            3. Preserve the original flow and structure of the core content
            4. Remove only clearly irrelevant elements like:
            - Navigation menus
            - Advertisement sections
            - Cookie notices
            - Footers with site information
            - Sidebars with external links
            - Any UI elements that don't contribute to learning

            The goal is to create a clean markdown version that reads exactly like the original article,
            keeping all valuable content but free from distracting elements. Imagine you're creating
            a perfect reading experience where nothing valuable is lost, but all noise is removed.
            """,
            verbose=True,
        )

        # Apply filtering; the result is used below as a sequence of strings.
        filtered_content = content_filter.filter_content(html, ignore_cache=True)

        # Show results
        print("\nFiltered Content Length:", len(filtered_content))
        print("\nFirst 500 chars of filtered content:")
        if filtered_content:
            print(filtered_content[0][:500])

        # Save the markdown version to disk
        with open("filtered_content.md", "w", encoding="utf-8") as f:
            f.write("\n".join(filtered_content))

        # Show token usage
        content_filter.show_usage()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_llm_filter())
|
||||
135
docs/examples/scraping_strategies_performance.py
Normal file
135
docs/examples/scraping_strategies_performance.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
|
||||
class TimingStats:
    """Accumulate call counts and total elapsed time per strategy and function.

    ``stats`` maps ``strategy_name -> func_name -> {"calls", "total_time"}``;
    missing entries are created on first access with both counters at zero.
    """

    def __init__(self):
        def _blank_record():
            return {"calls": 0, "total_time": 0}

        def _per_strategy():
            return defaultdict(_blank_record)

        self.stats = defaultdict(_per_strategy)

    def add(self, strategy_name, func_name, elapsed):
        """Record one call of ``func_name`` that took ``elapsed`` seconds."""
        record = self.stats[strategy_name][func_name]
        record["calls"] += 1
        record["total_time"] += elapsed

    def report(self):
        """Print one table per strategy, slowest function (by total time) first."""
        rule = "-" * 60
        for strategy_name, funcs in self.stats.items():
            print(f"\n{strategy_name} Timing Breakdown:")
            print(rule)
            print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
            print(rule)

            ranked = sorted(
                funcs.items(),
                key=lambda entry: entry[1]["total_time"],
                reverse=True,
            )
            for func, data in ranked:
                # Average per call, converted from seconds to milliseconds.
                avg_ms = (data["total_time"] / data["calls"]) * 1000
                print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
|
||||
|
||||
timing_stats = TimingStats()
|
||||
|
||||
# Modify timing decorator
|
||||
def timing_decorator(strategy_name):
    """Build a decorator that records each call's duration in ``timing_stats``.

    Args:
        strategy_name: Label under which timings are grouped in the
            module-level ``timing_stats`` collector.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns; after each successful call the elapsed time is added to
        ``timing_stats`` under ``(strategy_name, func.__name__)``. A call that
        raises is not recorded.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic and high-resolution, so the measured
            # interval is unaffected by system clock adjustments.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            timing_stats.add(strategy_name, func.__name__, elapsed)
            return result
        return wrapper
    return decorator
|
||||
|
||||
# Modified decorator application
|
||||
def apply_decorators(cls, method_name, strategy_name):
    """Replace ``cls.<method_name>`` with a timed version of itself.

    The attribute is looked up on the class, wrapped with
    ``timing_decorator(strategy_name)`` and assigned back. If an
    ``AttributeError`` is raised along the way (e.g. the method does not
    exist), a message is printed and the class is left as it was.
    """
    try:
        target = getattr(cls, method_name)
        setattr(cls, method_name, timing_decorator(strategy_name)(target))
    except AttributeError:
        print(f"Method {method_name} not found in class {cls.__name__}.")
|
||||
|
||||
# Names of the strategy methods that get wrapped with the timing decorator.
methods_to_profile = [
    '_scrap',
    # 'process_element',
    '_process_element',
    'process_image',
]

# Instrument both strategy classes; the label is the heading used in the
# timing report.
for strategy, name in (
    (WebScrapingStrategy, "Original"),
    (LXMLWebScrapingStrategy, "LXML"),
):
    for method in methods_to_profile:
        apply_decorators(strategy, method, name)
|
||||
|
||||
|
||||
def generate_large_html(n_elements=1000):
    """Return a synthetic HTML document containing ``n_elements`` article blocks.

    Each block holds a heading, a nested paragraph with a link, an image and
    a two-item list, all numbered with the block's zero-based index.
    """
    def _article(i):
        return f'''
        <div class="article">
            <h2>Heading {i}</h2>
            <div>
                <div>
                    <p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
                </div>
            </div>
            <img src="image{i}.jpg" alt="Image {i}">
            <ul>
                <li>List item {i}.1</li>
                <li>List item {i}.2</li>
            </ul>
        </div>
        '''

    articles = ''.join(_article(index) for index in range(n_elements))
    return '<!DOCTYPE html><html><head></head><body>' + articles + '</body></html>'
|
||||
|
||||
def test_scraping():
    """Benchmark both scraping strategies on the same generated HTML document.

    Runs ``LXMLWebScrapingStrategy`` and then ``WebScrapingStrategy`` over a
    5000-article synthetic page, prints the per-method timing breakdown
    collected in ``timing_stats``, and prints link/image counts, cleaned HTML
    size and wall time for each strategy.
    """
    # Initialize both scrapers
    original_scraper = WebScrapingStrategy()
    selected_scraper = LXMLWebScrapingStrategy()

    # Generate test HTML
    print("Generating HTML...")
    html = generate_large_html(5000)
    print(f"HTML Size: {len(html)/1024:.2f} KB")

    # Time the scraping
    print("\nStarting scrape...")

    kwargs = {
        "url": "http://example.com",
        "html": html,
        "word_count_threshold": 5,
        "keep_data_attributes": True,
    }

    # All timestamps come from the same clock (perf_counter) so that the
    # differences below are meaningful.
    t1 = time.perf_counter()
    result_selected = selected_scraper.scrap(**kwargs)
    t2 = time.perf_counter()

    result_original = original_scraper.scrap(**kwargs)
    t3 = time.perf_counter()

    # Total time covers both scraper runs.
    elapsed = t3 - t1
    print(f"\nScraping completed in {elapsed:.2f} seconds")

    timing_stats.report()

    # Per-strategy summary; each run is timed over its own interval only
    # (LXML: t1..t2, original: t2..t3).
    for label, result, seconds in (
        ("LXML", result_selected, t2 - t1),
        ("Original", result_original, t3 - t2),
    ):
        links = result['links']
        print(f"\n{label} Output:")
        print(f"\nExtracted links: {len(links['internal']) + len(links['external'])}")
        print(f"Extracted images: {len(result['media']['images'])}")
        print(f"Clean HTML size: {len(result['cleaned_html'])/1024:.2f} KB")
        print(f"Scraping time: {seconds:.2f} seconds")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_scraping()
|
||||
252
docs/examples/v0_4_3_features_demo.py
Normal file
252
docs/examples/v0_4_3_features_demo.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""
|
||||
Crawl4ai v0.4.3 Features Demo
|
||||
============================
|
||||
|
||||
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
|
||||
Each section showcases a specific feature with practical examples and explanations.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import *
|
||||
|
||||
|
||||
async def demo_memory_dispatcher():
    """
    1. Memory Dispatcher System Demo
    ===============================
    Crawls a small batch of URLs through a MemoryAdaptiveDispatcher that
    reports progress to a CrawlerMonitor, then prints how many results
    came back.
    """
    print("\n=== 1. Memory Dispatcher System Demo ===")

    # Crawler settings
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(),
    )

    # Three hosts, each listed three times
    base_urls = ["http://example.com", "http://example.org", "http://example.net"]
    urls = base_urls * 3

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Live monitor; display_mode can be DETAILED or AGGREGATED
        monitor = CrawlerMonitor(
            max_visible_rows=10,
            display_mode=DisplayMode.DETAILED,
        )

        dispatcher_options = {
            "memory_threshold_percent": 80.0,  # Memory usage threshold
            "check_interval": 0.5,  # How often to check memory
            "max_session_permit": 5,  # Max concurrent crawls
            "monitor": monitor,  # Pass the monitor
        }
        dispatcher = MemoryAdaptiveDispatcher(**dispatcher_options)

        # Run the whole batch under memory monitoring
        print("Starting batch crawl with memory monitoring...")
        results = await dispatcher.run_urls(
            urls=urls, crawler=crawler, config=crawler_config
        )
        print(f"Completed {len(results)} URLs")
|
||||
|
||||
|
||||
async def demo_streaming_support():
    """
    2. Streaming Support Demo
    ======================
    Iterates over dispatcher results one at a time with ``async for``
    instead of waiting for the whole batch, printing each as it is received.
    """
    print("\n=== 2. Streaming Support Demo ===")

    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)

    # Three hosts, each listed twice
    urls = ["http://example.com", "http://example.org", "http://example.net"] * 2

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Dispatcher used for the streamed run
        dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)

        print("Starting streaming crawl...")
        stream = dispatcher.run_urls_stream(
            urls=urls, crawler=crawler, config=crawler_config
        )
        async for result in stream:
            # Handle each item as soon as it is yielded
            succeeded = result.result.success
            print(f"Received result for {result.url} - Success: {succeeded}")
            if not succeeded:
                continue
            print(f"Content length: {len(result.result.markdown)}")
|
||||
|
||||
|
||||
async def demo_content_scraping():
    """
    3. Content Scraping Strategy Demo
    ==============================
    Crawls a single URL with LXMLWebScrapingStrategy set as the scraping
    strategy and reports success.
    """
    print("\n=== 3. Content Scraping Strategy Demo ===")

    crawler = AsyncWebCrawler()
    url = "https://example.com/article"

    # Select the LXML-based strategy through the run configuration
    config = CrawlerRunConfig(
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    print("Scraping content with LXML strategy...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if not result.success:
            return
        print("Successfully scraped content using LXML strategy")
|
||||
|
||||
|
||||
async def demo_llm_markdown():
    """
    4. LLM-Powered Markdown Generation Demo
    ===================================
    Crawls the Python classes tutorial with a DefaultMarkdownGenerator whose
    content filter is an LLMContentFilter, then prints the first 500
    characters of the filtered markdown.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable.
    """
    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")

    crawler = AsyncWebCrawler()
    url = "https://docs.python.org/3/tutorial/classes.html"

    content_filter = LLMContentFilter(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="""
        Focus on extracting the core educational content about Python classes.
        Include:
        - Key concepts and their explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        - Version information
        - Any non-essential UI elements

        Format the output as clean markdown with proper code blocks and headers.
        """,
        verbose=True,
    )

    # Configure LLM-powered markdown generation
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=content_filter
        ),
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )

    print("Generating focused markdown with LLM...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.markdown_v2:
            # Report success once, and only when it actually happened.
            print("Successfully generated LLM-filtered markdown")
            print("First 500 chars of filtered content:")
            # Guard against a missing fit_markdown so the preview cannot
            # raise a TypeError when slicing.
            print((result.markdown_v2.fit_markdown or "")[:500])
        else:
            print("LLM-filtered markdown was not generated")
|
||||
|
||||
|
||||
async def demo_robots_compliance():
    """
    5. Robots.txt Compliance Demo
    ==========================
    Crawls several sites with ``check_robots_txt=True`` and prints, per URL,
    whether the crawl got a 403 status or succeeded.
    """
    print("\n=== 5. Robots.txt Compliance Demo ===")

    crawler = AsyncWebCrawler()
    urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]

    # Turn on robots.txt checking for this run
    config = CrawlerRunConfig(check_robots_txt=True, verbose=True)

    print("Crawling with robots.txt compliance...")
    async with crawler:
        results = await crawler.arun_many(urls, config=config)
        for result in results:
            # A 403 status is reported as a robots.txt block.
            if result.status_code == 403:
                print(f"Access blocked by robots.txt: {result.url}")
                continue
            if result.success:
                print(f"Successfully crawled: {result.url}")
|
||||
|
||||
|
||||
|
||||
async def demo_llm_schema_generation():
    """
    7. LLM-Powered Schema Generation Demo
    =================================
    Generates a CSS extraction schema from a sample job-listing snippet with
    ``JsonCssExtractionStrategy.generate_schema`` and then crawls a URL using
    an extraction strategy built from that schema.
    """
    # Imported locally because the module's import block does not bring
    # ``json`` into scope.
    import json

    print("\n=== 7. LLM-Powered Schema Generation Demo ===")

    # Example HTML content for a job listing
    html_content = """
    <div class="job-listing">
        <h1 class="job-title">Senior Software Engineer</h1>
        <div class="job-details">
            <span class="location">San Francisco, CA</span>
            <span class="salary">$150,000 - $200,000</span>
            <div class="requirements">
                <h2>Requirements</h2>
                <ul>
                    <li>5+ years Python experience</li>
                    <li>Strong background in web crawling</li>
                </ul>
            </div>
        </div>
    </div>
    """

    print("Generating CSS selectors schema...")
    # Generate CSS selectors with a specific query
    css_schema = JsonCssExtractionStrategy.generate_schema(
        html_content,
        schema_type="CSS",
        query="Extract job title, location, and salary information",
        provider="openai/gpt-4o",  # or use other providers like "ollama"
    )
    print("\nGenerated CSS Schema:")
    print(css_schema)

    # Example of using the generated schema with crawler
    crawler = AsyncWebCrawler()
    url = "https://example.com/job-listing"

    # Create an extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)

    print("\nTesting generated schema with crawler...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success:
            extracted = result.extracted_content
            if not extracted:
                print(None)
            else:
                # NOTE(review): extracted_content may already be a JSON
                # string -- confirm. If so, decode it first so that indent=2
                # pretty-prints the data instead of re-quoting the string.
                if isinstance(extracted, str):
                    try:
                        extracted = json.loads(extracted)
                    except json.JSONDecodeError:
                        pass  # not JSON text; dump the raw string as before
                print(json.dumps(extracted, indent=2))
            print("Successfully used generated schema for crawling")
|
||||
|
||||
|
||||
async def main():
    """Run all feature demonstrations, one after another.

    Each demo coroutine is awaited in turn; calling it without ``await``
    would only create a coroutine object and never execute the demo.
    A separator line is printed after every demo.
    """
    demos = (
        demo_memory_dispatcher,
        demo_streaming_support,
        demo_content_scraping,
        demo_llm_schema_generation,
        demo_llm_markdown,
        demo_robots_compliance,
    )
    separator = "\n" + "=" * 50 + "\n"
    for demo in demos:
        await demo()
        print(separator)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user