Implements dynamic proxy rotation functionality with authentication support and IP verification. Updates include:

- Added proxy rotation demo in features example
- Updated proxy configuration handling in BrowserManager
- Added proxy rotation documentation
- Updated README with new proxy rotation feature
- Bumped version to 0.4.3b2

This change enables users to dynamically switch between proxies and verify the IP address used for each request.
"""
|
|
Crawl4ai v0.4.3 Features Demo
|
|
============================
|
|
|
|
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
|
|
Each section showcases a specific feature with practical examples and explanations.
|
|
"""

import asyncio
import json
import os
import random
import re
from typing import Dict, Optional

from crawl4ai import *


async def demo_memory_dispatcher():
    """
    1. Memory Dispatcher System Demo
    ================================
    Shows how to use the new memory dispatcher with monitoring.
    """
    print("\n=== 1. Memory Dispatcher System Demo ===")

    # Configure crawler
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator()
    )

    # Test URLs
    urls = ["http://example.com", "http://example.org", "http://example.net"] * 3

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Initialize dispatcher with monitoring
        monitor = CrawlerMonitor(
            max_visible_rows=10,
            display_mode=DisplayMode.DETAILED,  # Can be DETAILED or AGGREGATED
        )

        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=80.0,  # Memory usage threshold
            check_interval=0.5,  # How often to check memory
            max_session_permit=5,  # Max concurrent crawls
            monitor=monitor,  # Pass the monitor
        )

        # Run with memory monitoring
        print("Starting batch crawl with memory monitoring...")
        results = await dispatcher.run_urls(
            urls=urls,
            crawler=crawler,
            config=crawler_config,
        )
        print(f"Completed {len(results)} URLs")


async def demo_streaming_support():
    """
    2. Streaming Support Demo
    =========================
    Shows how to process URLs as they complete using streaming.
    """
    print("\n=== 2. Streaming Support Demo ===")

    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)

    # Test URLs
    urls = ["http://example.com", "http://example.org", "http://example.net"] * 2

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Initialize dispatcher for streaming
        dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)

        print("Starting streaming crawl...")
        async for result in dispatcher.run_urls_stream(
            urls=urls, crawler=crawler, config=crawler_config
        ):
            # Process each result as it arrives
            print(
                f"Received result for {result.url} - Success: {result.result.success}"
            )
            if result.result.success:
                print(f"Content length: {len(result.result.markdown)}")


async def demo_content_scraping():
    """
    3. Content Scraping Strategy Demo
    =================================
    Demonstrates the new LXMLWebScrapingStrategy for faster content scraping.
    """
    print("\n=== 3. Content Scraping Strategy Demo ===")

    crawler = AsyncWebCrawler()
    url = "https://example.com/article"

    # Configure with the new LXML strategy
    config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True)

    print("Scraping content with LXML strategy...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success:
            print("Successfully scraped content using LXML strategy")


async def demo_llm_markdown():
    """
    4. LLM-Powered Markdown Generation Demo
    =======================================
    Shows how to use the new LLM-powered content filtering and markdown generation.
    """
    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")

    crawler = AsyncWebCrawler()
    url = "https://docs.python.org/3/tutorial/classes.html"

    content_filter = LLMContentFilter(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="""
        Focus on extracting the core educational content about Python classes.
        Include:
        - Key concepts and their explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        - Version information
        - Any non-essential UI elements

        Format the output as clean markdown with proper code blocks and headers.
        """,
        verbose=True,
    )

    # Configure LLM-powered markdown generation
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(content_filter=content_filter),
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )

    print("Generating focused markdown with LLM...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.markdown_v2:
            print("Successfully generated LLM-filtered markdown")
            print("First 500 chars of filtered content:")
            print(result.markdown_v2.fit_markdown[:500])


async def demo_robots_compliance():
    """
    5. Robots.txt Compliance Demo
    =============================
    Demonstrates the new robots.txt compliance feature with SQLite caching.
    """
    print("\n=== 5. Robots.txt Compliance Demo ===")

    crawler = AsyncWebCrawler()
    urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]

    # Enable robots.txt checking
    config = CrawlerRunConfig(check_robots_txt=True, verbose=True)

    print("Crawling with robots.txt compliance...")
    async with crawler:
        results = await crawler.arun_many(urls, config=config)
        for result in results:
            if result.status_code == 403:
                print(f"Access blocked by robots.txt: {result.url}")
            elif result.success:
                print(f"Successfully crawled: {result.url}")


async def demo_llm_schema_generation():
    """
    6. LLM-Powered Schema Generation Demo
    =====================================
    Demonstrates automatic CSS and XPath schema generation using LLM models.
    """
    print("\n=== 6. LLM-Powered Schema Generation Demo ===")

    # Example HTML content for a job listing
    html_content = """
    <div class="job-listing">
        <h1 class="job-title">Senior Software Engineer</h1>
        <div class="job-details">
            <span class="location">San Francisco, CA</span>
            <span class="salary">$150,000 - $200,000</span>
            <div class="requirements">
                <h2>Requirements</h2>
                <ul>
                    <li>5+ years Python experience</li>
                    <li>Strong background in web crawling</li>
                </ul>
            </div>
        </div>
    </div>
    """

    print("Generating CSS selectors schema...")
    # Generate CSS selectors with a specific query. Schema generation calls the
    # configured LLM provider, so the matching API key must be available.
    css_schema = JsonCssExtractionStrategy.generate_schema(
        html_content,
        schema_type="CSS",
        query="Extract job title, location, and salary information",
        provider="openai/gpt-4o",  # or use other providers like "ollama"
    )
    print("\nGenerated CSS Schema:")
    print(css_schema)
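
    # The generated schema is a plain dict, so it can typically be saved to
    # disk (e.g. as JSON) and reused on later runs without further LLM calls.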

    # Example of using the generated schema with the crawler
    crawler = AsyncWebCrawler()
    url = "https://example.com/job-listing"

    # Create an extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)

    print("\nTesting generated schema with crawler...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.extracted_content:
            # extracted_content is a JSON string; pretty-print it
            print(json.dumps(json.loads(result.extracted_content), indent=2))
            print("Successfully used generated schema for crawling")


async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
    """Get the next proxy from a local file."""
    try:
        with open(proxy_file) as f:
            proxies = f.read().splitlines()
        if not proxies:
            return None

        ip, port, username, password = random.choice(proxies).split(":")
        return {
            "server": f"http://{ip}:{port}",
            "username": username,
            "password": password,
            "ip": ip,  # Store original IP for verification
        }
    except Exception as e:
        print(f"Error loading proxy: {e}")
        return None
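
# Note: get_next_proxy() expects a local "proxies.txt" file with one proxy
# per line in the form ip:port:username:password, e.g. (hypothetical values):
#   203.0.113.5:8080:user:pass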


async def demo_proxy_rotation():
    """
    7. Proxy Rotation Demo
    ======================
    Demonstrates how to rotate proxies for each request using Crawl4ai.
    """
    print("\n=== 7. Proxy Rotation Demo ===")

    # Create three test requests to httpbin (which echoes the caller's IP)
    urls = ["https://httpbin.org/ip"] * 3

    browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            proxy = await get_next_proxy()
            if not proxy:
                print("No proxy available, skipping...")
                continue

            # Create a new config with this proxy
            current_config = run_config.clone(proxy_config=proxy)
            result = await crawler.arun(url=url, config=current_config)

            if result.success:
                # httpbin.org/ip returns the requesting IP, so we can verify
                # that the request actually went through the proxy
                ip_match = re.search(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}", result.html)
                response_ip = ip_match.group(0) if ip_match else None
                print(f"Proxy {proxy['ip']} -> Response IP: {response_ip or 'Not found'}")
                if response_ip == proxy["ip"]:
                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                else:
                    print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Failed with proxy {proxy['ip']}")


async def main():
    """Run all feature demonstrations."""
    await demo_memory_dispatcher()
    print("\n" + "=" * 50 + "\n")
    await demo_streaming_support()
    print("\n" + "=" * 50 + "\n")
    await demo_content_scraping()
    print("\n" + "=" * 50 + "\n")
    await demo_llm_schema_generation()
    print("\n" + "=" * 50 + "\n")
    await demo_llm_markdown()
    print("\n" + "=" * 50 + "\n")
    await demo_robots_compliance()
    print("\n" + "=" * 50 + "\n")
    await demo_proxy_rotation()
    print("\n" + "=" * 50 + "\n")


if __name__ == "__main__":
    asyncio.run(main())