refactor(dispatcher): migrate to modular dispatcher system with enhanced monitoring
Reorganize dispatcher functionality into separate components:

- Create dedicated dispatcher classes (MemoryAdaptive, Semaphore)
- Add RateLimiter for smart request throttling
- Implement CrawlerMonitor for real-time progress tracking
- Move dispatcher config from CrawlerRunConfig to separate classes

BREAKING CHANGE: Dispatcher configuration moved from CrawlerRunConfig to dedicated dispatcher classes. Users need to update their configuration approach for multi-URL crawling.
This commit is contained in:
@@ -1,67 +1,121 @@
|
||||
import asyncio, time
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||
from crawl4ai.dispatcher import DisplayMode
|
||||
import asyncio
|
||||
import time
|
||||
from rich import print
|
||||
from rich.table import Table
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
|
||||
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
|
||||
)
|
||||
|
||||
async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with rate limiting and resource monitoring.

    Throttling, memory and display options are configured on a
    MemoryAdaptiveDispatcher (with a RateLimiter and a CrawlerMonitor) and
    handed to ``arun_many``; CrawlerRunConfig no longer carries them.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,  # Run browser in headless mode
        verbose=False,  # Minimize browser logging
    )

    # Per-crawl settings; dispatcher options live on the dispatcher below
    run_config = CrawlerRunConfig()

    # Configure rate limiting
    rate_limiter = RateLimiter(
        base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds between requests
        max_delay=30.0,  # Maximum delay after rate limit hits
        max_retries=2,  # Number of retries before giving up
        rate_limit_codes=[429, 503],  # HTTP status codes to trigger rate limiting
    )

    # Resource monitoring settings
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # Pause crawling if memory usage exceeds this
        check_interval=0.5,  # How often to check resource usage
        max_session_permit=10,  # Maximum concurrent crawls
        rate_limiter=rate_limiter,
        monitor=CrawlerMonitor(
            max_visible_rows=15,
            display_mode=DisplayMode.DETAILED,  # Show detailed progress
        ),
    )

    # Create and use crawler with context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
        return results
|
||||
|
||||
def main():
|
||||
# Example URLs (replace with real URLs)
|
||||
urls = [
|
||||
f"https://example.com/page{i}" for i in range(1, 40)
|
||||
]
|
||||
|
||||
async def memory_adaptive(urls, browser_config, run_config):
    """
    Crawl ``urls`` through a MemoryAdaptiveDispatcher with a live monitor.

    Args:
        urls: URLs passed to ``arun_many``.
        browser_config: Configuration given to AsyncWebCrawler.
        run_config: Per-crawl configuration given to ``arun_many``.

    Returns:
        tuple: ``(number of results, elapsed seconds)``. The clock starts
        before the crawler is opened, so start-up time is included.
    """
    started_at = time.perf_counter()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        progress_monitor = CrawlerMonitor(
            max_visible_rows=15,
            display_mode=DisplayMode.DETAILED,
        )
        adaptive_dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=70.0,
            max_session_permit=10,
            monitor=progress_monitor,
        )
        crawl_results = await crawler.arun_many(
            urls,
            config=run_config,
            dispatcher=adaptive_dispatcher,
        )

    elapsed = time.perf_counter() - started_at
    return len(crawl_results), elapsed
|
||||
|
||||
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
    """
    Crawl ``urls`` through a MemoryAdaptiveDispatcher that also rate-limits.

    Args:
        urls: URLs passed to ``arun_many``.
        browser_config: Configuration given to AsyncWebCrawler.
        run_config: Per-crawl configuration given to ``arun_many``.

    Returns:
        tuple: ``(number of results, elapsed seconds)``. The clock starts
        before the crawler is opened, so start-up time is included.
    """
    started_at = time.perf_counter()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        throttle = RateLimiter(
            base_delay=(1.0, 2.0),
            max_delay=30.0,
            max_retries=2,
        )
        progress_monitor = CrawlerMonitor(
            max_visible_rows=15,
            display_mode=DisplayMode.DETAILED,
        )
        adaptive_dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=70.0,
            max_session_permit=10,
            rate_limiter=throttle,
            monitor=progress_monitor,
        )
        crawl_results = await crawler.arun_many(
            urls,
            config=run_config,
            dispatcher=adaptive_dispatcher,
        )

    elapsed = time.perf_counter() - started_at
    return len(crawl_results), elapsed
|
||||
|
||||
async def semaphore(urls, browser_config, run_config):
    """
    Crawl ``urls`` through a SemaphoreDispatcher with a live monitor.

    Args:
        urls: URLs passed to ``arun_many``.
        browser_config: Configuration given to AsyncWebCrawler.
        run_config: Per-crawl configuration given to ``arun_many``.

    Returns:
        tuple: ``(number of results, elapsed seconds)``. The clock starts
        before the crawler is opened, so start-up time is included.
    """
    started_at = time.perf_counter()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        progress_monitor = CrawlerMonitor(
            max_visible_rows=15,
            display_mode=DisplayMode.DETAILED,
        )
        bounded_dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            monitor=progress_monitor,
        )
        crawl_results = await crawler.arun_many(
            urls,
            config=run_config,
            dispatcher=bounded_dispatcher,
        )

    elapsed = time.perf_counter() - started_at
    return len(crawl_results), elapsed
|
||||
|
||||
async def semaphore_with_rate_limit(urls, browser_config, run_config):
    """
    Crawl ``urls`` through a SemaphoreDispatcher that also rate-limits.

    Args:
        urls: URLs passed to ``arun_many``.
        browser_config: Configuration given to AsyncWebCrawler.
        run_config: Per-crawl configuration given to ``arun_many``.

    Returns:
        tuple: ``(number of results, elapsed seconds)``. The clock starts
        before the crawler is opened, so start-up time is included.
    """
    started_at = time.perf_counter()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        throttle = RateLimiter(
            base_delay=(1.0, 2.0),
            max_delay=30.0,
            max_retries=2,
        )
        progress_monitor = CrawlerMonitor(
            max_visible_rows=15,
            display_mode=DisplayMode.DETAILED,
        )
        bounded_dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            rate_limiter=throttle,
            monitor=progress_monitor,
        )
        crawl_results = await crawler.arun_many(
            urls,
            config=run_config,
            dispatcher=bounded_dispatcher,
        )

    elapsed = time.perf_counter() - started_at
    return len(crawl_results), elapsed
|
||||
|
||||
def create_performance_table(results):
    """
    Creates a rich table showing performance results.

    Args:
        results (dict): Maps a strategy name to a ``(urls_crawled, duration)``
            tuple, where ``duration`` is in seconds.

    Returns:
        Table: One row per strategy, ordered by ascending duration.
    """
    table = Table(title="Crawler Strategy Performance Comparison")
    table.add_column("Strategy", style="cyan")
    table.add_column("URLs Crawled", justify="right", style="green")
    table.add_column("Time (seconds)", justify="right", style="yellow")
    table.add_column("URLs/second", justify="right", style="magenta")

    # Fastest strategy first: sort on the duration element of each value.
    sorted_results = sorted(results.items(), key=lambda item: item[1][1])

    for strategy, (urls_crawled, duration) in sorted_results:
        # A zero duration has no meaningful rate; show a placeholder rather
        # than raising ZeroDivisionError.
        if duration > 0:
            rate_text = f"{urls_crawled / duration:.2f}"
        else:
            rate_text = "n/a"
        table.add_row(
            strategy,
            str(urls_crawled),
            f"{duration:.2f}",
            rate_text,
        )

    return table
|
||||
|
||||
async def main():
    """
    Run each dispatcher strategy in turn and print a comparison table.

    The strategies run sequentially, one after another, against the same
    19 placeholder URLs (``page1`` .. ``page19``) with caching bypassed.
    """
    urls = [f"https://example.com/page{i}" for i in range(1, 20)]
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    # (label, coroutine function) pairs, executed in this order.
    strategies = (
        ("Memory Adaptive", memory_adaptive),
        ("Memory Adaptive + Rate Limit", memory_adaptive_with_rate_limit),
        ("Semaphore", semaphore),
        ("Semaphore + Rate Limit", semaphore_with_rate_limit),
    )

    results = {}
    for label, run_strategy in strategies:
        results[label] = await run_strategy(urls, browser_config, run_config)

    table = create_performance_table(results)
    print("\nPerformance Summary:")
    print(table)
|
||||
|
||||
if __name__ == "__main__":
    # main() is a coroutine function: it has to be driven by an event loop.
    # A bare main() call would only create a coroutine that is never awaited.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user