refactor(dispatcher): migrate to modular dispatcher system with enhanced monitoring

Reorganize dispatcher functionality into separate components:
- Create dedicated dispatcher classes (MemoryAdaptiveDispatcher, SemaphoreDispatcher)
- Add RateLimiter for smart request throttling
- Implement CrawlerMonitor for real-time progress tracking
- Move dispatcher config from CrawlerRunConfig to separate classes

BREAKING CHANGE: Dispatcher configuration moved from CrawlerRunConfig to dedicated dispatcher classes. Users need to update their configuration approach for multi-URL crawling.
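
A minimal before/after sketch of the migration (based only on the example file updated in this diff; `urls` and an already-opened `crawler` are assumed, and the "before" field names are the ones removed below):

    # Before: throttling and concurrency limits were fields on CrawlerRunConfig
    run_config = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
        ),
        memory_threshold_percent=70.0,
        max_session_permit=10,
    )
    results = await crawler.arun_many(urls, config=run_config)

    # After: the same knobs move to a dispatcher object passed to arun_many();
    # CrawlerRunConfig keeps only per-crawl settings (e.g. cache_mode)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        max_session_permit=10,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2),
        monitor=CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED),
    )
    results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)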
UncleCode
2025-01-11 21:10:27 +08:00
parent 3865342c93
commit 825c78a048
19 changed files with 1742 additions and 484 deletions


@@ -1,67 +1,121 @@
-import asyncio, time
-from crawl4ai.async_webcrawler import AsyncWebCrawler
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
+import asyncio
+import time
+from rich import print
+from rich.table import Table
+from crawl4ai import (
+    AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
+    MemoryAdaptiveDispatcher, SemaphoreDispatcher,
+    RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
+)
 
-async def crawl_with_rate_limiting(urls):
-    """
-    Example function demonstrating how to use AsyncWebCrawler with rate limiting and resource monitoring.
-
-    Args:
-        urls (List[str]): List of URLs to crawl
-
-    Returns:
-        List[CrawlResult]: List of crawl results for each URL
-    """
-    # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True,  # Run browser in headless mode
-        verbose=False   # Minimize browser logging
-    )
-
-    # Configure crawler settings with rate limiting
-    run_config = CrawlerRunConfig(
-        # Enable rate limiting
-        enable_rate_limiting=True,
-        rate_limit_config=RateLimitConfig(
-            base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds between requests
-            max_delay=30.0,  # Maximum delay after rate limit hits
-            max_retries=2,  # Number of retries before giving up
-            rate_limit_codes=[429, 503]  # HTTP status codes to trigger rate limiting
-        ),
-        # Resource monitoring settings
-        memory_threshold_percent=70.0,  # Pause crawling if memory usage exceeds this
-        check_interval=0.5,  # How often to check resource usage
-        max_session_permit=10,  # Maximum concurrent crawls
-        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
-    )
-
-    # Create and use crawler with context manager
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        results = await crawler.arun_many(urls, config=run_config)
-        return results
-
-def main():
-    # Example URLs (replace with real URLs)
-    urls = [
-        f"https://example.com/page{i}" for i in range(1, 40)
-    ]
-    start = time.perf_counter()
-
-    # Run the crawler
-    results = asyncio.run(crawl_with_rate_limiting(urls))
-
-    # Process results
-    successful_results = [result for result in results if result.success]
-    failed_results = [result for result in results if not result.success]
-    end = time.perf_counter()
-
-    # Print results
-    print(f"Successful crawls: {len(successful_results)}")
-    print(f"Failed crawls: {len(failed_results)}")
-    print(f"Time taken: {end - start:.2f} seconds")
+async def memory_adaptive(urls, browser_config, run_config):
+    """Memory adaptive crawler with monitoring"""
+    start = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=70.0,
+            max_session_permit=10,
+            monitor=CrawlerMonitor(
+                max_visible_rows=15,
+                display_mode=DisplayMode.DETAILED
+            )
+        )
+        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    duration = time.perf_counter() - start
+    return len(results), duration
+
+async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
+    """Memory adaptive crawler with rate limiting"""
+    start = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=70.0,
+            max_session_permit=10,
+            rate_limiter=RateLimiter(
+                base_delay=(1.0, 2.0),
+                max_delay=30.0,
+                max_retries=2
+            ),
+            monitor=CrawlerMonitor(
+                max_visible_rows=15,
+                display_mode=DisplayMode.DETAILED
+            )
+        )
+        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    duration = time.perf_counter() - start
+    return len(results), duration
+
+async def semaphore(urls, browser_config, run_config):
+    """Basic semaphore crawler"""
+    start = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = SemaphoreDispatcher(
+            semaphore_count=5,
+            monitor=CrawlerMonitor(
+                max_visible_rows=15,
+                display_mode=DisplayMode.DETAILED
+            )
+        )
+        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    duration = time.perf_counter() - start
+    return len(results), duration
+
+async def semaphore_with_rate_limit(urls, browser_config, run_config):
+    """Semaphore crawler with rate limiting"""
+    start = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = SemaphoreDispatcher(
+            semaphore_count=5,
+            rate_limiter=RateLimiter(
+                base_delay=(1.0, 2.0),
+                max_delay=30.0,
+                max_retries=2
+            ),
+            monitor=CrawlerMonitor(
+                max_visible_rows=15,
+                display_mode=DisplayMode.DETAILED
+            )
+        )
+        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    duration = time.perf_counter() - start
+    return len(results), duration
+
+def create_performance_table(results):
+    """Creates a rich table showing performance results"""
+    table = Table(title="Crawler Strategy Performance Comparison")
+    table.add_column("Strategy", style="cyan")
+    table.add_column("URLs Crawled", justify="right", style="green")
+    table.add_column("Time (seconds)", justify="right", style="yellow")
+    table.add_column("URLs/second", justify="right", style="magenta")
+
+    sorted_results = sorted(results.items(), key=lambda x: x[1][1])
+    for strategy, (urls_crawled, duration) in sorted_results:
+        urls_per_second = urls_crawled / duration
+        table.add_row(
+            strategy,
+            str(urls_crawled),
+            f"{duration:.2f}",
+            f"{urls_per_second:.2f}"
+        )
+    return table
+
+async def main():
+    urls = [f"https://example.com/page{i}" for i in range(1, 20)]
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    results = {
+        "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
+        "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
+        "Semaphore": await semaphore(urls, browser_config, run_config),
+        "Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
+    }
+
+    table = create_performance_table(results)
+    print("\nPerformance Summary:")
+    print(table)
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())