feat(crawler): add memory-adaptive dispatcher with rate limiting
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate-limiting capabilities.

Changes include:
- Added RateLimitConfig dataclass for configuring rate-limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior.
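For reference, here is a minimal sketch of what the new RateLimitConfig dataclass plausibly looks like, inferred from the fields exercised in the example below (base_delay, max_delay, max_retries, rate_limit_codes); the default values are assumptions, and the actual definition in crawl4ai.async_configs may differ:

    from dataclasses import dataclass, field
    from typing import List, Tuple

    @dataclass
    class RateLimitConfig:
        # Random (min, max) delay in seconds applied between requests (assumed default)
        base_delay: Tuple[float, float] = (1.0, 3.0)
        # Upper bound on the backoff delay after a rate-limit response (assumed default)
        max_delay: float = 60.0
        # Retries for a rate-limited URL before marking it failed (assumed default)
        max_retries: int = 3
        # HTTP status codes treated as rate-limit signals (assumed default)
        rate_limit_codes: List[int] = field(default_factory=lambda: [429, 503])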
docs/examples/dispatcher_example.py (new file, 67 lines)
@@ -0,0 +1,67 @@
import asyncio
import time

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


async def crawl_with_rate_limiting(urls):
    """
    Example function demonstrating how to use AsyncWebCrawler with
    rate limiting and resource monitoring.

    Args:
        urls (List[str]): List of URLs to crawl

    Returns:
        List[CrawlResult]: List of crawl results for each URL
    """
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,  # Run browser in headless mode
        verbose=False   # Minimize browser logging
    )

    # Configure crawler settings with rate limiting
    run_config = CrawlerRunConfig(
        # Enable rate limiting
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),       # Random delay of 1-2 seconds between requests
            max_delay=30.0,              # Maximum delay after rate-limit hits
            max_retries=2,               # Number of retries before giving up
            rate_limit_codes=[429, 503]  # HTTP status codes that trigger rate limiting
        ),
        # Resource monitoring settings
        memory_threshold_percent=70.0,   # Pause crawling if memory usage exceeds this
        check_interval=0.5,              # How often (seconds) to check resource usage
        max_session_permit=10,           # Maximum concurrent crawls
        display_mode=DisplayMode.DETAILED.value  # Show detailed progress
    )

    # Create and use crawler with context manager
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        return results


def main():
    # Example URLs (replace with real URLs)
    urls = [
        f"https://example.com/page{i}" for i in range(1, 40)
    ]

    start = time.perf_counter()

    # Run the crawler
    results = asyncio.run(crawl_with_rate_limiting(urls))

    # Process results
    successful_results = [result for result in results if result.success]
    failed_results = [result for result in results if not result.success]

    end = time.perf_counter()

    # Print results
    print(f"Successful crawls: {len(successful_results)}")
    print(f"Failed crawls: {len(failed_results)}")
    print(f"Time taken: {end - start:.2f} seconds")


if __name__ == "__main__":
    main()
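The memory-adaptive side of the dispatcher is configured but not visible in this example. As a rough mental model only (acquire_slot is a hypothetical helper, not crawl4ai's actual implementation), memory_threshold_percent, check_interval, and max_session_permit can be pictured as a psutil memory check plus an asyncio.Semaphore gating how many crawls launch:

    import asyncio
    import psutil  # assumes psutil is available for system memory stats

    async def acquire_slot(semaphore: asyncio.Semaphore,
                           memory_threshold_percent: float = 70.0,
                           check_interval: float = 0.5) -> None:
        # Hypothetical sketch: block while system memory usage is above the
        # threshold, re-checking every check_interval seconds.
        while psutil.virtual_memory().percent >= memory_threshold_percent:
            await asyncio.sleep(check_interval)
        # max_session_permit maps onto the semaphore's initial value, capping
        # the number of concurrent crawls.
        await semaphore.acquire()

Under this model, the BREAKING CHANGE note follows naturally: callers that relied on arun_many launching all URLs at once may now see tasks queued behind the permit and memory gates.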