feat(dispatcher): add streaming support for URL processing

Add streaming capability to the MemoryAdaptiveDispatcher and AsyncWebCrawler to allow processing URLs with real-time result streaming. This enables processing results as they become available rather than waiting for all URLs to complete.

Key changes:
- Add run_urls_stream method to MemoryAdaptiveDispatcher
- Update AsyncWebCrawler.arun_many to support streaming mode
- Add result queue for better result handling
- Improve type hints and documentation

BREAKING CHANGE: The return type of arun_many now depends on the 'stream' parameter, returning either List[CrawlResult] (stream=False) or AsyncGenerator[CrawlResult, None] (stream=True)
2025-01-19 14:03:34 +08:00
parent 3d09b6a221
commit e363234172
5 changed files with 817 additions and 83 deletions
--- a/tests/20241401/test_stream.py
+++ b/tests/20241401/test_stream.py
@@ -0,0 +1,54 @@
+import os, sys
+# Add the tests/ directory and its parent (the repo root) to sys.path so crawl4ai can be imported
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+
+import asyncio
+from crawl4ai import *
+
+async def test_crawler():
+    # Build the browser and per-run configs, then exercise arun_many in streaming mode and in batch mode.
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(
+                threshold=0.48, 
+                threshold_type="fixed", 
+                min_word_threshold=0
+            )
+        ),
+    )
+
+    # Test URLs - three example domains, each repeated 10 times
+    urls = [
+        "http://example.com",
+        "http://example.org",
+        "http://example.net",
+    ] * 10  # 3 URLs x 10 = 30 total URLs
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        print("\n=== Testing Streaming Mode ===")
+        async for result in await crawler.arun_many(  # the awaited call's result is iterated with async for
+            urls=urls,
+            config=crawler_config,
+            stream=True,
+            verbose=True
+        ):
+            print(f"Received result for: {result.url} - Success: {result.success}")
+        # Batch mode: same URLs with stream=False; the awaited value supports len() and a plain for loop.
+        print("\n=== Testing Batch Mode ===")
+        results = await crawler.arun_many(
+            urls=urls,
+            config=crawler_config,
+            stream=False,
+            verbose=True
+        )
+        print(f"Received all {len(results)} results at once")
+        for result in results:
+            print(f"Batch result for: {result.url} - Success: {result.success}")
+
+if __name__ == "__main__":  # only when executed directly as a script
+    asyncio.run(test_crawler())  # run the coroutine to completion in a new event loop
--- a/tests/20241401/test_stream_dispatch.py
+++ b/tests/20241401/test_stream_dispatch.py
@@ -0,0 +1,39 @@
+import os, sys
+# Add the tests/ directory and its parent (the repo root) to sys.path so crawl4ai can be imported
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+
+
+import asyncio
+from typing import List
+from crawl4ai import *
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
+
+async def test_streaming():
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(  # no content_filter is passed: the arguments below are commented out
+            # content_filter=PruningContentFilter(
+            #     threshold=0.48, 
+            #     threshold_type="fixed", 
+            #     min_word_threshold=0
+            # )
+        ),
+    )
+
+    urls = ["http://example.com"] * 10  # the same URL, 10 times
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = MemoryAdaptiveDispatcher(
+            max_session_permit=5,
+            check_interval=0.5
+        )
+        # NOTE(review): items expose .url and a nested .result (with .success) - presumably a per-task wrapper; confirm.
+        async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):  # iterated directly, no await
+            print(f"Got result for {result.url} - Success: {result.result.success}")
+
+if __name__ == "__main__":  # only when executed directly as a script
+    asyncio.run(test_streaming())  # run the coroutine to completion in a new event loop