feat(dispatcher): add streaming support for URL processing

Add new streaming capability to the MemoryAdaptiveDispatcher and AsyncWebCrawler
to allow processing URLs with real-time result streaming. This enables
processing results as they become available rather than waiting for all
URLs to complete.

Key changes:
- Add run_urls_stream method to MemoryAdaptiveDispatcher
- Update AsyncWebCrawler.arun_many to support streaming mode
- Add result queue for better result handling
- Improve type hints and documentation

BREAKING CHANGE: The return type of arun_many now depends on the 'stream'
parameter: List[CrawlResult] when stream=False, or
AsyncGenerator[CrawlResult, None] when stream=True.
This commit is contained in:
UncleCode
2025-01-19 14:03:34 +08:00
parent 3d09b6a221
commit e363234172
5 changed files with 817 additions and 83 deletions

View File

@@ -0,0 +1,54 @@
import os, sys
# Extend sys.path so the `crawl4ai` import below can be resolved when this
# script is run directly: first the directory one level above this file's
# directory, then that directory's parent. Both are appended (not inserted),
# so any `crawl4ai` found earlier on sys.path takes precedence.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
import asyncio
from crawl4ai import *
async def test_crawler():
    """Run ``AsyncWebCrawler.arun_many`` in streaming mode, then batch mode.

    The same URL list and run configuration are used for both passes, inside
    a single crawler context. In the streaming pass each result is printed as
    soon as it is yielded; in the batch pass the results are printed only
    after the whole list has been returned.
    """
    # Setup configurations
    browser_config = BrowserConfig(headless=True, verbose=False)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0,
            )
        ),
    )

    # Test URLs: three different hosts, each repeated 10 times -> 30 URLs.
    urls = [
        "http://example.com",
        "http://example.org",
        "http://example.net",
    ] * 10

    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n=== Testing Streaming Mode ===")
        # arun_many is awaited first; with stream=True the awaited value is
        # then consumed incrementally with `async for`.
        async for result in await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=True,
            verbose=True
        ):
            print(f"Received result for: {result.url} - Success: {result.success}")

        print("\n=== Testing Batch Mode ===")
        # With stream=False the awaited value supports len() and plain
        # iteration, i.e. all results arrive together.
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=False,
            verbose=True
        )
        print(f"Received all {len(results)} results at once")
        for result in results:
            print(f"Batch result for: {result.url} - Success: {result.success}")
if __name__ == "__main__":
    # Executed as a script: run the demo coroutine to completion.
    asyncio.run(test_crawler())

View File

@@ -0,0 +1,39 @@
import os, sys
# Extend sys.path so the `crawl4ai` imports below can be resolved when this
# script is run directly: first the directory one level above this file's
# directory, then that directory's parent. Both are appended (not inserted),
# so any `crawl4ai` found earlier on sys.path takes precedence.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
import asyncio
from typing import List
from crawl4ai import *
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
async def test_streaming():
    """Stream crawl results straight from a ``MemoryAdaptiveDispatcher``.

    Ten copies of one URL are handed to ``run_urls_stream`` together with an
    open crawler and a run configuration; every item the dispatcher yields is
    printed as it arrives.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=True)

    # No content filter is passed to the markdown generator in this test.
    markdown_gen = DefaultMarkdownGenerator()
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=markdown_gen,
    )

    targets = ["http://example.com"] * 10

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        mem_dispatcher = MemoryAdaptiveDispatcher(
            max_session_permit=5,
            check_interval=0.5,
        )
        stream = mem_dispatcher.run_urls_stream(targets, crawler, run_cfg)
        # Each yielded item exposes the URL directly and the crawl outcome
        # under its `.result` attribute.
        async for result in stream:
            print(f"Got result for {result.url} - Success: {result.result.success}")
if __name__ == "__main__":
    # Executed as a script: run the streaming demo coroutine to completion.
    asyncio.run(test_streaming())