feat(dispatcher): add streaming support for URL processing
Add new streaming capability to the MemoryAdaptiveDispatcher and AsyncWebCrawler to allow processing URLs with real-time result streaming. This enables processing results as they become available rather than waiting for all URLs to complete. Key changes: - Add run_urls_stream method to MemoryAdaptiveDispatcher - Update AsyncWebCrawler.arun_many to support streaming mode - Add result queue for better result handling - Improve type hints and documentation BREAKING CHANGE: The return type of arun_many now depends on the 'stream' parameter, returning either List[CrawlResult] or AsyncGenerator[CrawlResult, None]
This commit is contained in:
54
tests/20241401/test_stream.py
Normal file
54
tests/20241401/test_stream.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import os, sys

# Append two parent directories to sys.path so crawl4ai can be imported
# when this test is run directly from a repository checkout.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)

import asyncio

from crawl4ai import *


async def test_crawler():
    """Exercise AsyncWebCrawler.arun_many in streaming and batch modes.

    With stream=True, arun_many yields each result as it completes;
    with stream=False, it returns the full list of results at once.
    Progress is reported via print for manual inspection.
    """
    # Setup configurations
    browser_config = BrowserConfig(headless=True, verbose=False)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0,
            )
        ),
    )

    # Test URLs - mix of different sites
    urls = [
        "http://example.com",
        "http://example.org",
        "http://example.net",
    ] * 10  # 30 total URLs (3 distinct hosts x 10)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n=== Testing Streaming Mode ===")
        # In streaming mode the coroutine resolves to an async generator,
        # so the call itself is awaited before iterating it.
        async for result in await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=True,
            verbose=True,
        ):
            print(f"Received result for: {result.url} - Success: {result.success}")

        print("\n=== Testing Batch Mode ===")
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            stream=False,
            verbose=True,
        )
        print(f"Received all {len(results)} results at once")
        for result in results:
            print(f"Batch result for: {result.url} - Success: {result.success}")


if __name__ == "__main__":
    asyncio.run(test_crawler())
|
||||
39
tests/20241401/test_stream_dispatch.py
Normal file
39
tests/20241401/test_stream_dispatch.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import os, sys

# Append two parent directories to sys.path so crawl4ai can be imported
# when this test is run directly from a repository checkout.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)


import asyncio

from crawl4ai import *
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher


async def test_streaming():
    """Drive MemoryAdaptiveDispatcher.run_urls_stream directly and print
    each result as it arrives, rather than going through arun_many."""
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(),
    )

    urls = ["http://example.com"] * 10

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Cap concurrency at 5 sessions; re-check memory pressure every 0.5s.
        dispatcher = MemoryAdaptiveDispatcher(
            max_session_permit=5,
            check_interval=0.5,
        )

        # NOTE(review): this loop reads result.url but result.result.success,
        # which suggests run_urls_stream yields a wrapper object holding the
        # CrawlResult under .result — confirm against the dispatcher's API.
        async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):
            print(f"Got result for {result.url} - Success: {result.result.success}")


if __name__ == "__main__":
    asyncio.run(test_streaming())
|
||||
Reference in New Issue
Block a user