feat(crawler): add memory-adaptive dispatcher with rate limiting
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities.

Changes include:
- Added RateLimitConfig dataclass for configuring rate limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior.
This commit is contained in:
38
tests/test_dispatcher.py
Normal file
38
tests/test_dispatcher.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||
from crawl4ai.dispatcher import DisplayMode
|
||||
|
||||
@pytest.mark.asyncio
async def test_crawler_with_dispatcher():
    """Run ``arun_many`` over five example.com URLs with rate limiting enabled.

    Builds a headless browser configuration and a run configuration that
    turns on rate limiting, a memory threshold, a session-permit cap and the
    detailed display mode, then checks that one result comes back per URL
    and that every result is present and reports failure.
    """
    # Target pages for the batch crawl.
    target_urls = [f"https://example.com/page_{i}" for i in range(5)]

    # Headless, quiet browser.
    browser_settings = BrowserConfig(headless=True, verbose=False)

    # Rate-limit policy, kept in its own local so the run config reads flat.
    rate_limits = RateLimitConfig(
        base_delay=(1.0, 2.0),
        max_delay=30.0,
        max_retries=2,
        rate_limit_codes=[429, 503],
    )
    crawl_settings = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=rate_limits,
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
        display_mode=DisplayMode.DETAILED.value,
    )

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        crawl_results = await crawler.arun_many(target_urls, config=crawl_settings)

        # Basic validation: exactly one result per requested URL.
        assert len(crawl_results) == len(target_urls)
        for crawl_result in crawl_results:
            assert crawl_result is not None
            # The test's stated expectation is that these example.com pages
            # do not exist, so every crawl should be reported as unsuccessful.
            assert not crawl_result.success
|
||||
Reference in New Issue
Block a user