Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities.

Changes include:
- Added RateLimitConfig dataclass for configuring rate limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior.
39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
import pytest
|
|
import asyncio
|
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
|
from crawl4ai.dispatcher import DisplayMode
|
|
|
|
@pytest.mark.asyncio
async def test_crawler_with_dispatcher():
    """Run ``arun_many`` over placeholder URLs with dispatcher settings enabled.

    Builds five ``example.com`` page URLs, a headless browser config, and a
    run config that turns on rate limiting and sets the memory threshold,
    check interval, session permit and display mode. The test then checks
    that one non-None result comes back per URL and that each result reports
    ``success`` as falsy, because the placeholder pages are expected to fail.
    """
    # Placeholder targets; the pages are not expected to exist.
    urls = []
    for i in range(5):
        urls.append(f"https://example.com/page_{i}")

    # Browser settings: headless and quiet.
    browser_config = BrowserConfig(headless=True, verbose=False)

    # Rate-limit settings, built on their own so the run config stays readable.
    rate_limits = RateLimitConfig(
        base_delay=(1.0, 2.0),
        max_delay=30.0,
        max_retries=2,
        rate_limit_codes=[429, 503],
    )

    # Run settings carrying the dispatcher-related options.
    run_config = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=rate_limits,
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
        display_mode=DisplayMode.DETAILED.value,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)

        # Basic validation: exactly one result per requested URL.
        assert len(results) == len(urls)
        for outcome in results:
            assert outcome is not None
            # The example.com URLs are fake, so every crawl is expected to fail.
            assert not outcome.success
|