Files
crawl4ai/tests/test_dispatcher.py
UncleCode ac5f461d40 feat(crawler): add memory-adaptive dispatcher with rate limiting
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities. Changes include:

- Added RateLimitConfig dataclass for configuring rate limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior
2025-01-10 16:01:18 +08:00

39 lines
1.3 KiB
Python

import pytest
import asyncio
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode
@pytest.mark.asyncio
async def test_crawler_with_dispatcher():
    """Smoke-test arun_many with the memory-adaptive dispatcher and rate limiting.

    The target URLs are fake example.com pages, so every individual crawl is
    expected to fail; the test only verifies that the dispatcher yields one
    result object per input URL and that each failed crawl is reported as such.
    """
    # Five fabricated URLs to fan out over the dispatcher.
    page_urls = [f"https://example.com/page_{idx}" for idx in range(5)]

    browser_cfg = BrowserConfig(headless=True, verbose=False)

    # Rate-limit policy: randomized base delay, capped backoff, two retries
    # on the usual throttling status codes.
    limiter = RateLimitConfig(
        base_delay=(1.0, 2.0),
        max_delay=30.0,
        max_retries=2,
        rate_limit_codes=[429, 503],
    )

    # Dispatcher-related knobs live on the run config: memory ceiling,
    # polling interval, concurrency cap, and the progress display mode.
    crawl_cfg = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=limiter,
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
        display_mode=DisplayMode.DETAILED.value,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(page_urls, config=crawl_cfg)

        # Exactly one result per submitted URL.
        assert len(results) == len(page_urls)
        for outcome in results:
            assert outcome is not None
            # The fake example.com pages cannot be crawled successfully,
            # so every result should report failure.
            assert not outcome.success