refactor(dispatcher): migrate to modular dispatcher system with enhanced monitoring
Reorganize dispatcher functionality into separate components:

- Create dedicated dispatcher classes (MemoryAdaptiveDispatcher, SemaphoreDispatcher)
- Add RateLimiter for smart request throttling
- Implement CrawlerMonitor for real-time progress tracking
- Move dispatcher configuration out of CrawlerRunConfig into the dispatcher classes

BREAKING CHANGE: Dispatcher configuration has moved from CrawlerRunConfig to the dedicated dispatcher classes. Users must update their configuration approach for multi-URL crawling; see the migration sketch below.
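For migration, a minimal before/after sketch assembled from the APIs exercised in the two test files in this diff. Parameter values are illustrative only, and the snippet assumes it runs inside an `async with AsyncWebCrawler(...) as crawler:` block with `urls` already defined:

    # Before: concurrency and rate limiting were fields on CrawlerRunConfig
    # (imports per the removed test file below:
    #  from crawl4ai.async_configs import CrawlerRunConfig, RateLimitConfig)
    run_config = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2,
            rate_limit_codes=[429, 503]
        ),
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
    )
    results = await crawler.arun_many(urls, config=run_config)

    # After: the same knobs live on a dispatcher object passed to arun_many()
    # (imports per the added test file below:
    #  from crawl4ai import MemoryAdaptiveDispatcher, RateLimiter,
    #                       CrawlerMonitor, DisplayMode)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
        rate_limiter=RateLimiter(
            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2,
            rate_limit_codes=[429, 503]
        ),
        monitor=CrawlerMonitor(max_visible_rows=5, display_mode=DisplayMode.DETAILED),  # optional
    )
    results = await crawler.arun_many(urls, config=CrawlerRunConfig(), dispatcher=dispatcher)

SemaphoreDispatcher(semaphore_count=n, rate_limiter=...) is the fixed-concurrency alternative; per the tests below, both dispatcher classes accept the same rate_limiter argument.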
tests/async/test_dispatchers.py (new file, 147 lines)
@@ -0,0 +1,147 @@
import pytest
import asyncio, time
from crawl4ai import (
    AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
    MemoryAdaptiveDispatcher, SemaphoreDispatcher,
    RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
)

@pytest.fixture
def browser_config():
    """Headless browser shared by all dispatcher tests."""
    return BrowserConfig(headless=True, verbose=False)


@pytest.fixture
def run_config():
    """Bypass the cache so every test performs real fetches."""
    return CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)


@pytest.fixture
def test_urls():
    return [
        "http://example.com",
        "http://example.com/page1",
        "http://example.com/page2"
    ]

@pytest.mark.asyncio
class TestDispatchStrategies:

    async def test_memory_adaptive_basic(self, browser_config, run_config, test_urls):
        """MemoryAdaptiveDispatcher completes all URLs under a sane memory threshold."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=70.0,
                max_session_permit=2,
                check_interval=0.1
            )
            results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
            assert len(results) == len(test_urls)
            assert all(r.success for r in results)

    async def test_memory_adaptive_with_rate_limit(self, browser_config, run_config, test_urls):
        """A RateLimiter can be attached to the memory-adaptive dispatcher."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=70.0,
                max_session_permit=2,
                check_interval=0.1,
                rate_limiter=RateLimiter(
                    base_delay=(0.1, 0.2),
                    max_delay=1.0,
                    max_retries=2
                )
            )
            results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
            assert len(results) == len(test_urls)
            assert all(r.success for r in results)

    async def test_semaphore_basic(self, browser_config, run_config, test_urls):
        """SemaphoreDispatcher caps concurrency at a fixed session count."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = SemaphoreDispatcher(semaphore_count=2)
            results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
            assert len(results) == len(test_urls)
            assert all(r.success for r in results)

    async def test_semaphore_with_rate_limit(self, browser_config, run_config, test_urls):
        """The RateLimiter also composes with the semaphore dispatcher."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = SemaphoreDispatcher(
                semaphore_count=2,
                rate_limiter=RateLimiter(
                    base_delay=(0.1, 0.2),
                    max_delay=1.0,
                    max_retries=2
                )
            )
            results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
            assert len(results) == len(test_urls)
            assert all(r.success for r in results)

    async def test_memory_adaptive_memory_error(self, browser_config, run_config, test_urls):
        """An unreachable memory threshold raises MemoryError once the wait timeout expires."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=1.0,  # unrealistically low threshold
                max_session_permit=2,
                check_interval=0.1,
                memory_wait_timeout=1.0  # short timeout so the test fails fast
            )
            with pytest.raises(MemoryError):
                await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)

    async def test_empty_urls(self, browser_config, run_config):
        """An empty URL list yields an empty result list, not an error."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
            results = await crawler.arun_many([], config=run_config, dispatcher=dispatcher)
            assert len(results) == 0

    async def test_single_url(self, browser_config, run_config):
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
            results = await crawler.arun_many(["http://example.com"], config=run_config, dispatcher=dispatcher)
            assert len(results) == 1
            assert results[0].success

    async def test_invalid_urls(self, browser_config, run_config):
        """Unresolvable hosts produce a failed result rather than an exception."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
            results = await crawler.arun_many(["http://invalid.url.that.doesnt.exist"], config=run_config, dispatcher=dispatcher)
            assert len(results) == 1
            assert not results[0].success

    async def test_rate_limit_backoff(self, browser_config, run_config):
        """Treating HTTP 200 as a rate-limit code forces the backoff path deterministically."""
        urls = ["http://example.com"] * 5  # multiple requests to the same domain
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                max_session_permit=2,
                rate_limiter=RateLimiter(
                    base_delay=(0.1, 0.2),
                    max_delay=1.0,
                    max_retries=2,
                    rate_limit_codes=[200]  # force rate limiting for testing
                )
            )
            start_time = time.time()
            results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
            duration = time.time() - start_time
            assert len(results) == len(urls)
            assert duration > 1.0  # rate limiting must have introduced delays

    async def test_monitor_integration(self, browser_config, run_config, test_urls):
        """CrawlerMonitor records per-URL stats, including completion times."""
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(max_visible_rows=5, display_mode=DisplayMode.DETAILED)
            dispatcher = MemoryAdaptiveDispatcher(
                max_session_permit=2,
                monitor=monitor
            )
            results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
            assert len(results) == len(test_urls)
            # Check monitor stats
            assert len(monitor.stats) == len(test_urls)
            assert all(stat.end_time is not None for stat in monitor.stats.values())

if __name__ == "__main__":
    pytest.main([__file__, "-v", "--asyncio-mode=auto"])
@@ -1,38 +0,0 @@
import pytest
import asyncio
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
from crawl4ai.dispatcher import DisplayMode


@pytest.mark.asyncio
async def test_crawler_with_dispatcher():
    # Create test URLs
    urls = [f"https://example.com/page_{i}" for i in range(5)]

    # Configure browser
    browser_config = BrowserConfig(headless=True, verbose=False)

    # Configure crawler with rate limiting
    run_config = CrawlerRunConfig(
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 2.0),
            max_delay=30.0,
            max_retries=2,
            rate_limit_codes=[429, 503]
        ),
        memory_threshold_percent=70.0,
        check_interval=0.5,
        max_session_permit=3,
        display_mode=DisplayMode.DETAILED.value
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=run_config)

        # Basic validation
        assert len(results) == len(urls)
        for result in results:
            assert result is not None
            # Note: example.com URLs will fail, which is expected for this test
            assert not result.success  # We expect these to fail since they're fake URLs