crawl4ai/tests/pipeline/test_batch_crawl.py
feat(pipeline): add high-level Crawler utility class for simplified web crawling (UncleCode, commit 72d8e679ad, 2025-04-07 22:50:44 +08:00)
Add new Crawler class that provides a simplified interface for both single and batch URL crawling operations. Key features include:
- Simple single URL crawling with configurable options
- Parallel batch crawling with concurrency control
- Shared browser hub support for resource efficiency
- Progress tracking and custom retry strategies
- Comprehensive error handling and retry logic

Remove demo and extended test files in favor of a new, focused test suite.
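For orientation, here is a minimal usage sketch of the interface exercised by the test suite below. It is a sketch only: the call shapes and the dict-keyed-by-URL return value are taken from the tests, the example URLs are arbitrary, and features beyond what the tests cover (single-URL calls, retry strategies, progress tracking) are not shown.

# Minimal usage sketch, derived from the tests in this file (not the full Crawler API).
import asyncio

from crawl4ai import Crawler, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode


async def main():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,      # always fetch fresh pages
        wait_until="domcontentloaded",    # don't wait for full network idle
    )

    # Batch crawl: a list of URLs yields a dict keyed by URL.
    results = await Crawler.crawl(
        ["https://example.com", "https://www.python.org"],
        crawler_config=config,
    )
    for url, result in results.items():
        print(url, result.success, len(result.html or ""))

    # Parallel variant: same call shape, concurrency handled by the Crawler.
    results = await Crawler.parallel_crawl(
        ["https://httpbin.org/html", "https://httpbin.org/ip"],
        crawler_config=config,
    )
    print(sum(1 for r in results.values() if r.success), "of", len(results), "succeeded")


asyncio.run(main())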
163 lines, 5.1 KiB, Python

"""Test the Crawler class for batch crawling capabilities."""
import asyncio
import pytest
from typing import List, Dict, Any, Optional, Tuple
from crawl4ai import Crawler
from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResult, CrawlResultContainer
from crawl4ai.browser import BrowserHub
from crawl4ai.cache_context import CacheMode

# Test URLs for crawling
SAFE_URLS = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpbin.org/user-agent",
    "https://httpstat.us/200",
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/comments/1",
    "https://iana.org",
    "https://www.python.org"
]


# Simple test for batch crawling
@pytest.mark.asyncio
async def test_batch_crawl_simple():
    """Test simple batch crawling with multiple URLs."""
    # Use a few test URLs
    urls = SAFE_URLS[:3]

    # Custom crawler config
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded"
    )

    # Crawl multiple URLs using batch crawl
    results = await Crawler.crawl(
        urls,
        crawler_config=crawler_config
    )

    # Verify the results
    assert isinstance(results, dict)
    assert len(results) == len(urls)
    for url in urls:
        assert url in results
        assert results[url].success
        assert results[url].html is not None


# Test parallel batch crawling
@pytest.mark.asyncio
async def test_parallel_batch_crawl():
    """Test parallel batch crawling with multiple URLs."""
    # Use several URLs for parallel crawling
    urls = SAFE_URLS[:5]

    # Basic crawler config
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded"
    )

    # Crawl in parallel
    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl(
        urls,
        crawler_config=crawler_config
    )
    end_time = asyncio.get_event_loop().time()

    # Verify results
    assert len(results) == len(urls)
    successful = sum(1 for r in results.values() if r.success)
    print(f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
    print(f"Success rate: {successful}/{len(urls)}")

    # At least 80% should succeed
    assert successful / len(urls) >= 0.8


# Test batch crawling with different configurations
@pytest.mark.asyncio
async def test_batch_crawl_mixed_configs():
    """Test batch crawling with different configurations for different URLs."""
    # Create URL batches with different configurations
    batch1 = (SAFE_URLS[:2], CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False))
    batch2 = (SAFE_URLS[2:4], CrawlerRunConfig(wait_until="networkidle", screenshot=True))

    # Crawl with mixed configurations
    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl([batch1, batch2])
    end_time = asyncio.get_event_loop().time()

    # Extract all URLs
    all_urls = batch1[0] + batch2[0]

    # Verify results
    assert len(results) == len(all_urls)

    # Check that screenshots are present only for batch2
    for url in batch1[0]:
        assert results[url].screenshot is None
    for url in batch2[0]:
        assert results[url].screenshot is not None

    print(f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s")


# Test shared browser hub
@pytest.mark.asyncio
async def test_batch_crawl_shared_hub():
    """Test batch crawling with a shared browser hub."""
    # Create and initialize a browser hub
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True
    )
    browser_hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        max_browsers_per_config=3,
        max_pages_per_browser=4,
        initial_pool_size=1
    )

    try:
        # Use the hub for parallel crawling
        urls = SAFE_URLS[:3]
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(
            urls,
            browser_hub=browser_hub,
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                wait_until="domcontentloaded"
            )
        )
        end_time = asyncio.get_event_loop().time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)
        print(f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
        print(f"Success rate: {successful}/{len(urls)}")

        # Get browser hub statistics
        hub_stats = await browser_hub.get_pool_status()
        print(f"Browser hub stats: {hub_stats}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8
    finally:
        # Clean up the browser hub
        await browser_hub.close()