Files
crawl4ai/tests/pipeline/test_crawler.py
UncleCode 72d8e679ad feat(pipeline): add high-level Crawler utility class for simplified web crawling
Add new Crawler class that provides a simplified interface for both single and batch URL crawling operations. Key features include:
- Simple single URL crawling with configurable options
- Parallel batch crawling with concurrency control
- Shared browser hub support for resource efficiency
- Progress tracking and custom retry strategies
- Comprehensive error handling and retry logic

Remove demo and extended test files in favor of new focused test suite.
2025-04-07 22:50:44 +08:00

448 lines
15 KiB
Python

# test_crawler.py
import asyncio
import warnings
import pytest
import pytest_asyncio
from typing import Optional, Tuple
# Define test fixtures
@pytest_asyncio.fixture
async def clean_browser_hub():
    """Shut down every browser hub once the requesting test has finished.

    The fixture performs no setup: it yields immediately and does all of its
    work afterwards. An exception raised while shutting down is printed
    instead of being propagated.
    """
    yield

    # Teardown phase: everything below runs after the test body.
    from crawl4ai.browser import BrowserHub

    try:
        await BrowserHub.shutdown_all()
    except Exception as cleanup_error:
        print(f"Error during browser cleanup: {cleanup_error}")
from crawl4ai import Crawler
from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.browser import BrowserHub
from crawl4ai.cache_context import CacheMode
import warnings
from pydantic import PydanticDeprecatedSince20
# URLs the tests crawl. Tests take prefixes/slices of this list, so the
# order of the entries matters.
SAFE_URLS = [
    "https://example.com",
    # httpbin.org endpoints
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpbin.org/user-agent",
    # httpstat.us endpoint
    "https://httpstat.us/200",
    # jsonplaceholder.typicode.com endpoints
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/comments/1",
    # Other sites
    "https://iana.org",
    "https://www.python.org",
]
class TestCrawlerBasic:
    """Basic tests for the Crawler utility class.

    Each test requests the ``clean_browser_hub`` fixture and exercises
    ``Crawler.crawl`` with either a single URL or a list of URLs.
    """

    @pytest.mark.asyncio
    async def test_simple_crawl_single_url(self, clean_browser_hub):
        """Crawl a single URL with the default configuration."""
        with warnings.catch_warnings():
            # Ignore all warning categories while crawling; the previous
            # filter state is restored when the ``with`` block exits.
            warnings.filterwarnings("ignore", category=Warning)

            url = "https://example.com"
            result = await Crawler.crawl(url)

            # A single URL is expected to yield a single result container.
            assert isinstance(result, CrawlResultContainer)
            assert result.success
            assert result.url == url
            assert result.html is not None
            assert len(result.html) > 0

    @pytest.mark.asyncio
    async def test_crawl_with_custom_config(self, clean_browser_hub):
        """Crawl with explicit browser and crawler configuration objects."""
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1280,
            viewport_height=800,
        )
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            screenshot=True,
        )

        url = "https://httpbin.org/html"
        result = await Crawler.crawl(
            url, browser_config=browser_config, crawler_config=crawler_config
        )

        assert result.success
        assert result.url == url
        # screenshot=True was requested, so a screenshot must be attached.
        assert result.screenshot is not None

    @pytest.mark.asyncio
    async def test_crawl_multiple_urls_sequential(self, clean_browser_hub):
        """Crawl a list of URLs through ``Crawler.crawl``."""
        urls = SAFE_URLS[:3]
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        results = await Crawler.crawl(urls, crawler_config=crawler_config)

        # A list of URLs is expected to yield a dict keyed by URL.
        assert isinstance(results, dict)
        assert len(results) == len(urls)
        for url in urls:
            assert url in results
            assert results[url].success
            assert results[url].html is not None

    @pytest.mark.asyncio
    async def test_crawl_with_error_handling(self, clean_browser_hub):
        """Check that a failing URL is reported without affecting a valid one."""
        # One valid URL and one that should not resolve.
        urls = ["https://example.com", "https://non-existent-domain-123456789.com"]

        results = await Crawler.crawl(urls, max_retries=2, retry_delay=1.0)

        assert len(results) == 2

        # The valid URL should succeed.
        assert results[urls[0]].success

        # The invalid URL should still be present, flagged as failed and
        # carrying an error message.
        assert urls[1] in results
        assert not results[urls[1]].success
        assert results[urls[1]].error_message is not None
class TestCrawlerParallel:
    """Tests for the parallel crawling capabilities of Crawler.

    Each test calls ``Crawler.parallel_crawl`` and times the call with the
    running event loop's monotonic clock. Timing and success rate are printed
    for information only; the assertions cover result counts and a minimum
    success ratio.
    """

    @pytest.mark.asyncio
    async def test_parallel_crawl_simple(self, clean_browser_hub):
        """Crawl several URLs in parallel with one shared configuration."""
        urls = SAFE_URLS[:5]
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # No ``concurrency`` argument: the crawler's default is used.
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(urls, crawler_config=crawler_config)
        elapsed = loop.time() - start_time

        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(f"Parallel crawl of {len(urls)} URLs completed in {elapsed:.2f}s")
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed.
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_concurrency_limit(self, clean_browser_hub):
        """Crawl in parallel with an explicit concurrency limit."""
        # More URLs than the limit, so the limit actually comes into play.
        urls = SAFE_URLS[:8]
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )
        concurrency = 2

        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            urls, crawler_config=crawler_config, concurrency=concurrency
        )
        elapsed = loop.time() - start_time

        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Parallel crawl with concurrency={concurrency} of {len(urls)} URLs completed in {elapsed:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed.
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_different_configs(self, clean_browser_hub):
        """Crawl batches of URLs, each batch with its own crawler config."""
        # Each batch is a (urls, crawler_config) pair.
        batch1 = (
            SAFE_URLS[:2],
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
        )
        batch2 = (
            SAFE_URLS[2:4],
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
        )
        batch3 = (
            SAFE_URLS[4:6],
            CrawlerRunConfig(wait_until="load", scan_full_page=True),
        )

        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl([batch1, batch2, batch3])
        elapsed = loop.time() - start_time

        all_urls = batch1[0] + batch2[0] + batch3[0]
        assert len(results) == len(all_urls)

        # Screenshots must be present only for batch2, the one batch that
        # requested them; batch1 and batch3 must not carry any.
        for url in batch1[0]:
            assert not results[url].screenshot
        for url in batch2[0]:
            assert results[url].screenshot
        for url in batch3[0]:
            assert not results[url].screenshot

        print(
            f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {elapsed:.2f}s"
        )

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_shared_browser_hub(self, clean_browser_hub):
        """Crawl in parallel through an explicitly created, shared browser hub."""
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        browser_hub = await BrowserHub.get_browser_manager(
            config=browser_config,
            max_browsers_per_config=3,
            max_pages_per_browser=4,
            initial_pool_size=1,
        )

        try:
            urls = SAFE_URLS[:6]

            loop = asyncio.get_running_loop()
            start_time = loop.time()
            results = await Crawler.parallel_crawl(
                urls,
                browser_hub=browser_hub,
                crawler_config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
                ),
            )
            elapsed = loop.time() - start_time

            assert len(results) == len(urls)
            successful = sum(1 for r in results.values() if r.success)

            print(
                f"Shared hub parallel crawl of {len(urls)} URLs completed in {elapsed:.2f}s"
            )
            print(f"Success rate: {successful}/{len(urls)}")

            # Printed for diagnostics only; not asserted on.
            hub_stats = await browser_hub.get_pool_status()
            print(f"Browser hub stats: {hub_stats}")

            # At least 80% should succeed.
            assert successful / len(urls) >= 0.8
        finally:
            # Close the hub even when an assertion above fails.
            await browser_hub.close()
class TestCrawlerAdvanced:
    """Advanced tests for the Crawler utility class.

    Covers per-batch browser/crawler configuration, progress callbacks,
    custom retry strategies and a larger batch of URLs, all through
    ``Crawler.parallel_crawl``.
    """

    @pytest.mark.asyncio
    async def test_crawl_with_customized_batch_config(self, clean_browser_hub):
        """Crawl batches that each carry their own browser and crawler config."""
        browser_config1 = BrowserConfig(browser_type="chromium", headless=True)
        # NOTE(review): headless=False presumably needs a display to be
        # available; confirm this is intended for unattended/CI runs.
        browser_config2 = BrowserConfig(
            browser_type="chromium", headless=False, viewport_width=1920
        )

        crawler_config1 = CrawlerRunConfig(wait_until="domcontentloaded")
        crawler_config2 = CrawlerRunConfig(wait_until="networkidle", screenshot=True)

        # Each batch is a (urls, browser_config, crawler_config) triple.
        batch1 = (SAFE_URLS[:2], browser_config1, crawler_config1)
        batch2 = (SAFE_URLS[2:4], browser_config2, crawler_config2)

        results = await Crawler.parallel_crawl([batch1, batch2])

        all_urls = batch1[0] + batch2[0]
        assert len(results) == len(all_urls)

        # Only batch2 requested screenshots.
        for url in batch1[0]:
            assert results[url].screenshot is None
        for url in batch2[0]:
            assert results[url].screenshot is not None

    @pytest.mark.asyncio
    async def test_crawl_with_progress_callback(self, clean_browser_hub):
        """Check that the progress callback sees one start and one completion per URL."""
        urls = SAFE_URLS[:5]

        # Counters and event log filled in by the callback below.
        progress_data = {"started": 0, "completed": 0, "failed": 0, "updates": []}

        async def on_progress(
            status: str, url: str, result: Optional[CrawlResultContainer] = None
        ):
            """Record one progress event reported for ``url``."""
            if status == "started":
                progress_data["started"] += 1
            elif status == "completed":
                progress_data["completed"] += 1
                # ``result`` defaults to None, so guard before reading
                # ``success`` to avoid an AttributeError inside the callback.
                if result is not None and not result.success:
                    progress_data["failed"] += 1

            # Every event is logged, whatever its status.
            progress_data["updates"].append((status, url))
            print(f"Progress: {status} - {url}")

        await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            progress_callback=on_progress,
        )

        assert progress_data["started"] == len(urls)
        assert progress_data["completed"] == len(urls)
        # One "started" plus one "completed" event per URL.
        assert len(progress_data["updates"]) == len(urls) * 2

    @pytest.mark.asyncio
    async def test_crawl_with_dynamic_retry_strategy(self, clean_browser_hub):
        """Crawl with a caller-supplied retry strategy."""
        # Includes URLs that might fail.
        urls = [
            "https://example.com",
            "https://httpstat.us/500",
            "https://httpstat.us/404",
        ]

        async def retry_strategy(
            url: str, attempt: int, error: Exception
        ) -> Tuple[bool, float]:
            """Return ``(should_retry, delay_seconds)`` for a failed attempt.

            Only URLs containing "500" are retried (after 1 second);
            ``attempt`` and ``error`` are accepted but not consulted.
            """
            if "500" in url:
                return True, 1.0
            return False, 0.0

        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            retry_strategy=retry_strategy,
            max_retries=3,
        )

        assert len(results) == len(urls)

        # example.com should succeed.
        assert results[urls[0]].success

        # httpstat.us pages return content even for error status codes, so
        # the crawler marks them as successful since it got HTML content.
        # Verify the reported status codes instead of ``success``.
        assert results[urls[1]].status_code == 500
        assert results[urls[2]].status_code == 404

    @pytest.mark.asyncio
    async def test_crawl_with_very_large_batch(self, clean_browser_hub):
        """Crawl every URL in ``SAFE_URLS`` in one parallel batch."""
        # Results are looked up by URL, so the batch must hold unique URLs
        # for the length assertion below to hold. Repeating a slice and then
        # de-duplicating only yields the slice itself, so use the whole list
        # (10 unique URLs); dict.fromkeys drops duplicates and keeps order.
        large_batch = list(dict.fromkeys(SAFE_URLS))

        concurrency = 10

        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            large_batch,
            crawler_config=CrawlerRunConfig(
                wait_until="domcontentloaded",
                page_timeout=10000,  # Shorter timeout for large batch
            ),
            concurrency=concurrency,
        )
        elapsed = loop.time() - start_time

        assert len(results) == len(large_batch)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Large batch crawl of {len(large_batch)} URLs completed in {elapsed:.2f}s"
        )
        print(f"Success rate: {successful}/{len(large_batch)}")
        print(f"Average time per URL: {elapsed / len(large_batch):.2f}s")

        # At least 80% should succeed.
        assert successful / len(results) >= 0.8
if __name__ == "__main__":
    # Run this file through pytest (-x: stop at the first failure,
    # -v: verbose, -s: do not capture output) and propagate pytest's exit
    # code, so running the script directly fails when the tests fail.
    raise SystemExit(pytest.main(["-xvs", __file__]))