Add new Crawler class that provides a simplified interface for both single and batch URL crawling operations. Key features include: - Simple single URL crawling with configurable options - Parallel batch crawling with concurrency control - Shared browser hub support for resource efficiency - Progress tracking and custom retry strategies - Comprehensive error handling and retry logic Remove demo and extended test files in favor of new focused test suite.
448 lines · 15 KiB · Python
# test_crawler.py
"""Test suite for the Crawler utility class: single-URL, batch, and
parallel crawling, including error handling and retry behavior."""

import asyncio
import warnings

import pytest
import pytest_asyncio

from typing import Optional, Tuple
|
# Define test fixtures
@pytest_asyncio.fixture
async def clean_browser_hub():
    """Fixture to ensure clean browser hub state between tests.

    Yields control to the test body, then asks BrowserHub to shut down
    all managed instances so browser state cannot leak between tests.
    """
    # Yield control to the test
    yield

    # After test, cleanup all browser hubs.
    # Imported lazily so test collection does not require the browser stack.
    from crawl4ai.browser import BrowserHub

    try:
        await BrowserHub.shutdown_all()
    except Exception as e:
        # Best-effort cleanup: report the failure but never fail the test run.
        print(f"Error during browser cleanup: {e}")
|
# Project imports: the Crawler facade under test plus its configuration,
# result, and browser-pool types.
from crawl4ai import Crawler
from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.browser import BrowserHub
from crawl4ai.cache_context import CacheMode

import warnings  # NOTE(review): duplicate of the earlier `import warnings`; harmless but removable
from pydantic import PydanticDeprecatedSince20  # NOTE(review): never referenced below — presumably intended for a warnings filter; confirm before removing
|
# Test URLs for crawling.
# These are live public endpoints, so the suite requires network access;
# all are chosen to be small and fast to fetch.
SAFE_URLS = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpbin.org/user-agent",
    "https://httpstat.us/200",
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/comments/1",
    "https://iana.org",
    "https://www.python.org",
]
|
class TestCrawlerBasic:
    """Basic tests for the Crawler utility class."""

    @pytest.mark.asyncio
    async def test_simple_crawl_single_url(self, clean_browser_hub):
        """Test crawling a single URL with default configuration."""
        with warnings.catch_warnings():
            # Third-party deprecation noise is irrelevant to this test.
            warnings.filterwarnings("ignore", category=Warning)

            # Basic single URL crawl with default configuration
            url = "https://example.com"
            result = await Crawler.crawl(url)

            # A single-URL crawl returns one result container.
            assert isinstance(result, CrawlResultContainer)
            assert result.success
            assert result.url == url
            assert result.html is not None
            assert len(result.html) > 0

    @pytest.mark.asyncio
    async def test_crawl_with_custom_config(self, clean_browser_hub):
        """Test crawling with custom browser and crawler configuration."""
        # Custom browser config
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1280,
            viewport_height=800,
        )

        # Custom crawler config: bypass cache so the page is fetched live,
        # and request a screenshot so we can assert it was captured.
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="networkidle", screenshot=True
        )

        # Crawl with custom configuration
        url = "https://httpbin.org/html"
        result = await Crawler.crawl(
            url, browser_config=browser_config, crawler_config=crawler_config
        )

        # Verify the result, including the requested screenshot.
        assert result.success
        assert result.url == url
        assert result.screenshot is not None

    @pytest.mark.asyncio
    async def test_crawl_multiple_urls_sequential(self, clean_browser_hub):
        """Test crawling multiple URLs sequentially."""
        # Use a few test URLs
        urls = SAFE_URLS[:3]

        # Custom crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Passing a list of URLs returns a mapping of url -> result.
        results = await Crawler.crawl(urls, crawler_config=crawler_config)

        # Verify the results
        assert isinstance(results, dict)
        assert len(results) == len(urls)

        for url in urls:
            assert url in results
            assert results[url].success
            assert results[url].html is not None

    @pytest.mark.asyncio
    async def test_crawl_with_error_handling(self, clean_browser_hub):
        """Test error handling during crawling."""
        # Include a valid URL and a non-existent URL
        urls = ["https://example.com", "https://non-existent-domain-123456789.com"]

        # Crawl with retries
        results = await Crawler.crawl(urls, max_retries=2, retry_delay=1.0)

        # Both URLs must appear in the results, even the failing one.
        assert len(results) == 2

        # Valid URL should succeed
        assert results[urls[0]].success

        # Invalid URL should fail but still carry an error message.
        assert urls[1] in results
        assert not results[urls[1]].success
        assert results[urls[1]].error_message is not None
|
class TestCrawlerParallel:
    """Tests for the parallel crawling capabilities of Crawler."""

    @pytest.mark.asyncio
    async def test_parallel_crawl_simple(self, clean_browser_hub):
        """Test basic parallel crawling with same configuration."""
        # Use several URLs for parallel crawling
        urls = SAFE_URLS[:5]

        # Basic crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Crawl in parallel with default concurrency, timing the run.
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(urls, crawler_config=crawler_config)
        end_time = asyncio.get_event_loop().time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed (tolerates transient network flakiness).
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_concurrency_limit(self, clean_browser_hub):
        """Test parallel crawling with concurrency limit."""
        # Use more URLs to test concurrency control
        urls = SAFE_URLS[:8]

        # Custom crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Limited concurrency
        concurrency = 2

        # Time the crawl
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(
            urls, crawler_config=crawler_config, concurrency=concurrency
        )
        end_time = asyncio.get_event_loop().time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Parallel crawl with concurrency={concurrency} of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_different_configs(self, clean_browser_hub):
        """Test parallel crawling with different configurations for different URLs."""
        # Create URL batches, each a (urls, crawler_config) tuple.
        batch1 = (
            SAFE_URLS[:2],
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
        )
        batch2 = (
            SAFE_URLS[2:4],
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
        )
        batch3 = (
            SAFE_URLS[4:6],
            CrawlerRunConfig(wait_until="load", scan_full_page=True),
        )

        # Crawl with mixed configurations
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl([batch1, batch2, batch3])
        end_time = asyncio.get_event_loop().time()

        # Extract all URLs
        all_urls = batch1[0] + batch2[0] + batch3[0]

        # Verify results
        assert len(results) == len(all_urls)

        # Screenshots must be present only for batch2 (the only batch
        # that requested them).
        for url in batch1[0]:
            assert not results[url].screenshot

        for url in batch2[0]:
            assert results[url].screenshot

        print(
            f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s"
        )

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_shared_browser_hub(self, clean_browser_hub):
        """Test parallel crawling with a shared browser hub."""
        # Create and initialize a browser hub
        browser_config = BrowserConfig(browser_type="chromium", headless=True)

        browser_hub = await BrowserHub.get_browser_manager(
            config=browser_config,
            max_browsers_per_config=3,
            max_pages_per_browser=4,
            initial_pool_size=1,
        )

        try:
            # Use the hub for parallel crawling
            urls = SAFE_URLS[:6]

            start_time = asyncio.get_event_loop().time()
            results = await Crawler.parallel_crawl(
                urls,
                browser_hub=browser_hub,
                crawler_config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
                ),
            )
            end_time = asyncio.get_event_loop().time()

            # Verify results
            assert len(results) == len(urls)
            successful = sum(1 for r in results.values() if r.success)

            print(
                f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
            )
            print(f"Success rate: {successful}/{len(urls)}")

            # Get browser hub statistics
            hub_stats = await browser_hub.get_pool_status()
            print(f"Browser hub stats: {hub_stats}")

            # At least 80% should succeed
            assert successful / len(urls) >= 0.8

        finally:
            # Clean up the browser hub even if assertions fail.
            await browser_hub.close()
|
class TestCrawlerAdvanced:
    """Advanced tests for the Crawler utility class."""

    @pytest.mark.asyncio
    async def test_crawl_with_customized_batch_config(self, clean_browser_hub):
        """Test crawling with fully customized batch configuration."""
        # Create URL batches with different browser and crawler configurations
        browser_config1 = BrowserConfig(browser_type="chromium", headless=True)
        browser_config2 = BrowserConfig(
            browser_type="chromium", headless=False, viewport_width=1920
        )

        crawler_config1 = CrawlerRunConfig(wait_until="domcontentloaded")
        crawler_config2 = CrawlerRunConfig(wait_until="networkidle", screenshot=True)

        # Each batch is a (urls, browser_config, crawler_config) tuple.
        batch1 = (SAFE_URLS[:2], browser_config1, crawler_config1)
        batch2 = (SAFE_URLS[2:4], browser_config2, crawler_config2)

        # Crawl with mixed configurations
        results = await Crawler.parallel_crawl([batch1, batch2])

        # Extract all URLs
        all_urls = batch1[0] + batch2[0]

        # Verify results
        assert len(results) == len(all_urls)

        # Verify batch-specific processing
        for url in batch1[0]:
            assert results[url].screenshot is None  # No screenshots for batch1

        for url in batch2[0]:
            assert results[url].screenshot is not None  # Should have screenshots for batch2

    @pytest.mark.asyncio
    async def test_crawl_with_progress_callback(self, clean_browser_hub):
        """Test crawling with progress callback."""
        # Use several URLs
        urls = SAFE_URLS[:5]

        # Track progress counters and the raw event stream.
        progress_data = {"started": 0, "completed": 0, "failed": 0, "updates": []}

        # Progress callback invoked by the crawler for each status change.
        async def on_progress(
            status: str, url: str, result: Optional[CrawlResultContainer] = None
        ):
            if status == "started":
                progress_data["started"] += 1
            elif status == "completed":
                progress_data["completed"] += 1
                if not result.success:
                    progress_data["failed"] += 1

            progress_data["updates"].append((status, url))
            print(f"Progress: {status} - {url}")

        # Crawl with progress tracking
        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            progress_callback=on_progress,
        )

        # Every URL must emit exactly one "started" and one "completed" event.
        assert progress_data["started"] == len(urls)
        assert progress_data["completed"] == len(urls)
        assert len(progress_data["updates"]) == len(urls) * 2  # start + complete events

    @pytest.mark.asyncio
    async def test_crawl_with_dynamic_retry_strategy(self, clean_browser_hub):
        """Test crawling with a dynamic retry strategy."""
        # Include URLs that might fail
        urls = [
            "https://example.com",
            "https://httpstat.us/500",
            "https://httpstat.us/404",
        ]

        # Custom retry strategy: returns (should_retry, delay_seconds).
        async def retry_strategy(
            url: str, attempt: int, error: Exception
        ) -> Tuple[bool, float]:
            # Only retry 500 errors, not 404s
            if "500" in url:
                return True, 1.0  # Retry with 1 second delay
            return False, 0.0  # Don't retry other errors

        # Crawl with custom retry strategy
        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            retry_strategy=retry_strategy,
            max_retries=3,
        )

        # Verify results
        assert len(results) == len(urls)

        # Example.com should succeed
        assert results[urls[0]].success

        # httpstat.us pages return content even for error status codes
        # so our crawler marks them as successful since it got HTML content
        # Verify that we got the expected status code
        assert results[urls[1]].status_code == 500

        # 404 should have the correct status code
        assert results[urls[2]].status_code == 404

    @pytest.mark.asyncio
    async def test_crawl_with_very_large_batch(self, clean_browser_hub):
        """Test crawling with a very large batch of URLs."""
        # Create a batch from all safe URLs, deduplicated while preserving
        # order.  Note: In a real test, we'd use more URLs, but for
        # simplicity we keep it to the 10 unique URLs available.
        large_batch = list(dict.fromkeys(SAFE_URLS))  # 10 unique URLs

        # Set a reasonable concurrency limit
        concurrency = 10

        # Time the crawl
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(
            large_batch,
            crawler_config=CrawlerRunConfig(
                wait_until="domcontentloaded",
                page_timeout=10000,  # Shorter timeout for large batch
            ),
            concurrency=concurrency,
        )
        end_time = asyncio.get_event_loop().time()

        # Verify results
        assert len(results) == len(large_batch)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Large batch crawl of {len(large_batch)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(large_batch)}")
        print(
            f"Average time per URL: {(end_time - start_time) / len(large_batch):.2f}s"
        )

        # At least 80% should succeed (from our unique URLs)
        assert successful / len(results) >= 0.8
|
if __name__ == "__main__":
    # Run this module's async tests via pytest and propagate its exit
    # code so CI scripts see failures (the return value of pytest.main
    # was previously discarded).
    raise SystemExit(pytest.main(["-xvs", __file__]))