Add new Crawler class that provides a simplified interface for both single and batch URL crawling operations.

Key features include:
- Simple single URL crawling with configurable options
- Parallel batch crawling with concurrency control
- Shared browser hub support for resource efficiency
- Progress tracking and custom retry strategies
- Comprehensive error handling and retry logic

Remove demo and extended test files in favor of the new focused test suite.
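A minimal usage sketch of the interface the new tests exercise follows. The batch and parallel calls mirror the test suite below; the single-URL form is an assumption based on the feature list above, and its exact signature may differ.

import asyncio
from crawl4ai import Crawler, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode

async def main():
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded")

    # Batch crawl: returns a dict mapping each URL to its result (as in the tests below).
    results = await Crawler.crawl(
        ["https://example.com", "https://www.iana.org"],
        crawler_config=config,
    )

    # Parallel batch crawl with the same configuration.
    parallel_results = await Crawler.parallel_crawl(list(results), crawler_config=config)

    # Single-URL form (assumption: a plain string is accepted and a single result returned).
    page = await Crawler.crawl("https://example.com", crawler_config=config)

asyncio.run(main())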
"""Test the Crawler class for batch crawling capabilities."""
|
|
|
|
import asyncio
|
|
import pytest
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
from crawl4ai import Crawler
|
|
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.async_logger import AsyncLogger
|
|
from crawl4ai.models import CrawlResult, CrawlResultContainer
|
|
from crawl4ai.browser import BrowserHub
|
|
from crawl4ai.cache_context import CacheMode
|
|
|
|
# Test URLs for crawling
|
|
SAFE_URLS = [
|
|
"https://example.com",
|
|
"https://httpbin.org/html",
|
|
"https://httpbin.org/headers",
|
|
"https://httpbin.org/ip",
|
|
"https://httpbin.org/user-agent",
|
|
"https://httpstat.us/200",
|
|
"https://jsonplaceholder.typicode.com/posts/1",
|
|
"https://jsonplaceholder.typicode.com/comments/1",
|
|
"https://iana.org",
|
|
"https://www.python.org"
|
|
]
|
|
|
|
# Simple test for batch crawling
|
|
@pytest.mark.asyncio
|
|
async def test_batch_crawl_simple():
|
|
"""Test simple batch crawling with multiple URLs."""
|
|
# Use a few test URLs
|
|
urls = SAFE_URLS[:3]
|
|
|
|
# Custom crawler config
|
|
crawler_config = CrawlerRunConfig(
|
|
cache_mode=CacheMode.BYPASS,
|
|
wait_until="domcontentloaded"
|
|
)
|
|
|
|
# Crawl multiple URLs using batch crawl
|
|
results = await Crawler.crawl(
|
|
urls,
|
|
crawler_config=crawler_config
|
|
)
|
|
|
|
# Verify the results
|
|
assert isinstance(results, dict)
|
|
assert len(results) == len(urls)
|
|
|
|
for url in urls:
|
|
assert url in results
|
|
assert results[url].success
|
|
assert results[url].html is not None
|
|
|
|
# Test parallel batch crawling
|
|
@pytest.mark.asyncio
|
|
async def test_parallel_batch_crawl():
|
|
"""Test parallel batch crawling with multiple URLs."""
|
|
# Use several URLs for parallel crawling
|
|
urls = SAFE_URLS[:5]
|
|
|
|
# Basic crawler config
|
|
crawler_config = CrawlerRunConfig(
|
|
cache_mode=CacheMode.BYPASS,
|
|
wait_until="domcontentloaded"
|
|
)
|
|
|
|
# Crawl in parallel
|
|
start_time = asyncio.get_event_loop().time()
|
|
results = await Crawler.parallel_crawl(
|
|
urls,
|
|
crawler_config=crawler_config
|
|
)
|
|
end_time = asyncio.get_event_loop().time()
|
|
|
|
# Verify results
|
|
assert len(results) == len(urls)
|
|
successful = sum(1 for r in results.values() if r.success)
|
|
|
|
print(f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
|
|
print(f"Success rate: {successful}/{len(urls)}")
|
|
|
|
# At least 80% should succeed
|
|
assert successful / len(urls) >= 0.8
|
|
|
|
# Test batch crawling with different configurations
|
|
@pytest.mark.asyncio
|
|
async def test_batch_crawl_mixed_configs():
|
|
"""Test batch crawling with different configurations for different URLs."""
|
|
# Create URL batches with different configurations
|
|
batch1 = (SAFE_URLS[:2], CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False))
|
|
batch2 = (SAFE_URLS[2:4], CrawlerRunConfig(wait_until="networkidle", screenshot=True))
|
|
|
|
# Crawl with mixed configurations
|
|
start_time = asyncio.get_event_loop().time()
|
|
results = await Crawler.parallel_crawl([batch1, batch2])
|
|
end_time = asyncio.get_event_loop().time()
|
|
|
|
# Extract all URLs
|
|
all_urls = batch1[0] + batch2[0]
|
|
|
|
# Verify results
|
|
assert len(results) == len(all_urls)
|
|
|
|
# Check that screenshots are present only for batch2
|
|
for url in batch1[0]:
|
|
assert results[url].screenshot is None
|
|
|
|
for url in batch2[0]:
|
|
assert results[url].screenshot is not None
|
|
|
|
print(f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s")
|
|
|
|
# Test shared browser hub
|
|
@pytest.mark.asyncio
|
|
async def test_batch_crawl_shared_hub():
|
|
"""Test batch crawling with a shared browser hub."""
|
|
# Create and initialize a browser hub
|
|
browser_config = BrowserConfig(
|
|
browser_type="chromium",
|
|
headless=True
|
|
)
|
|
|
|
browser_hub = await BrowserHub.get_browser_manager(
|
|
config=browser_config,
|
|
max_browsers_per_config=3,
|
|
max_pages_per_browser=4,
|
|
initial_pool_size=1
|
|
)
|
|
|
|
try:
|
|
# Use the hub for parallel crawling
|
|
urls = SAFE_URLS[:3]
|
|
|
|
start_time = asyncio.get_event_loop().time()
|
|
results = await Crawler.parallel_crawl(
|
|
urls,
|
|
browser_hub=browser_hub,
|
|
crawler_config=CrawlerRunConfig(
|
|
cache_mode=CacheMode.BYPASS,
|
|
wait_until="domcontentloaded"
|
|
)
|
|
)
|
|
end_time = asyncio.get_event_loop().time()
|
|
|
|
# Verify results
|
|
assert len(results) == len(urls)
|
|
successful = sum(1 for r in results.values() if r.success)
|
|
|
|
print(f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
|
|
print(f"Success rate: {successful}/{len(urls)}")
|
|
|
|
# Get browser hub statistics
|
|
hub_stats = await browser_hub.get_pool_status()
|
|
print(f"Browser hub stats: {hub_stats}")
|
|
|
|
# At least 80% should succeed
|
|
assert successful / len(urls) >= 0.8
|
|
|
|
finally:
|
|
# Clean up the browser hub
|
|
await browser_hub.close() |