crawl4ai/tests/pipeline/test_batch_crawl.py
feat(pipeline): add high-level Crawler utility class for simplified web crawling (UncleCode, commit 72d8e679ad, 2025-04-07 22:50:44 +08:00)
Add new Crawler class that provides a simplified interface for both single and batch URL crawling operations. Key features include:
- Simple single URL crawling with configurable options
- Parallel batch crawling with concurrency control
- Shared browser hub support for resource efficiency
- Progress tracking and custom retry strategies
- Comprehensive error handling and retry logic

Remove demo and extended test files in favor of a new, focused test suite.
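For orientation, here is a minimal usage sketch of the interface exercised by the test suite below. It is a sketch only: the call shapes and the dict-keyed-by-URL return value are taken from the tests, the example URLs are arbitrary, and features beyond what the tests cover (single-URL calls, retry strategies, progress tracking) are not shown.

# Minimal usage sketch, derived from the tests in this file (not the full Crawler API).
import asyncio

from crawl4ai import Crawler, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode


async def main():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,      # always fetch fresh pages
        wait_until="domcontentloaded",    # don't wait for full network idle
    )

    # Batch crawl: a list of URLs yields a dict keyed by URL.
    results = await Crawler.crawl(
        ["https://example.com", "https://www.python.org"],
        crawler_config=config,
    )
    for url, result in results.items():
        print(url, result.success, len(result.html or ""))

    # Parallel variant: same call shape, concurrency handled by the Crawler.
    results = await Crawler.parallel_crawl(
        ["https://httpbin.org/html", "https://httpbin.org/ip"],
        crawler_config=config,
    )
    print(sum(1 for r in results.values() if r.success), "of", len(results), "succeeded")


asyncio.run(main())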
163 lines, 5.1 KiB, Python

"""Test the Crawler class for batch crawling capabilities."""
import asyncio
import pytest
from typing import List, Dict, Any, Optional, Tuple
from crawl4ai import Crawler
from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResult, CrawlResultContainer
from crawl4ai.browser import BrowserHub
from crawl4ai.cache_context import CacheMode

# Test URLs for crawling
SAFE_URLS = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpbin.org/user-agent",
    "https://httpstat.us/200",
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/comments/1",
    "https://iana.org",
    "https://www.python.org"
]


# Simple test for batch crawling
@pytest.mark.asyncio
async def test_batch_crawl_simple():
    """Test simple batch crawling with multiple URLs."""
    # Use a few test URLs
    urls = SAFE_URLS[:3]

    # Custom crawler config
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded"
    )

    # Crawl multiple URLs using batch crawl
    results = await Crawler.crawl(
        urls,
        crawler_config=crawler_config
    )

    # Verify the results
    assert isinstance(results, dict)
    assert len(results) == len(urls)
    for url in urls:
        assert url in results
        assert results[url].success
        assert results[url].html is not None


# Test parallel batch crawling
@pytest.mark.asyncio
async def test_parallel_batch_crawl():
    """Test parallel batch crawling with multiple URLs."""
    # Use several URLs for parallel crawling
    urls = SAFE_URLS[:5]

    # Basic crawler config
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded"
    )

    # Crawl in parallel
    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl(
        urls,
        crawler_config=crawler_config
    )
    end_time = asyncio.get_event_loop().time()

    # Verify results
    assert len(results) == len(urls)
    successful = sum(1 for r in results.values() if r.success)
    print(f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
    print(f"Success rate: {successful}/{len(urls)}")

    # At least 80% should succeed
    assert successful / len(urls) >= 0.8


# Test batch crawling with different configurations
@pytest.mark.asyncio
async def test_batch_crawl_mixed_configs():
    """Test batch crawling with different configurations for different URLs."""
    # Create URL batches with different configurations
    batch1 = (SAFE_URLS[:2], CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False))
    batch2 = (SAFE_URLS[2:4], CrawlerRunConfig(wait_until="networkidle", screenshot=True))

    # Crawl with mixed configurations
    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl([batch1, batch2])
    end_time = asyncio.get_event_loop().time()

    # Extract all URLs
    all_urls = batch1[0] + batch2[0]

    # Verify results
    assert len(results) == len(all_urls)

    # Check that screenshots are present only for batch2
    for url in batch1[0]:
        assert results[url].screenshot is None
    for url in batch2[0]:
        assert results[url].screenshot is not None

    print(f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s")


# Test shared browser hub
@pytest.mark.asyncio
async def test_batch_crawl_shared_hub():
    """Test batch crawling with a shared browser hub."""
    # Create and initialize a browser hub
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True
    )
    browser_hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        max_browsers_per_config=3,
        max_pages_per_browser=4,
        initial_pool_size=1
    )

    try:
        # Use the hub for parallel crawling
        urls = SAFE_URLS[:3]
        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(
            urls,
            browser_hub=browser_hub,
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                wait_until="domcontentloaded"
            )
        )
        end_time = asyncio.get_event_loop().time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)
        print(f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
        print(f"Success rate: {successful}/{len(urls)}")

        # Get browser hub statistics
        hub_stats = await browser_hub.get_pool_status()
        print(f"Browser hub stats: {hub_stats}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8
    finally:
        # Clean up the browser hub
        await browser_hub.close()