crawl4ai/tests/pipeline/test_crawler.py

# test_crawler.py
import asyncio
import warnings
import pytest
import pytest_asyncio
from typing import Optional, Tuple

# Define test fixtures
@pytest_asyncio.fixture
async def clean_browser_hub():
    """Shut down every browser hub once the requesting test has finished.

    The fixture yields no value; tests request it purely for this teardown.
    """
    # No setup step: hand control straight to the test.
    yield

    # Teardown starts here. The import is deferred until this point.
    from crawl4ai.browser import BrowserHub

    try:
        await BrowserHub.shutdown_all()
    except Exception as cleanup_error:
        # Best effort: report the problem instead of failing the test on cleanup.
        print(f"Error during browser cleanup: {cleanup_error}")

from crawl4ai import Crawler
from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.browser import BrowserHub
from crawl4ai.cache_context import CacheMode

import warnings
from pydantic import PydanticDeprecatedSince20


# Public URLs used as crawl targets by the tests in this module.
# Order is significant: tests select their inputs by slicing this list
# (e.g. SAFE_URLS[:3], SAFE_URLS[2:4]), so reordering or inserting entries
# changes which URLs a given test crawls.
SAFE_URLS = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpbin.org/user-agent",
    "https://httpstat.us/200",
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/comments/1",
    "https://iana.org",
    "https://www.python.org",
]


class TestCrawlerBasic:
    """Basic tests for the ``Crawler`` utility class.

    Every test crawls live public URLs, so network access is required.
    The ``clean_browser_hub`` fixture yields no value; it is requested only
    for the browser-hub teardown it performs after each test.
    """

    @pytest.mark.asyncio
    async def test_simple_crawl_single_url(self, clean_browser_hub):
        """Crawl a single URL with the default configuration."""
        with warnings.catch_warnings():
            # Silence all warnings raised while crawling and inspecting the
            # result; the filter is restored when the block exits.
            warnings.filterwarnings("ignore", category=Warning)

            # Basic single URL crawl with default configuration
            url = "https://example.com"
            result = await Crawler.crawl(url)

            # A single URL yields a single result container
            assert isinstance(result, CrawlResultContainer)
            assert result.success
            assert result.url == url
            assert result.html is not None
            assert len(result.html) > 0

    @pytest.mark.asyncio
    async def test_crawl_with_custom_config(self, clean_browser_hub):
        """Crawl one URL with explicit browser and crawler configuration."""
        # Custom browser config
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1280,
            viewport_height=800,
        )

        # Custom crawler config: bypass the cache and request a screenshot
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="networkidle", screenshot=True
        )

        # Crawl with custom configuration
        url = "https://httpbin.org/html"
        result = await Crawler.crawl(
            url, browser_config=browser_config, crawler_config=crawler_config
        )

        # The screenshot requested in the crawler config must be present
        assert result.success
        assert result.url == url
        assert result.screenshot is not None

    @pytest.mark.asyncio
    async def test_crawl_multiple_urls_sequential(self, clean_browser_hub):
        """Crawl a list of URLs through ``Crawler.crawl``.

        Passing a list is expected to return a dict keyed by URL.
        """
        # Use a few test URLs
        urls = SAFE_URLS[:3]

        # Custom crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Crawl multiple URLs sequentially
        results = await Crawler.crawl(urls, crawler_config=crawler_config)

        # One entry per requested URL
        assert isinstance(results, dict)
        assert len(results) == len(urls)

        for url in urls:
            assert url in results
            assert results[url].success
            assert results[url].html is not None

    @pytest.mark.asyncio
    async def test_crawl_with_error_handling(self, clean_browser_hub):
        """Check that a failing URL is reported in the results, not dropped."""
        # Include a valid URL and a non-existent URL
        urls = ["https://example.com", "https://non-existent-domain-123456789.com"]

        # Crawl with retries
        results = await Crawler.crawl(urls, max_retries=2, retry_delay=1.0)

        # Verify results for both URLs
        assert len(results) == 2

        # Valid URL should succeed
        assert results[urls[0]].success

        # Invalid URL should fail but still be in results with an error message
        assert urls[1] in results
        assert not results[urls[1]].success
        assert results[urls[1]].error_message is not None


class TestCrawlerParallel:
    """Tests for the parallel crawling capabilities of ``Crawler``.

    Every test crawls live public URLs, so network access is required.
    Tests that check a success rate accept up to 20% failed URLs. Timings
    are printed for information only and are never asserted on.
    """

    @pytest.mark.asyncio
    async def test_parallel_crawl_simple(self, clean_browser_hub):
        """Crawl several URLs in parallel with one shared configuration."""
        # Use several URLs for parallel crawling
        urls = SAFE_URLS[:5]

        # Basic crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Crawl in parallel with default concurrency, timed with the running
        # loop's clock (we are inside a coroutine, so a loop is running).
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(urls, crawler_config=crawler_config)
        end_time = loop.time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_concurrency_limit(self, clean_browser_hub):
        """Crawl in parallel while passing an explicit ``concurrency`` value."""
        # Use more URLs to test concurrency control
        urls = SAFE_URLS[:8]

        # Custom crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
        )

        # Limited concurrency
        concurrency = 2

        # Time the crawl
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            urls, crawler_config=crawler_config, concurrency=concurrency
        )
        end_time = loop.time()

        # Verify results
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Parallel crawl with concurrency={concurrency} of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_different_configs(self, clean_browser_hub):
        """Crawl batches of URLs, each batch with its own crawler config.

        Each batch is a ``(urls, crawler_config)`` tuple.
        """
        # Create URL batches with different configurations
        batch1 = (
            SAFE_URLS[:2],
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
        )
        batch2 = (
            SAFE_URLS[2:4],
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
        )
        batch3 = (
            SAFE_URLS[4:6],
            CrawlerRunConfig(wait_until="load", scan_full_page=True),
        )

        # Crawl with mixed configurations
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl([batch1, batch2, batch3])
        end_time = loop.time()

        # Extract all URLs
        all_urls = batch1[0] + batch2[0] + batch3[0]

        # Verify results
        assert len(results) == len(all_urls)

        # Screenshots were disabled for batch1 and requested for batch2;
        # batch3 is only covered by the result-count check above.
        for url in batch1[0]:
            assert not results[url].screenshot

        for url in batch2[0]:
            assert results[url].screenshot

        print(
            f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s"
        )

    @pytest.mark.asyncio
    async def test_parallel_crawl_with_shared_browser_hub(self, clean_browser_hub):
        """Crawl in parallel through an explicitly created browser hub."""
        # Create and initialize a browser hub
        browser_config = BrowserConfig(browser_type="chromium", headless=True)

        browser_hub = await BrowserHub.get_browser_manager(
            config=browser_config,
            max_browsers_per_config=3,
            max_pages_per_browser=4,
            initial_pool_size=1,
        )

        try:
            # Use the hub for parallel crawling
            urls = SAFE_URLS[:6]

            loop = asyncio.get_running_loop()
            start_time = loop.time()
            results = await Crawler.parallel_crawl(
                urls,
                browser_hub=browser_hub,
                crawler_config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
                ),
            )
            end_time = loop.time()

            # Verify results
            assert len(results) == len(urls)
            successful = sum(1 for r in results.values() if r.success)

            print(
                f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
            )
            print(f"Success rate: {successful}/{len(urls)}")

            # Get browser hub statistics
            hub_stats = await browser_hub.get_pool_status()
            print(f"Browser hub stats: {hub_stats}")

            # At least 80% should succeed
            assert successful / len(urls) >= 0.8

        finally:
            # Always close the hub this test created, even if an assertion failed
            await browser_hub.close()


class TestCrawlerAdvanced:
    """Advanced tests for the ``Crawler`` utility class.

    Covers per-batch browser/crawler configuration, progress callbacks,
    custom retry strategies and a larger batch. Every test crawls live
    public URLs, so network access is required.
    """

    @pytest.mark.asyncio
    async def test_crawl_with_customized_batch_config(self, clean_browser_hub):
        """Crawl batches that each carry their own browser and crawler config.

        Each batch is a ``(urls, browser_config, crawler_config)`` tuple.
        """
        # Create URL batches with different browser and crawler configurations
        browser_config1 = BrowserConfig(browser_type="chromium", headless=True)
        # NOTE(review): headless=False presumably needs a display server;
        # confirm this is intended for CI environments.
        browser_config2 = BrowserConfig(
            browser_type="chromium", headless=False, viewport_width=1920
        )

        crawler_config1 = CrawlerRunConfig(wait_until="domcontentloaded")
        crawler_config2 = CrawlerRunConfig(wait_until="networkidle", screenshot=True)

        batch1 = (SAFE_URLS[:2], browser_config1, crawler_config1)
        batch2 = (SAFE_URLS[2:4], browser_config2, crawler_config2)

        # Crawl with mixed configurations
        results = await Crawler.parallel_crawl([batch1, batch2])

        # Extract all URLs
        all_urls = batch1[0] + batch2[0]

        # Verify results
        assert len(results) == len(all_urls)

        # Verify batch-specific processing
        for url in batch1[0]:
            assert results[url].screenshot is None  # No screenshots for batch1

        for url in batch2[0]:
            assert results[url].screenshot is not None  # Should have screenshots for batch2

    @pytest.mark.asyncio
    async def test_crawl_with_progress_callback(self, clean_browser_hub):
        """Check that the progress callback sees one start and one completion per URL."""
        # Use several URLs
        urls = SAFE_URLS[:5]

        # Track progress
        progress_data = {"started": 0, "completed": 0, "failed": 0, "updates": []}

        async def on_progress(
            status: str, url: str, result: Optional[CrawlResultContainer] = None
        ):
            """Record every progress event reported for ``url``."""
            if status == "started":
                progress_data["started"] += 1
            elif status == "completed":
                progress_data["completed"] += 1
                # ``result`` is optional: count a completion without a result
                # as a failure instead of raising AttributeError on ``None``.
                if result is None or not result.success:
                    progress_data["failed"] += 1

            progress_data["updates"].append((status, url))
            print(f"Progress: {status} - {url}")

        # Crawl with progress tracking; only the callback's bookkeeping is checked
        await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            progress_callback=on_progress,
        )

        # Verify progress tracking
        assert progress_data["started"] == len(urls)
        assert progress_data["completed"] == len(urls)
        assert len(progress_data["updates"]) == len(urls) * 2  # start + complete events

    @pytest.mark.asyncio
    async def test_crawl_with_dynamic_retry_strategy(self, clean_browser_hub):
        """Crawl with a caller-supplied retry strategy."""
        # Include URLs that might fail
        urls = [
            "https://example.com",
            "https://httpstat.us/500",
            "https://httpstat.us/404",
        ]

        async def retry_strategy(
            url: str, attempt: int, error: Exception
        ) -> Tuple[bool, float]:
            """Return ``(should_retry, delay_seconds)`` for a failed ``url``."""
            # Only retry 500 errors, not 404s
            if "500" in url:
                return True, 1.0  # Retry with 1 second delay
            return False, 0.0  # Don't retry other errors

        # Crawl with custom retry strategy
        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            retry_strategy=retry_strategy,
            max_retries=3,
        )

        # Verify results
        assert len(results) == len(urls)

        # Example.com should succeed
        assert results[urls[0]].success

        # httpstat.us pages return content even for error status codes
        # so our crawler marks them as successful since it got HTML content
        # Verify that we got the expected status code
        assert results[urls[1]].status_code == 500

        # 404 should have the correct status code
        assert results[urls[2]].status_code == 404

    @pytest.mark.asyncio
    async def test_crawl_with_very_large_batch(self, clean_browser_hub):
        """Crawl a batch of URLs with a short page timeout and high concurrency."""
        # Note: In a real test, we'd use more URLs, but for simplicity we'll use a smaller set.
        # Results are keyed by URL, so the batch must hold unique URLs for the
        # length assertion below to be meaningful: this is 5 unique URLs.
        large_batch = list(dict.fromkeys(SAFE_URLS[:5]))

        # Set a reasonable concurrency limit
        concurrency = 10

        # Time the crawl with the running loop's clock
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            large_batch,
            crawler_config=CrawlerRunConfig(
                wait_until="domcontentloaded",
                page_timeout=10000,  # Shorter timeout for large batch
            ),
            concurrency=concurrency,
        )
        end_time = loop.time()

        # Verify results
        assert len(results) == len(large_batch)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Large batch crawl of {len(large_batch)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(large_batch)}")
        print(
            f"Average time per URL: {(end_time - start_time) / len(large_batch):.2f}s"
        )

        # At least 80% should succeed (from our unique URLs)
        assert successful / len(results) >= 0.8


if __name__ == "__main__":
    # Run this file's tests through pytest (-x: stop on first failure,
    # -v: verbose, -s: no output capture) and propagate pytest's exit code so
    # a failing run is visible to the calling shell or CI job.
    raise SystemExit(pytest.main(["-xvs", __file__]))