"""
|
|
Test CDP browser concurrency with arun_many.
|
|
|
|
This test suite validates that the fixes for concurrent page creation
|
|
in managed browsers (CDP mode) work correctly, particularly:
|
|
1. Always creating new pages instead of reusing
|
|
2. Page lock serialization prevents race conditions
|
|
3. Multiple concurrent arun_many calls work correctly
|
|
"""
# Standard library imports
import asyncio
import os
import sys

# Third-party imports
import pytest

# Add the project root to Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

# Local imports
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
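
# The sketch below is purely illustrative: it uses hypothetical names, not crawl4ai's
# actual internals, to show the pattern these tests exercise. Every request gets a
# brand-new page, and page creation is serialized behind an asyncio.Lock so concurrent
# arun_many calls cannot race on the shared CDP connection.
class _ExamplePageFactory:
    """Sketch only: always open a fresh page, with creation serialized by a lock."""

    def __init__(self, context):
        self._context = context      # assumed: a Playwright-style browser context
        self._lock = asyncio.Lock()  # shared lock guarding page creation

    async def new_page(self):
        async with self._lock:  # serialize "new page" CDP calls
            return await self._context.new_page()  # always a fresh page, never reused

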
@pytest.mark.asyncio
async def test_cdp_concurrent_arun_many_basic():
    """
    Test basic concurrent arun_many with a CDP browser.
    This exercises the fix that always creates new pages.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    urls = [
        "https://example.com",
        "https://www.python.org",
        "https://httpbin.org/html",
    ]

    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run arun_many; it should create a new page for each URL
        results = await crawler.arun_many(urls=urls, config=config)

        # Verify all URLs were crawled successfully
        assert len(results) == len(urls), f"Expected {len(urls)} results, got {len(results)}"

        for i, result in enumerate(results):
            assert result is not None, f"Result {i} is None"
            assert result.success, f"Result {i} failed: {result.error_message}"
            assert result.status_code == 200, f"Result {i} has status {result.status_code}"
            assert len(result.html) > 0, f"Result {i} has empty HTML"


@pytest.mark.asyncio
async def test_cdp_multiple_sequential_arun_many():
    """
    Test multiple sequential arun_many calls with a CDP browser.
    Each call should work correctly without interference.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    urls_batch1 = [
        "https://example.com",
        "https://httpbin.org/html",
    ]

    urls_batch2 = [
        "https://www.python.org",
        "https://example.org",
    ]

    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First batch
        results1 = await crawler.arun_many(urls=urls_batch1, config=config)
        assert len(results1) == len(urls_batch1)
        for result in results1:
            assert result.success, f"First batch failed: {result.error_message}"

        # Second batch should work without issues
        results2 = await crawler.arun_many(urls=urls_batch2, config=config)
        assert len(results2) == len(urls_batch2)
        for result in results2:
            assert result.success, f"Second batch failed: {result.error_message}"


@pytest.mark.asyncio
async def test_cdp_concurrent_arun_many_stress():
    """
    Stress test: multiple concurrent arun_many calls with a CDP browser.
    This is the key test for the concurrency fix; it ensures the page lock works.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    # Create multiple batches of URLs
    num_batches = 3
    urls_per_batch = 3

    batches = [
        [f"https://httpbin.org/delay/{i}?batch={batch}"
         for i in range(urls_per_batch)]
        for batch in range(num_batches)
    ]

    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run multiple arun_many calls concurrently
        tasks = [
            crawler.arun_many(urls=batch, config=config)
            for batch in batches
        ]

        # Execute all batches in parallel
        all_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Verify no exceptions occurred
        for i, results in enumerate(all_results):
            assert not isinstance(results, Exception), f"Batch {i} raised exception: {results}"
            assert len(results) == urls_per_batch, f"Batch {i}: expected {urls_per_batch} results, got {len(results)}"

            # Verify each result
            for j, result in enumerate(results):
                assert result is not None, f"Batch {i}, result {j} is None"
                # Some may fail due to network/timing, but should not crash
                if result.success:
                    assert len(result.html) > 0, f"Batch {i}, result {j} has empty HTML"


@pytest.mark.asyncio
async def test_cdp_page_isolation():
    """
    Test that pages are properly isolated: changes to one don't affect another.
    This validates that we're creating truly independent pages.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    url = "https://example.com"

    # Use different JS snippets to verify isolation
    config1 = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code="document.body.setAttribute('data-test', 'page1');"
    )

    config2 = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code="document.body.setAttribute('data-test', 'page2');"
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run both configs concurrently
        results = await crawler.arun_many(
            urls=[url, url],
            configs=[config1, config2]
        )

        assert len(results) == 2
        assert results[0].success and results[1].success

        # Both should succeed with their own modifications
        # (We can't directly check the data-test attribute, but success indicates isolation)
        assert 'Example Domain' in results[0].html
        assert 'Example Domain' in results[1].html


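# Note on test_cdp_page_isolation above: a stricter check would read back the
# 'data-test' attribute from each page and assert the two values differ. That needs
# the crawl result to expose JavaScript return values (a field such as
# result.js_execution_result is an assumption here and may not exist in every
# crawl4ai version), so the test settles for success plus expected HTML content.

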
@pytest.mark.asyncio
async def test_cdp_with_different_viewport_sizes():
    """
    Test concurrent crawling with different viewport configurations.
    Ensures context/page creation handles different configs correctly.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    url = "https://example.com"

    # Multiple run configs (per-run viewport overrides may be limited in CDP mode,
    # so identical configs are used here)
    configs = [
        CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
    ]

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls=[url] * len(configs),
            configs=configs
        )

        assert len(results) == len(configs)
        for i, result in enumerate(results):
            assert result.success, f"Config {i} failed: {result.error_message}"
            assert len(result.html) > 0


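# Note on test_cdp_with_different_viewport_sizes above: the run configs are currently
# identical. If genuinely different viewports are needed, one option is to launch
# separate crawlers with different browser configs, e.g.
# BrowserConfig(viewport_width=1280, viewport_height=720); those parameter names are
# an assumption about the installed crawl4ai version, not verified here.

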
@pytest.mark.asyncio
async def test_cdp_error_handling_concurrent():
    """
    Test that errors in one concurrent request don't affect others.
    This ensures proper isolation and error handling.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    urls = [
        "https://example.com",  # Valid
        "https://this-domain-definitely-does-not-exist-12345.com",  # Invalid
        "https://httpbin.org/html",  # Valid
    ]

    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls=urls, config=config)

        assert len(results) == len(urls)

        # First and third should succeed
        assert results[0].success, "First URL should succeed"
        assert results[2].success, "Third URL should succeed"

        # Second may fail (invalid domain)
        # But its failure shouldn't affect the others


@pytest.mark.asyncio
async def test_cdp_large_batch():
    """
    Test handling a larger batch of URLs to ensure scalability.
    """
    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False
    )

    # Create 10 URLs
    num_urls = 10
    urls = [f"https://httpbin.org/delay/0?id={i}" for i in range(num_urls)]

    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls=urls, config=config)

        assert len(results) == num_urls

        # Count successes
        successes = sum(1 for r in results if r.success)
        # Allow some failures due to network issues, but most should succeed
        assert successes >= num_urls * 0.8, f"Only {successes}/{num_urls} succeeded"


if __name__ == "__main__":
    # Run tests with pytest
    pytest.main([__file__, "-v", "-s"])
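
# For example, to run only the stress test from the command line (assuming pytest and
# pytest-asyncio are installed):
#     pytest -v -k test_cdp_concurrent_arun_many_stress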