diff --git a/CHANGES_CDP_CONCURRENCY.md b/CHANGES_CDP_CONCURRENCY.md
new file mode 100644
index 00000000..47a5bef8
--- /dev/null
+++ b/CHANGES_CDP_CONCURRENCY.md
@@ -0,0 +1,214 @@
+# CDP Browser Concurrency Fixes and Improvements
+
+## Overview
+
+This document describes the changes made to fix concurrency issues with CDP (Chrome DevTools Protocol) browsers when using `arun_many`, and to improve overall browser management.
+
+## Problems Addressed
+
+1. **Race Conditions in Page Creation**: When using managed CDP browsers with concurrent `arun_many` calls, the code attempted to reuse existing pages from `context.pages`, leading to race conditions and "Target page/context closed" errors.
+
+2. **Proxy Configuration Issues**: Proxy credentials were incorrectly embedded in the `--proxy-server` URL, which doesn't work properly with CDP browsers.
+
+3. **Insufficient Startup Checks**: Browser process startup checks were minimal and didn't catch early failures effectively.
+
+4. **Unclear Logging**: Logging messages lacked structure and context, making debugging difficult.
+
+5. **Duplicate Browser Arguments**: Browser launch arguments could contain duplicates despite deduplication attempts.
+
+## Solutions Implemented
+
+### 1. Always Create New Pages in Managed Browser Mode
+
+**File**: `crawl4ai/browser_manager.py` (lines 1106-1113)
+
+**Change**: Modified the `get_page()` method to always create a new page instead of attempting to reuse an existing one for managed browsers without `storage_state`.
+
+**Before**:
+```python
+context = self.default_context
+pages = context.pages
+page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
+if not page:
+    if pages:
+        page = pages[0]
+    else:
+        # Create new page only if none exist
+        async with self._page_lock:
+            page = await context.new_page()
+```
+
+**After**:
+```python
+context = self.default_context
+# Always create new pages instead of reusing existing ones
+# This prevents race conditions in concurrent scenarios (arun_many with CDP)
+# Serialize page creation to avoid 'Target page/context closed' errors
+async with self._page_lock:
+    page = await context.new_page()
+await self._apply_stealth_to_page(page)
+```
+
+**Benefits**:
+- Eliminates race conditions when multiple tasks call `arun_many` concurrently
+- Each request gets a fresh, independent page
+- The page lock serializes creation to prevent TOCTOU (time-of-check to time-of-use) issues
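+
+For reference, here is a minimal usage sketch of the scenario these changes target: several URLs crawled concurrently through a single managed CDP browser. It mirrors the configuration used in `tests/browser/smoke_test_cdp.py`; the URLs are placeholders.
+
+```python
+import asyncio
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
+
+
+async def main():
+    # One managed (CDP) browser shared by all concurrent requests
+    browser_config = BrowserConfig(use_managed_browser=True, headless=True)
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # With this change, each URL gets its own freshly created page
+        results = await crawler.arun_many(
+            urls=["https://example.com", "https://www.example.org"],
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+        )
+        for result in results:
+            print(result.url, result.success)
+
+
+asyncio.run(main())
+```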
+
+### 2. Fixed Proxy Flag Formatting
+
+**File**: `crawl4ai/browser_manager.py` (lines 103-109)
+
+**Change**: Removed credentials from the proxy URL; they should be handled via a separate authentication mechanism in CDP.
+
+**Before**:
+```python
+elif config.proxy_config:
+    creds = ""
+    if config.proxy_config.username and config.proxy_config.password:
+        creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
+    flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
+```
+
+**After**:
+```python
+elif config.proxy_config:
+    # Note: For CDP/managed browsers, proxy credentials should be handled
+    # via authentication, not in the URL. Only pass the server address.
+    flags.append(f"--proxy-server={config.proxy_config.server}")
+```
+
+### 3. Enhanced Startup Checks
+
+**File**: `crawl4ai/browser_manager.py` (lines 298-336)
+
+**Changes**:
+- Multiple check intervals (0.1s, 0.2s, 0.3s) to catch early failures
+- Capture and log stdout/stderr on failure (limited to 200 chars)
+- Raise `RuntimeError` with detailed diagnostics on startup failure
+- Log the process PID on successful startup in verbose mode
+
+**Benefits**:
+- Catches browser crashes during startup
+- Provides detailed diagnostic information for debugging
+- Fails fast with clear error messages
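+
+The shape of the check is roughly as follows. This is a simplified sketch, assuming the browser was launched via `subprocess.Popen` with `stdout`/`stderr` pipes; the helper name is illustrative, not the exact code in `browser_manager.py`.
+
+```python
+import asyncio
+import subprocess
+
+
+async def assert_browser_started(process: subprocess.Popen) -> None:
+    """Poll the freshly launched browser process and fail fast if it died."""
+    # Staggered checks (0.1s, 0.2s, 0.3s) catch crashes right after launch
+    for delay in (0.1, 0.2, 0.3):
+        await asyncio.sleep(delay)
+        if process.poll() is not None:
+            stdout, stderr = process.communicate()
+            # Truncate captured output so the error message stays readable
+            raise RuntimeError(
+                f"Browser process exited during startup (code={process.returncode}); "
+                f"stdout={stdout[:200]!r}, stderr={stderr[:200]!r}"
+            )
+```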
+
+### 4. Improved Logging
+
+**File**: `crawl4ai/browser_manager.py` (lines 218-291)
+
+**Changes**:
+- Structured logging with proper parameter substitution
+- Log browser type, port, and headless status at launch
+- Format and log the full launch command with proper shell escaping
+- Better error messages with context
+- Consistent use of the logger, with null checks
+
+**Example**:
+```python
+if self.logger and self.browser_config.verbose:
+    self.logger.debug(
+        "Launching browser: {browser_type} | Port: {port} | Headless: {headless}",
+        tag="BROWSER",
+        params={
+            "browser_type": self.browser_type,
+            "port": self.debugging_port,
+            "headless": self.headless
+        }
+    )
+```
+
+### 5. Deduplicate Browser Launch Arguments
+
+**File**: `crawl4ai/browser_manager.py` (lines 424-425)
+
+**Change**: Added explicit deduplication after merging all flags.
+
+```python
+# merge common launch flags
+flags.extend(self.build_browser_flags(self.browser_config))
+# Deduplicate flags - use dict.fromkeys to preserve order while removing duplicates
+flags = list(dict.fromkeys(flags))
+```
+
+### 6. Import Refactoring
+
+**Files**: `crawl4ai/browser_manager.py`, `crawl4ai/browser_profiler.py`, `tests/browser/test_cdp_concurrency.py`
+
+**Changes**: Organized all imports according to PEP 8:
+1. Standard library imports (alphabetized)
+2. Third-party imports (alphabetized)
+3. Local imports (alphabetized)
+
+**Benefits**:
+- Improved code readability
+- Easier to spot missing or unused imports
+- Consistent style across the codebase
+
+## Testing
+
+### New Test Suite
+
+**File**: `tests/browser/test_cdp_concurrency.py`
+
+A suite of eight tests covering:
+
+1. **Basic Concurrent arun_many**: Validates that multiple URLs can be crawled concurrently
+2. **Sequential arun_many Calls**: Ensures multiple sequential batches work correctly
+3. **Stress Test**: Multiple concurrent `arun_many` calls to test the effectiveness of the page lock
+4. **Page Isolation**: Verifies that pages are truly independent
+5. **Different Configurations**: Tests with varying viewport sizes and configs
+6. **Error Handling**: Ensures errors in one request don't affect others
+7. **Large Batches**: Scalability test with 10+ URLs
+8. **Smoke Test Script**: Standalone script for quick validation
+
+### Running Tests
+
+**With pytest** (if available):
+```bash
+cd /path/to/crawl4ai
+pytest tests/browser/test_cdp_concurrency.py -v
+```
+
+**Standalone smoke test**:
+```bash
+cd /path/to/crawl4ai
+python3 tests/browser/smoke_test_cdp.py
+```
+
+## Migration Guide
+
+### For Users
+
+No breaking changes. Existing code will continue to work, with better reliability in concurrent scenarios.
+
+### For Contributors
+
+When working with managed browsers (see the sketch after this list):
+1. Always use the page lock when creating pages in shared contexts
+2. Prefer creating new pages over reusing existing ones for concurrent operations
+3. Use structured logging with parameter substitution
+4. Follow PEP 8 import organization
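+
+A minimal sketch of guidelines 1 and 2, assuming a shared Playwright-style context object. The `_page_lock` and `context.new_page()` names mirror `browser_manager.py`; the helper function itself is illustrative.
+
+```python
+import asyncio
+
+# One lock per shared browser context (mirrors self._page_lock)
+_page_lock = asyncio.Lock()
+
+
+async def open_fresh_page(context):
+    """Create a fresh page per request, serializing creation on the lock."""
+    # Serializing new_page() calls avoids TOCTOU races in which concurrent
+    # tasks grab or tear down each other's pages in the shared context.
+    async with _page_lock:
+        page = await context.new_page()
+    return page
+```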
+
+## Performance Impact
+
+- **Positive**: Eliminates race conditions and crashes in concurrent scenarios
+- **Neutral**: Page creation overhead is negligible compared to page navigation
+- **Consideration**: More pages may be created, but they are properly closed after use
+
+## Backward Compatibility
+
+All changes are backward compatible. Session-based page reuse still works as before when `session_id` is provided.
+
+## Related Issues
+
+- Fixes race conditions in concurrent `arun_many` calls with CDP browsers
+- Addresses "Target page/context closed" errors
+- Improves browser startup reliability
+
+## Future Improvements
+
+Consider:
+1. Configurable page pooling with proper lifecycle management
+2. More granular locks for different contexts
+3. Metrics for page creation/reuse patterns
+4. Connection pooling for CDP connections
diff --git a/tests/browser/smoke_test_cdp.py b/tests/browser/smoke_test_cdp.py
new file mode 100755
index 00000000..e29ba39d
--- /dev/null
+++ b/tests/browser/smoke_test_cdp.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""
+Simple smoke test for CDP concurrency fixes.
+This can be run without pytest to quickly validate the changes.
+"""
+
+import asyncio
+import os
+import sys
+import traceback
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
+
+
+async def test_basic_cdp():
+    """Basic test that the CDP browser works"""
+    print("Test 1: Basic CDP browser test...")
+
+    browser_config = BrowserConfig(
+        use_managed_browser=True,
+        headless=True,
+        verbose=False
+    )
+
+    try:
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                url="https://example.com",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            assert result.success, f"Failed: {result.error_message}"
+            assert len(result.html) > 0, "Empty HTML"
+            print(" ✓ Basic CDP test passed")
+            return True
+    except Exception as e:
+        print(f" ✗ Basic CDP test failed: {e}")
+        return False
+
+
+async def test_arun_many_cdp():
+    """Test arun_many with a CDP browser - the key concurrency fix"""
+    print("\nTest 2: arun_many with CDP browser...")
+
+    browser_config = BrowserConfig(
+        use_managed_browser=True,
+        headless=True,
+        verbose=False
+    )
+
+    urls = [
+        "https://example.com",
+        "https://httpbin.org/html",
+        "https://www.example.org",
+    ]
+
+    try:
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            results = await crawler.arun_many(
+                urls=urls,
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+
+            assert len(results) == len(urls), f"Expected {len(urls)} results, got {len(results)}"
+
+            success_count = sum(1 for r in results if r.success)
+            print(f" ✓ Crawled {success_count}/{len(urls)} URLs successfully")
+
+            if success_count >= len(urls) * 0.8:  # Allow 20% failure for network issues
+                print(" ✓ arun_many CDP test passed")
+                return True
+            else:
+                print(f" ✗ Too many failures: {len(urls) - success_count}/{len(urls)}")
+                return False
+
+    except Exception as e:
+        print(f" ✗ arun_many CDP test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+async def test_concurrent_arun_many():
+    """Test concurrent arun_many calls - a stress test for the page lock"""
+    print("\nTest 3: Concurrent arun_many calls...")
+
+    browser_config = BrowserConfig(
+        use_managed_browser=True,
+        headless=True,
+        verbose=False
+    )
+
+    try:
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            # Run two arun_many calls concurrently
+            task1 = crawler.arun_many(
+                urls=["https://example.com", "https://httpbin.org/html"],
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+
+            task2 = crawler.arun_many(
+                urls=["https://www.example.org", "https://example.com"],
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+
+            results1, results2 = await asyncio.gather(task1, task2, return_exceptions=True)
+
+            # Check for exceptions
+            if isinstance(results1, Exception):
+                print(f" ✗ Task 1 raised exception: {results1}")
+                return False
+            if isinstance(results2, Exception):
+                print(f" ✗ Task 2 raised exception: {results2}")
+                return False
+
+            total_success = sum(1 for r in results1 if r.success) + sum(1 for r in results2 if r.success)
+            total_requests = len(results1) + len(results2)
+
+            print(f" ✓ {total_success}/{total_requests} concurrent requests succeeded")
+
+            if total_success >= total_requests * 0.7:  # Allow 30% failure under concurrent stress
+                print(" ✓ Concurrent arun_many test passed")
+                return True
+            else:
+                print(" ✗ Too many concurrent failures")
+                return False
+
+    except Exception as e:
+        print(f" ✗ Concurrent test failed: {e}")
+        traceback.print_exc()
+        return False
+
+
+async def main():
+    """Run all smoke tests"""
+    print("=" * 60)
+    print("CDP Concurrency Smoke Tests")
+    print("=" * 60)
+
+    results = []
+
+    # Run tests sequentially
+    results.append(await test_basic_cdp())
+    results.append(await test_arun_many_cdp())
+    results.append(await test_concurrent_arun_many())
+
+    print("\n" + "=" * 60)
+    passed = sum(results)
+    total = len(results)
+
+    if passed == total:
+        print(f"✓ All {total} smoke tests passed!")
+        print("=" * 60)
+        return 0
+    else:
+        print(f"✗ {total - passed}/{total} smoke tests failed")
+        print("=" * 60)
+        return 1
+
+
+if __name__ == "__main__":
+    exit_code = asyncio.run(main())
+    sys.exit(exit_code)