Implement CDP concurrency fixes and improve logging
- Modified get_page() to always create new pages for managed browsers - Ensured page lock serializes all new_page() calls in managed mode - Fixed proxy flag formatting (removed credentials from URL) - Added deduplication of browser launch args - Enhanced startup checks with multiple intervals - Improved logging with structured messages and better formatting - Added comprehensive test suite for CDP concurrency Co-authored-by: Ahmed-Tawfik94 <106467151+Ahmed-Tawfik94@users.noreply.github.com>
This commit is contained in:
@@ -104,10 +104,9 @@ class ManagedBrowser:
|
||||
if config.proxy:
|
||||
flags.append(f"--proxy-server={config.proxy}")
|
||||
elif config.proxy_config:
|
||||
creds = ""
|
||||
if config.proxy_config.username and config.proxy_config.password:
|
||||
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
||||
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
||||
# Note: For CDP/managed browsers, proxy credentials should be handled
|
||||
# via authentication, not in the URL. Only pass the server address.
|
||||
flags.append(f"--proxy-server={config.proxy_config.server}")
|
||||
# dedupe
|
||||
return list(dict.fromkeys(flags))
|
||||
|
||||
@@ -219,11 +218,27 @@ class ManagedBrowser:
|
||||
os.remove(fp)
|
||||
except Exception as _e:
|
||||
# non-fatal — we'll try to start anyway, but log what happened
|
||||
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
|
||||
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
"Pre-launch cleanup failed: {error} | Will attempt to start browser anyway",
|
||||
tag="BROWSER",
|
||||
params={"error": str(_e)}
|
||||
)
|
||||
|
||||
# Start browser process
|
||||
try:
|
||||
# Log browser launch intent
|
||||
if self.logger and self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
"Launching browser: {browser_type} | Port: {port} | Headless: {headless}",
|
||||
tag="BROWSER",
|
||||
params={
|
||||
"browser_type": self.browser_type,
|
||||
"port": self.debugging_port,
|
||||
"headless": self.headless
|
||||
}
|
||||
)
|
||||
|
||||
# Use DETACHED_PROCESS flag on Windows to fully detach the process
|
||||
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
|
||||
if sys.platform == "win32":
|
||||
@@ -241,19 +256,36 @@ class ManagedBrowser:
|
||||
preexec_fn=os.setpgrp # Start in a new process group
|
||||
)
|
||||
|
||||
# If verbose is True print args used to run the process
|
||||
# Log full command if verbose logging is enabled
|
||||
if self.logger and self.browser_config.verbose:
|
||||
# Format args for better readability - escape and join
|
||||
formatted_args = ' '.join(shlex.quote(str(arg)) for arg in args)
|
||||
self.logger.debug(
|
||||
f"Starting browser with args: {' '.join(args)}",
|
||||
tag="BROWSER"
|
||||
)
|
||||
"Browser launch command: {command}",
|
||||
tag="BROWSER",
|
||||
params={"command": formatted_args}
|
||||
)
|
||||
|
||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||
await asyncio.sleep(0.5) # Give browser time to start
|
||||
# Perform startup health checks
|
||||
await asyncio.sleep(0.5) # Initial delay for process startup
|
||||
await self._initial_startup_check()
|
||||
await asyncio.sleep(2) # Give browser time to start
|
||||
return f"http://{self.host}:{self.debugging_port}"
|
||||
await asyncio.sleep(2) # Additional time for browser initialization
|
||||
|
||||
cdp_url = f"http://{self.host}:{self.debugging_port}"
|
||||
if self.logger:
|
||||
self.logger.info(
|
||||
"Browser started successfully | CDP URL: {cdp_url}",
|
||||
tag="BROWSER",
|
||||
params={"cdp_url": cdp_url}
|
||||
)
|
||||
return cdp_url
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
"Failed to start browser: {error}",
|
||||
tag="BROWSER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
await self.cleanup()
|
||||
raise Exception(f"Failed to start browser: {e}")
|
||||
|
||||
@@ -266,23 +298,41 @@ class ManagedBrowser:
|
||||
return
|
||||
|
||||
# Check that process started without immediate termination
|
||||
await asyncio.sleep(0.5)
|
||||
if self.browser_process.poll() is not None:
|
||||
# Process already terminated
|
||||
stdout, stderr = b"", b""
|
||||
try:
|
||||
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
# Perform multiple checks with increasing delays to catch early failures
|
||||
check_intervals = [0.1, 0.2, 0.3] # Total 0.6s
|
||||
|
||||
for delay in check_intervals:
|
||||
await asyncio.sleep(delay)
|
||||
if self.browser_process.poll() is not None:
|
||||
# Process already terminated - capture output for debugging
|
||||
stdout, stderr = b"", b""
|
||||
try:
|
||||
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
|
||||
error_msg = "Browser process terminated during startup"
|
||||
if stderr:
|
||||
error_msg += f" | STDERR: {stderr.decode()[:200]}" # Limit output length
|
||||
if stdout:
|
||||
error_msg += f" | STDOUT: {stdout.decode()[:200]}"
|
||||
|
||||
self.logger.error(
|
||||
message="{error_msg} | Exit code: {code}",
|
||||
tag="BROWSER",
|
||||
params={
|
||||
"error_msg": error_msg,
|
||||
"code": self.browser_process.returncode,
|
||||
},
|
||||
)
|
||||
raise RuntimeError(f"Browser failed to start: {error_msg}")
|
||||
|
||||
self.logger.error(
|
||||
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
||||
tag="ERROR",
|
||||
params={
|
||||
"code": self.browser_process.returncode,
|
||||
"stdout": stdout.decode() if stdout else "",
|
||||
"stderr": stderr.decode() if stderr else "",
|
||||
},
|
||||
# Process is still running after checks - log success
|
||||
if self.logger and self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
"Browser process startup check passed | PID: {pid}",
|
||||
tag="BROWSER",
|
||||
params={"pid": self.browser_process.pid}
|
||||
)
|
||||
|
||||
async def _monitor_browser_process(self):
|
||||
@@ -371,6 +421,8 @@ class ManagedBrowser:
|
||||
flags.append("--headless=new")
|
||||
# merge common launch flags
|
||||
flags.extend(self.build_browser_flags(self.browser_config))
|
||||
# Deduplicate flags - use dict.fromkeys to preserve order while removing duplicates
|
||||
flags = list(dict.fromkeys(flags))
|
||||
elif self.browser_type == "firefox":
|
||||
flags = [
|
||||
"--remote-debugging-port",
|
||||
@@ -1048,21 +1100,12 @@ class BrowserManager:
|
||||
await self._apply_stealth_to_page(page)
|
||||
else:
|
||||
context = self.default_context
|
||||
pages = context.pages
|
||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||
if not page:
|
||||
if pages:
|
||||
page = pages[0]
|
||||
else:
|
||||
# Double-check under lock to avoid TOCTOU and ensure only
|
||||
# one task calls new_page when pages=[] concurrently
|
||||
async with self._page_lock:
|
||||
pages = context.pages
|
||||
if pages:
|
||||
page = pages[0]
|
||||
else:
|
||||
page = await context.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
# Always create new pages instead of reusing existing ones
|
||||
# This prevents race conditions in concurrent scenarios (arun_many with CDP)
|
||||
# Serialize page creation to avoid 'Target page/context closed' errors
|
||||
async with self._page_lock:
|
||||
page = await context.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
else:
|
||||
# Otherwise, check if we have an existing context for this config
|
||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||
|
||||
278
tests/browser/test_cdp_concurrency.py
Normal file
278
tests/browser/test_cdp_concurrency.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
Test CDP browser concurrency with arun_many.
|
||||
|
||||
This test suite validates that the fixes for concurrent page creation
|
||||
in managed browsers (CDP mode) work correctly, particularly:
|
||||
1. Always creating new pages instead of reusing
|
||||
2. Page lock serialization prevents race conditions
|
||||
3. Multiple concurrent arun_many calls work correctly
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the project root to Python path
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_concurrent_arun_many_basic():
|
||||
"""
|
||||
Test basic concurrent arun_many with CDP browser.
|
||||
This tests the fix for always creating new pages.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
urls = [
|
||||
"https://example.com",
|
||||
"https://www.python.org",
|
||||
"https://httpbin.org/html",
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Run arun_many - should create new pages for each URL
|
||||
results = await crawler.arun_many(urls=urls, config=config)
|
||||
|
||||
# Verify all URLs were crawled successfully
|
||||
assert len(results) == len(urls), f"Expected {len(urls)} results, got {len(results)}"
|
||||
|
||||
for i, result in enumerate(results):
|
||||
assert result is not None, f"Result {i} is None"
|
||||
assert result.success, f"Result {i} failed: {result.error_message}"
|
||||
assert result.status_code == 200, f"Result {i} has status {result.status_code}"
|
||||
assert len(result.html) > 0, f"Result {i} has empty HTML"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_multiple_sequential_arun_many():
|
||||
"""
|
||||
Test multiple sequential arun_many calls with CDP browser.
|
||||
Each call should work correctly without interference.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
urls_batch1 = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/html",
|
||||
]
|
||||
|
||||
urls_batch2 = [
|
||||
"https://www.python.org",
|
||||
"https://example.org",
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# First batch
|
||||
results1 = await crawler.arun_many(urls=urls_batch1, config=config)
|
||||
assert len(results1) == len(urls_batch1)
|
||||
for result in results1:
|
||||
assert result.success, f"First batch failed: {result.error_message}"
|
||||
|
||||
# Second batch - should work without issues
|
||||
results2 = await crawler.arun_many(urls=urls_batch2, config=config)
|
||||
assert len(results2) == len(urls_batch2)
|
||||
for result in results2:
|
||||
assert result.success, f"Second batch failed: {result.error_message}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_concurrent_arun_many_stress():
|
||||
"""
|
||||
Stress test: Multiple concurrent arun_many calls with CDP browser.
|
||||
This is the key test for the concurrency fix - ensures page lock works.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
# Create multiple batches of URLs
|
||||
num_batches = 3
|
||||
urls_per_batch = 3
|
||||
|
||||
batches = [
|
||||
[f"https://httpbin.org/delay/{i}?batch={batch}"
|
||||
for i in range(urls_per_batch)]
|
||||
for batch in range(num_batches)
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Run multiple arun_many calls concurrently
|
||||
tasks = [
|
||||
crawler.arun_many(urls=batch, config=config)
|
||||
for batch in batches
|
||||
]
|
||||
|
||||
# Execute all batches in parallel
|
||||
all_results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
# Verify no exceptions occurred
|
||||
for i, results in enumerate(all_results):
|
||||
assert not isinstance(results, Exception), f"Batch {i} raised exception: {results}"
|
||||
assert len(results) == urls_per_batch, f"Batch {i}: expected {urls_per_batch} results, got {len(results)}"
|
||||
|
||||
# Verify each result
|
||||
for j, result in enumerate(results):
|
||||
assert result is not None, f"Batch {i}, result {j} is None"
|
||||
# Some may fail due to network/timing, but should not crash
|
||||
if result.success:
|
||||
assert len(result.html) > 0, f"Batch {i}, result {j} has empty HTML"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_page_isolation():
|
||||
"""
|
||||
Test that pages are properly isolated - changes to one don't affect another.
|
||||
This validates that we're creating truly independent pages.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
url = "https://example.com"
|
||||
|
||||
# Use different JS codes to verify isolation
|
||||
config1 = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
js_code="document.body.setAttribute('data-test', 'page1');"
|
||||
)
|
||||
|
||||
config2 = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
js_code="document.body.setAttribute('data-test', 'page2');"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Run both configs concurrently
|
||||
results = await crawler.arun_many(
|
||||
urls=[url, url],
|
||||
configs=[config1, config2]
|
||||
)
|
||||
|
||||
assert len(results) == 2
|
||||
assert results[0].success and results[1].success
|
||||
|
||||
# Both should succeed with their own modifications
|
||||
# (We can't directly check the data-test attribute, but success indicates isolation)
|
||||
assert 'Example Domain' in results[0].html
|
||||
assert 'Example Domain' in results[1].html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_with_different_viewport_sizes():
|
||||
"""
|
||||
Test concurrent crawling with different viewport configurations.
|
||||
Ensures context/page creation handles different configs correctly.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
url = "https://example.com"
|
||||
|
||||
# Different viewport sizes (though in CDP mode these may be limited)
|
||||
configs = [
|
||||
CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||
CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||
CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=[url] * len(configs),
|
||||
configs=configs
|
||||
)
|
||||
|
||||
assert len(results) == len(configs)
|
||||
for i, result in enumerate(results):
|
||||
assert result.success, f"Config {i} failed: {result.error_message}"
|
||||
assert len(result.html) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_error_handling_concurrent():
|
||||
"""
|
||||
Test that errors in one concurrent request don't affect others.
|
||||
This ensures proper isolation and error handling.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
urls = [
|
||||
"https://example.com", # Valid
|
||||
"https://this-domain-definitely-does-not-exist-12345.com", # Invalid
|
||||
"https://httpbin.org/html", # Valid
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
results = await crawler.arun_many(urls=urls, config=config)
|
||||
|
||||
assert len(results) == len(urls)
|
||||
|
||||
# First and third should succeed
|
||||
assert results[0].success, "First URL should succeed"
|
||||
assert results[2].success, "Third URL should succeed"
|
||||
|
||||
# Second may fail (invalid domain)
|
||||
# But its failure shouldn't affect the others
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cdp_large_batch():
|
||||
"""
|
||||
Test handling a larger batch of URLs to ensure scalability.
|
||||
"""
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
# Create 10 URLs
|
||||
num_urls = 10
|
||||
urls = [f"https://httpbin.org/delay/0?id={i}" for i in range(num_urls)]
|
||||
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
results = await crawler.arun_many(urls=urls, config=config)
|
||||
|
||||
assert len(results) == num_urls
|
||||
|
||||
# Count successes
|
||||
successes = sum(1 for r in results if r.success)
|
||||
# Allow some failures due to network issues, but most should succeed
|
||||
assert successes >= num_urls * 0.8, f"Only {successes}/{num_urls} succeeded"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run tests with pytest
|
||||
pytest.main([__file__, "-v", "-s"])
|
||||
Reference in New Issue
Block a user