Improve CDP connection handling
This commit is contained in:
@@ -425,6 +425,15 @@ class BrowserConfig:
|
|||||||
the local Playwright client resources. Useful for cloud/server scenarios
|
the local Playwright client resources. Useful for cloud/server scenarios
|
||||||
where you don't own the remote browser but need to prevent memory leaks
|
where you don't own the remote browser but need to prevent memory leaks
|
||||||
from accumulated Playwright instances. Default: False.
|
from accumulated Playwright instances. Default: False.
|
||||||
|
cdp_close_delay (float): Seconds to wait after disconnecting a CDP WebSocket before stopping the
|
||||||
|
Playwright subprocess. Gives the connection time to fully release. Set to
|
||||||
|
0 to skip the delay entirely. Only applies when cdp_cleanup_on_close=True.
|
||||||
|
Default: 1.0.
|
||||||
|
cache_cdp_connection (bool): When True and using cdp_url, the Playwright subprocess and CDP WebSocket
|
||||||
|
are cached at the class level and shared across multiple BrowserManager
|
||||||
|
instances connecting to the same cdp_url. Reference-counted; the connection
|
||||||
|
is only closed when the last user releases it. Eliminates the overhead of
|
||||||
|
repeated Playwright/CDP setup and teardown. Default: False.
|
||||||
create_isolated_context (bool): When True and using cdp_url, forces creation of a new browser context
|
create_isolated_context (bool): When True and using cdp_url, forces creation of a new browser context
|
||||||
instead of reusing the default context. Essential for concurrent crawls
|
instead of reusing the default context. Essential for concurrent crawls
|
||||||
on the same browser to prevent navigation conflicts. Default: False.
|
on the same browser to prevent navigation conflicts. Default: False.
|
||||||
@@ -485,6 +494,8 @@ class BrowserConfig:
|
|||||||
browser_context_id: str = None,
|
browser_context_id: str = None,
|
||||||
target_id: str = None,
|
target_id: str = None,
|
||||||
cdp_cleanup_on_close: bool = False,
|
cdp_cleanup_on_close: bool = False,
|
||||||
|
cdp_close_delay: float = 1.0,
|
||||||
|
cache_cdp_connection: bool = False,
|
||||||
create_isolated_context: bool = False,
|
create_isolated_context: bool = False,
|
||||||
use_persistent_context: bool = False,
|
use_persistent_context: bool = False,
|
||||||
user_data_dir: str = None,
|
user_data_dir: str = None,
|
||||||
@@ -529,6 +540,8 @@ class BrowserConfig:
|
|||||||
self.browser_context_id = browser_context_id
|
self.browser_context_id = browser_context_id
|
||||||
self.target_id = target_id
|
self.target_id = target_id
|
||||||
self.cdp_cleanup_on_close = cdp_cleanup_on_close
|
self.cdp_cleanup_on_close = cdp_cleanup_on_close
|
||||||
|
self.cdp_close_delay = cdp_close_delay
|
||||||
|
self.cache_cdp_connection = cache_cdp_connection
|
||||||
self.create_isolated_context = create_isolated_context
|
self.create_isolated_context = create_isolated_context
|
||||||
self.use_persistent_context = use_persistent_context
|
self.use_persistent_context = use_persistent_context
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
|
|||||||
@@ -139,7 +139,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
Close the browser and clean up resources.
|
Close the browser and clean up resources.
|
||||||
"""
|
"""
|
||||||
await self.browser_manager.close()
|
await self.browser_manager.close()
|
||||||
# Explicitly reset the static Playwright instance
|
# Explicitly reset the static Playwright instance (skip if using cached CDP)
|
||||||
|
if not self.browser_manager._using_cached_cdp:
|
||||||
BrowserManager._playwright_instance = None
|
BrowserManager._playwright_instance = None
|
||||||
|
|
||||||
async def kill_session(self, session_id: str):
|
async def kill_session(self, session_id: str):
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from typing import List, Optional
|
from typing import Dict, List, Optional, Tuple
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
@@ -559,6 +559,91 @@ async def clone_runtime_state(
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class _CDPConnectionCache:
    """
    Process-wide cache of Playwright subprocesses and CDP browser connections.

    Enabled through ``BrowserConfig(cache_cdp_connection=True)``: every
    BrowserManager that targets the same ``cdp_url`` shares one Playwright
    subprocess and one CDP WebSocket. Entries are reference-counted, so the
    underlying connection is torn down only when the final user releases it.
    """

    # Maps cdp_url -> (playwright, browser, ref_count).
    _cache: Dict[str, Tuple] = {}
    # Created lazily so the Lock is never bound to the wrong event loop.
    _lock: Optional[asyncio.Lock] = None
    _lock_loop: Optional[asyncio.AbstractEventLoop] = None

    @classmethod
    def _get_lock(cls) -> asyncio.Lock:
        """Return the cache lock, rebuilding it if the running loop changed."""
        current_loop = asyncio.get_running_loop()
        needs_new = cls._lock is None or cls._lock_loop is not current_loop
        if needs_new:
            cls._lock = asyncio.Lock()
            cls._lock_loop = current_loop
        return cls._lock

    @classmethod
    async def acquire(cls, cdp_url: str, use_undetected: bool = False):
        """Return a shared (playwright, browser) pair for *cdp_url*, creating one if needed."""
        async with cls._get_lock():
            entry = cls._cache.get(cdp_url)
            if entry is not None:
                playwright, browser, refs = entry
                if browser.is_connected():
                    # Healthy cached connection: bump the ref count and reuse it.
                    cls._cache[cdp_url] = (playwright, browser, refs + 1)
                    return playwright, browser
                # Dead connection: discard it and build a fresh one below.
                try:
                    await playwright.stop()
                except Exception:
                    pass
                del cls._cache[cdp_url]

            # No usable cached entry — spin up a new Playwright + CDP connection.
            if use_undetected:
                from patchright.async_api import async_playwright
            else:
                from playwright.async_api import async_playwright
            playwright = await async_playwright().start()
            browser = await playwright.chromium.connect_over_cdp(cdp_url)
            cls._cache[cdp_url] = (playwright, browser, 1)
            return playwright, browser

    @classmethod
    async def release(cls, cdp_url: str):
        """Drop one reference to *cdp_url*; tear the connection down at zero."""
        async with cls._get_lock():
            entry = cls._cache.get(cdp_url)
            if entry is None:
                return
            playwright, browser, refs = entry
            if refs > 1:
                # Other users remain — just decrement the count.
                cls._cache[cdp_url] = (playwright, browser, refs - 1)
                return
            # Last user: close browser and stop the subprocess, best-effort.
            try:
                await browser.close()
            except Exception:
                pass
            try:
                await playwright.stop()
            except Exception:
                pass
            del cls._cache[cdp_url]

    @classmethod
    async def close_all(cls):
        """Force-close all cached connections. Call on application shutdown."""
        async with cls._get_lock():
            for playwright, browser, _refs in list(cls._cache.values()):
                try:
                    await browser.close()
                except Exception:
                    pass
                try:
                    await playwright.stop()
                except Exception:
                    pass
            cls._cache.clear()
|
||||||
|
|
||||||
|
|
||||||
class BrowserManager:
|
class BrowserManager:
|
||||||
"""
|
"""
|
||||||
Manages the browser instance and context.
|
Manages the browser instance and context.
|
||||||
@@ -616,6 +701,7 @@ class BrowserManager:
|
|||||||
self.default_context = None
|
self.default_context = None
|
||||||
self.managed_browser = None
|
self.managed_browser = None
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
|
self._using_cached_cdp = False
|
||||||
|
|
||||||
# Session management
|
# Session management
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
@@ -666,6 +752,15 @@ class BrowserManager:
|
|||||||
if self.playwright is not None:
|
if self.playwright is not None:
|
||||||
await self.close()
|
await self.close()
|
||||||
|
|
||||||
|
# Use cached CDP connection if enabled and cdp_url is set
|
||||||
|
if self.config.cache_cdp_connection and self.config.cdp_url:
|
||||||
|
self._using_cached_cdp = True
|
||||||
|
self.config.use_managed_browser = True
|
||||||
|
self.playwright, self.browser = await _CDPConnectionCache.acquire(
|
||||||
|
self.config.cdp_url, self.use_undetected
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self._using_cached_cdp = False
|
||||||
if self.use_undetected:
|
if self.use_undetected:
|
||||||
from patchright.async_api import async_playwright
|
from patchright.async_api import async_playwright
|
||||||
else:
|
else:
|
||||||
@@ -676,6 +771,8 @@ class BrowserManager:
|
|||||||
|
|
||||||
if self.config.cdp_url or self.config.use_managed_browser:
|
if self.config.cdp_url or self.config.use_managed_browser:
|
||||||
self.config.use_managed_browser = True
|
self.config.use_managed_browser = True
|
||||||
|
|
||||||
|
if not self._using_cached_cdp:
|
||||||
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
|
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
|
||||||
|
|
||||||
# Add CDP endpoint verification before connecting
|
# Add CDP endpoint verification before connecting
|
||||||
@@ -683,6 +780,7 @@ class BrowserManager:
|
|||||||
raise Exception(f"CDP endpoint at {cdp_url} is not ready after startup")
|
raise Exception(f"CDP endpoint at {cdp_url} is not ready after startup")
|
||||||
|
|
||||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||||
|
|
||||||
contexts = self.browser.contexts
|
contexts = self.browser.contexts
|
||||||
|
|
||||||
# If browser_context_id is provided, we're using a pre-created context
|
# If browser_context_id is provided, we're using a pre-created context
|
||||||
@@ -1409,6 +1507,24 @@ class BrowserManager:
|
|||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close all browser resources and clean up."""
|
"""Close all browser resources and clean up."""
|
||||||
|
# Cached CDP path: only clean up this instance's sessions/contexts,
|
||||||
|
# then release the shared connection reference.
|
||||||
|
if self._using_cached_cdp:
|
||||||
|
session_ids = list(self.sessions.keys())
|
||||||
|
for session_id in session_ids:
|
||||||
|
await self.kill_session(session_id)
|
||||||
|
for ctx in self.contexts_by_config.values():
|
||||||
|
try:
|
||||||
|
await ctx.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.contexts_by_config.clear()
|
||||||
|
await _CDPConnectionCache.release(self.config.cdp_url)
|
||||||
|
self.browser = None
|
||||||
|
self.playwright = None
|
||||||
|
self._using_cached_cdp = False
|
||||||
|
return
|
||||||
|
|
||||||
if self.config.cdp_url:
|
if self.config.cdp_url:
|
||||||
# When using external CDP, we don't own the browser process.
|
# When using external CDP, we don't own the browser process.
|
||||||
# If cdp_cleanup_on_close is True, properly disconnect from the browser
|
# If cdp_cleanup_on_close is True, properly disconnect from the browser
|
||||||
@@ -1440,7 +1556,8 @@ class BrowserManager:
|
|||||||
)
|
)
|
||||||
self.browser = None
|
self.browser = None
|
||||||
# Allow time for CDP connection to fully release before another client connects
|
# Allow time for CDP connection to fully release before another client connects
|
||||||
await asyncio.sleep(1.0)
|
if self.config.cdp_close_delay > 0:
|
||||||
|
await asyncio.sleep(self.config.cdp_close_delay)
|
||||||
|
|
||||||
# Stop Playwright instance to prevent memory leaks
|
# Stop Playwright instance to prevent memory leaks
|
||||||
if self.playwright:
|
if self.playwright:
|
||||||
|
|||||||
506
tests/test_cdp_changes.py
Normal file
506
tests/test_cdp_changes.py
Normal file
@@ -0,0 +1,506 @@
|
|||||||
|
"""
|
||||||
|
Tests for CDP connection caching and configurable close delay.
|
||||||
|
|
||||||
|
Two test suites:
|
||||||
|
1. Regression — default behavior unchanged, basic CDP still works
|
||||||
|
2. Stress — race conditions, parallel crawlers, locking, cache correctness
|
||||||
|
|
||||||
|
All tests are real (no mocks). Requires a running Chrome on port 9222:
|
||||||
|
chrome --headless=new --no-sandbox --remote-debugging-port=9222
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.browser_manager import _CDPConnectionCache, BrowserManager
|
||||||
|
|
||||||
|
CDP_URL = "http://localhost:9222"
|
||||||
|
TEST_URL = "https://example.com"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _quick_crawl(browser_cfg: BrowserConfig, url: str = TEST_URL) -> bool:
    """Perform one crawl of *url* with *browser_cfg*; report whether it loaded OK."""
    quick_cfg = CrawlerRunConfig(
        wait_until="domcontentloaded",
        page_timeout=15000,
        verbose=False,
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(url=url, config=quick_cfg)
        loaded = outcome.success and outcome.status_code == 200
        return loaded
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# SUITE 1 — REGRESSION (default behaviour, no new flags)
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestRegression:
    """Verify nothing is broken when the new parameters keep their defaults."""

    @pytest.mark.asyncio
    async def test_default_cdp_crawl(self):
        """Basic CDP crawl with default settings still works."""
        config = BrowserConfig(cdp_url=CDP_URL, headless=True)
        assert await _quick_crawl(config)

    @pytest.mark.asyncio
    async def test_default_params_values(self):
        """BrowserConfig defaults are backward-compatible."""
        config = BrowserConfig()
        assert config.cdp_close_delay == 1.0
        assert config.cache_cdp_connection is False

    @pytest.mark.asyncio
    async def test_cdp_cleanup_on_close_still_works(self):
        """cdp_cleanup_on_close=True path (existing feature) still works."""
        config = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cdp_cleanup_on_close=True,
        )
        assert await _quick_crawl(config)

    @pytest.mark.asyncio
    async def test_sequential_crawls_default(self):
        """Two sequential crawls with default settings both succeed."""
        config = BrowserConfig(cdp_url=CDP_URL, headless=True)
        # Run twice back-to-back; each crawl must succeed independently.
        for _ in range(2):
            assert await _quick_crawl(config)

    @pytest.mark.asyncio
    async def test_cdp_close_delay_default_timing(self):
        """Default close delay is ~1s (not 0, not much more)."""
        config = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cdp_cleanup_on_close=True,
        )
        run_config = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )
        start = time.monotonic()
        async with AsyncWebCrawler(config=config) as crawler:
            await crawler.arun(url=TEST_URL, config=run_config)
        elapsed = time.monotonic() - start
        # The 1s sleep must be present (at least ~0.9s of close overhead)
        assert elapsed >= 0.9, f"Close was too fast ({elapsed:.2f}s), sleep may be missing"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# SUITE 2 — STRESS (cache, parallelism, race conditions, locking)
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
class TestStress:
    """Hammer the CDP cache and configurable delay under pressure."""

    # -- cdp_close_delay -------------------------------------------------------

    @pytest.mark.asyncio
    async def test_close_delay_zero_skips_sleep(self):
        """cdp_close_delay=0 should make close noticeably faster."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cdp_cleanup_on_close=True,
            cdp_close_delay=0,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )
        t0 = time.monotonic()
        async with AsyncWebCrawler(config=cfg) as crawler:
            await crawler.arun(url=TEST_URL, config=run_cfg)
        elapsed = time.monotonic() - t0
        # Without the 1s sleep the whole thing should be well under 1s of close overhead
        # NOTE(review): 5.0s bound includes page-load time — loose on purpose.
        assert elapsed < 5.0, f"Close too slow with delay=0 ({elapsed:.2f}s)"

    @pytest.mark.asyncio
    async def test_close_delay_custom_value(self):
        """A custom delay value is respected."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cdp_cleanup_on_close=True,
            cdp_close_delay=0.2,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )
        t0 = time.monotonic()
        async with AsyncWebCrawler(config=cfg) as crawler:
            await crawler.arun(url=TEST_URL, config=run_cfg)
        elapsed = time.monotonic() - t0
        # Should include at least the 0.2s delay
        assert elapsed >= 0.2

    # -- cache_cdp_connection: basic -----------------------------------------

    @pytest.mark.asyncio
    async def test_cache_basic_crawl(self):
        """A single crawl with caching enabled works."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        assert await _quick_crawl(cfg)
        # Clean up cache after test
        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_cache_sequential_reuse(self):
        """Two sequential crawlers reuse the same cached connection."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )

        # First crawl — creates the cache entry
        async with AsyncWebCrawler(config=cfg) as crawler:
            r1 = await crawler.arun(
                url=TEST_URL,
                config=CrawlerRunConfig(wait_until="domcontentloaded", page_timeout=15000, verbose=False),
            )
        assert r1.success

        # Cache should still have an entry (ref went 1 -> 0, so it closed).
        # But if we acquire again immediately it should create a fresh one.
        # The key thing: this must not crash or hang.

        async with AsyncWebCrawler(config=cfg) as crawler:
            r2 = await crawler.arun(
                url=TEST_URL,
                config=CrawlerRunConfig(wait_until="domcontentloaded", page_timeout=15000, verbose=False),
            )
        assert r2.success

        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_cache_overlapping_instances(self):
        """Two crawlers alive at the same time share the cache (ref_count=2)."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )

        crawler1 = AsyncWebCrawler(config=cfg)
        await crawler1.start()

        crawler2 = AsyncWebCrawler(config=cfg)
        await crawler2.start()

        # Both should work concurrently
        r1, r2 = await asyncio.gather(
            crawler1.arun(url=TEST_URL, config=run_cfg),
            crawler2.arun(url=TEST_URL, config=run_cfg),
        )
        assert r1.success
        assert r2.success

        # Close one — connection must survive for the other
        await crawler1.close()

        # Second crawler should still work after first closed
        r3 = await crawler2.arun(url=TEST_URL, config=run_cfg)
        assert r3.success

        await crawler2.close()
        await _CDPConnectionCache.close_all()

    # -- cache: ref counting -------------------------------------------------

    @pytest.mark.asyncio
    async def test_cache_ref_count_lifecycle(self):
        """Verify ref count goes up and comes back down correctly."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )

        c1 = AsyncWebCrawler(config=cfg)
        await c1.start()
        # Cache should have ref_count=1
        assert CDP_URL in _CDPConnectionCache._cache
        _, _, count = _CDPConnectionCache._cache[CDP_URL]
        assert count == 1

        c2 = AsyncWebCrawler(config=cfg)
        await c2.start()
        _, _, count = _CDPConnectionCache._cache[CDP_URL]
        assert count == 2

        await c1.close()
        _, _, count = _CDPConnectionCache._cache[CDP_URL]
        assert count == 1

        await c2.close()
        # Last reference released — entry should be gone
        assert CDP_URL not in _CDPConnectionCache._cache

    # -- cache: speed benefit ------------------------------------------------

    @pytest.mark.asyncio
    async def test_cache_faster_than_uncached(self):
        """Cached sequential crawls should be faster than uncached."""
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )

        # Uncached: two sequential crawls (each does full playwright start/stop)
        cfg_no_cache = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cdp_cleanup_on_close=True,
            cdp_close_delay=0.5,
        )
        t0 = time.monotonic()
        for _ in range(2):
            async with AsyncWebCrawler(config=cfg_no_cache) as crawler:
                await crawler.arun(url=TEST_URL, config=run_cfg)
        uncached_time = time.monotonic() - t0

        # Cached: two sequential crawls (share playwright/CDP)
        cfg_cache = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        t0 = time.monotonic()
        for _ in range(2):
            async with AsyncWebCrawler(config=cfg_cache) as crawler:
                await crawler.arun(url=TEST_URL, config=run_cfg)
        cached_time = time.monotonic() - t0

        await _CDPConnectionCache.close_all()

        # Cached should be faster (the uncached has 2x 0.5s delay alone)
        assert cached_time < uncached_time, (
            f"Cached ({cached_time:.2f}s) was not faster than uncached ({uncached_time:.2f}s)"
        )

    # -- parallel stress -----------------------------------------------------

    @pytest.mark.asyncio
    async def test_parallel_cached_crawlers(self):
        """Launch 5 crawlers in parallel, all sharing the cache."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
            create_isolated_context=True,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=20000,
            verbose=False,
        )

        async def crawl_one(idx: int):
            # One full open-crawl-close cycle per task.
            async with AsyncWebCrawler(config=cfg) as crawler:
                result = await crawler.arun(url=TEST_URL, config=run_cfg)
                return idx, result.success

        results = await asyncio.gather(*[crawl_one(i) for i in range(5)])
        for idx, success in results:
            assert success, f"Crawler {idx} failed"

        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_parallel_mixed_cached_uncached(self):
        """Mix of cached and uncached crawlers running in parallel.

        Uses create_isolated_context=True because parallel crawlers sharing
        a single default context will cause navigation conflicts — this is
        the expected pattern for concurrent CDP access.
        """
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=20000,
            verbose=False,
        )

        async def crawl_cached():
            cfg = BrowserConfig(
                cdp_url=CDP_URL, headless=True,
                cache_cdp_connection=True, create_isolated_context=True,
            )
            async with AsyncWebCrawler(config=cfg) as crawler:
                r = await crawler.arun(url=TEST_URL, config=run_cfg)
                return "cached", r.success

        async def crawl_uncached():
            cfg = BrowserConfig(
                cdp_url=CDP_URL, headless=True,
                create_isolated_context=True,
            )
            async with AsyncWebCrawler(config=cfg) as crawler:
                r = await crawler.arun(url=TEST_URL, config=run_cfg)
                return "uncached", r.success

        tasks = [crawl_cached(), crawl_uncached(), crawl_cached(), crawl_uncached()]
        results = await asyncio.gather(*tasks)
        for label, success in results:
            assert success, f"{label} crawler failed"

        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_rapid_open_close_cache(self):
        """Rapidly open and close 10 crawlers sequentially — no leaks/hangs."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )

        for i in range(10):
            async with AsyncWebCrawler(config=cfg) as crawler:
                result = await crawler.arun(url=TEST_URL, config=run_cfg)
                assert result.success, f"Iteration {i} failed"

        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_cache_close_all_idempotent(self):
        """Calling close_all() multiple times doesn't crash."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        async with AsyncWebCrawler(config=cfg) as crawler:
            r = await crawler.arun(
                url=TEST_URL,
                config=CrawlerRunConfig(wait_until="domcontentloaded", page_timeout=15000, verbose=False),
            )
        assert r.success

        await _CDPConnectionCache.close_all()
        await _CDPConnectionCache.close_all()  # second call must not raise
        await _CDPConnectionCache.close_all()  # third call must not raise

    @pytest.mark.asyncio
    async def test_stale_connection_recovery(self):
        """If the cached browser disconnects, next acquire recovers."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )
        run_cfg = CrawlerRunConfig(
            wait_until="domcontentloaded",
            page_timeout=15000,
            verbose=False,
        )

        # Build up a cache entry
        c1 = AsyncWebCrawler(config=cfg)
        await c1.start()

        # Forcibly disconnect the cached browser to simulate staleness
        if CDP_URL in _CDPConnectionCache._cache:
            _, browser, _ = _CDPConnectionCache._cache[CDP_URL]
            try:
                await browser.close()
            except Exception:
                pass

        await c1.close()

        # Next crawler should detect stale and create a fresh connection
        async with AsyncWebCrawler(config=cfg) as crawler:
            result = await crawler.arun(url=TEST_URL, config=run_cfg)
            assert result.success

        await _CDPConnectionCache.close_all()

    @pytest.mark.asyncio
    async def test_parallel_start_race(self):
        """Multiple crawlers calling start() simultaneously — lock prevents races."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )

        crawlers = [AsyncWebCrawler(config=cfg) for _ in range(5)]

        # Start all at once — this hammers _CDPConnectionCache.acquire() concurrently
        await asyncio.gather(*[c.start() for c in crawlers])

        # All should have the same browser reference
        browsers = set()
        for c in crawlers:
            bm = c.crawler_strategy.browser_manager
            browsers.add(id(bm.browser))

        # With caching, they should all share the same browser object
        assert len(browsers) == 1, f"Expected 1 shared browser, got {len(browsers)}"

        # Ref count should be 5
        _, _, count = _CDPConnectionCache._cache[CDP_URL]
        assert count == 5

        # Close all
        await asyncio.gather(*[c.close() for c in crawlers])

        # Cache should be empty
        assert CDP_URL not in _CDPConnectionCache._cache

    @pytest.mark.asyncio
    async def test_parallel_close_race(self):
        """Multiple crawlers closing simultaneously — no double-free."""
        cfg = BrowserConfig(
            cdp_url=CDP_URL,
            headless=True,
            cache_cdp_connection=True,
        )

        crawlers = [AsyncWebCrawler(config=cfg) for _ in range(5)]
        await asyncio.gather(*[c.start() for c in crawlers])

        # Close all at once — hammers _CDPConnectionCache.release() concurrently
        await asyncio.gather(*[c.close() for c in crawlers])

        # Cache must be clean
        assert CDP_URL not in _CDPConnectionCache._cache

        # Must still work after everything is closed
        async with AsyncWebCrawler(config=cfg) as crawler:
            r = await crawler.arun(
                url=TEST_URL,
                config=CrawlerRunConfig(wait_until="domcontentloaded", page_timeout=15000, verbose=False),
            )
        assert r.success

        await _CDPConnectionCache.close_all()
|
||||||
Reference in New Issue
Block a user