Fix browser recycling under high concurrency — version-based approach
The previous recycle logic waited for all refcounts to hit 0 before recycling, which never happened under sustained concurrent load (20+ crawls always had at least one active).

New approach:
- Add _browser_version to config signature — bump it to force new contexts
- When threshold is hit: bump version, move old sigs to _pending_cleanup
- New requests get new contexts automatically (different signature)
- Old contexts drain naturally and get cleaned up when refcount hits 0
- Safety cap: max 3 pending browsers draining at once

This means recycling now works under any load pattern — no blocking, no waiting for quiet moments. Old and new browsers coexist briefly during transitions.

Includes 12 new tests covering version bumps, concurrent recycling, safety cap, and edge cases.
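The key mechanism in the diff below is that the browser version participates in the config-signature hash that keys the context cache, so a bump silently reroutes new requests to a fresh context. A minimal standalone sketch of that routing, with illustrative names (make_signature, get_context, contexts) rather than the actual crawl4ai internals:

import hashlib
import json

def make_signature(config: dict, browser_version: int) -> str:
    # Fold the version into the signature, same json+sha256 shape as the diff
    sig_dict = dict(config)
    sig_dict["_browser_version"] = browser_version
    signature_json = json.dumps(sig_dict, sort_keys=True, default=str)
    return hashlib.sha256(signature_json.encode("utf-8")).hexdigest()

contexts = {}  # sig -> context, stand-in for contexts_by_config
version = 1

def get_context(config: dict):
    sig = make_signature(config, version)
    if sig not in contexts:
        contexts[sig] = f"context-for-{sig[:8]}"  # placeholder, not a real BrowserContext
    return sig, contexts[sig]

cfg = {"magic": False, "simulate_user": True}
sig_a, _ = get_context(cfg)
version += 1  # the "recycle": nothing blocks, the old sig just stops being requested
sig_b, _ = get_context(cfg)
assert sig_a != sig_b and len(contexts) == 2  # same config, two coexisting contexts

The cache never needs to be invalidated in place; old entries simply stop receiving traffic and are reaped once their refcounts drain.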
@@ -734,12 +734,14 @@ class BrowserManager:
         # Browser endpoint key for global page tracking (set after browser starts)
         self._browser_endpoint_key: Optional[str] = None

-        # Browser recycling state
+        # Browser recycling state (version-based approach)
         self._pages_served = 0
-        self._recycling = False
-        self._recycle_lock = asyncio.Lock()
-        self._recycle_done = asyncio.Event()
-        self._recycle_done.set()  # starts "open" — not recycling
+        self._browser_version = 1  # included in signature, bump to create new browser
+        self._pending_cleanup = {}  # old_sig -> {"browser": browser, "contexts": [...], "done": Event}
+        self._pending_cleanup_lock = asyncio.Lock()
+        self._max_pending_browsers = 3  # safety cap — block if too many draining
+        self._cleanup_slot_available = asyncio.Event()
+        self._cleanup_slot_available.set()  # starts open

         # Stealth adapter for stealth mode
         self._stealth_adapter = None
@@ -1318,6 +1320,9 @@ class BrowserManager:
         sig_dict["simulate_user"] = crawlerRunConfig.simulate_user
         sig_dict["magic"] = crawlerRunConfig.magic

+        # Browser version — bumped on recycle to force new browser instance
+        sig_dict["_browser_version"] = self._browser_version
+
         signature_json = json.dumps(sig_dict, sort_keys=True, default=str)
         return hashlib.sha256(signature_json.encode("utf-8")).hexdigest()

@@ -1434,9 +1439,6 @@ class BrowserManager:
         Returns:
             (page, context): The Page and its BrowserContext
         """
-        # Block if browser is being recycled; wakes instantly when done
-        await self._recycle_done.wait()
-
         self._cleanup_expired_sessions()

         # If a session_id is provided and we already have it, reuse that page + context
@@ -1597,6 +1599,11 @@ class BrowserManager:
             self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())

         self._pages_served += 1
+
+        # Check if browser recycle threshold is hit — bump version for next requests
+        # This happens AFTER incrementing counter so concurrent requests see correct count
+        await self._maybe_bump_browser_version()
+
         return page, context

     async def kill_session(self, session_id: str):
@@ -1645,16 +1652,19 @@ class BrowserManager:
        release_page() so the context lifecycle is properly tracked.
        """
        self._release_page_from_use(page)
+       sig = None
+       refcount = -1
        async with self._contexts_lock:
            sig = self._page_to_sig.pop(page, None)
            if sig is not None and sig in self._context_refcounts:
                self._context_refcounts[sig] = max(
                    0, self._context_refcounts[sig] - 1
                )
+               refcount = self._context_refcounts[sig]

-       # Check if browser recycle is needed
-       if self._should_recycle():
-           await self._maybe_recycle_browser()
+       # Check if this signature belongs to an old browser waiting to be cleaned up
+       if sig is not None and refcount == 0:
+           await self._maybe_cleanup_old_browser(sig)

    def _should_recycle(self) -> bool:
        """Check if page threshold reached for browser recycling."""
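After this change, release_page is the only place old contexts are reaped: every release decrements the context's refcount, and a zero refcount on a signature that was parked in _pending_cleanup means no crawl still holds it. A compressed sketch of that drain, using module-level dicts as stand-ins for the lock-guarded BrowserManager attributes:

import asyncio

refcounts = {"old-sig": 2}                     # two crawls still hold the old context
pending_cleanup = {"old-sig": {"version": 1}}  # parked here by the version bump

async def release(sig: str) -> None:
    # Mirror of the release path: decrement, then reap if drained and pending
    refcounts[sig] = max(0, refcounts[sig] - 1)
    if refcounts[sig] == 0 and sig in pending_cleanup:
        info = pending_cleanup.pop(sig)
        print(f"closing {sig} from browser version {info['version']}")

async def main() -> None:
    await release("old-sig")  # refcount 2 -> 1, context stays alive
    await release("old-sig")  # refcount 1 -> 0, cleanup fires

asyncio.run(main())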
@@ -1663,77 +1673,113 @@ class BrowserManager:
             return False
         return self._pages_served >= limit

-    async def _maybe_recycle_browser(self):
-        """Recycle browser if no active crawls are in-flight.
+    async def _maybe_bump_browser_version(self):
+        """Bump browser version if threshold reached, moving old browser to pending cleanup.

-        Uses asyncio.Event to block new get_page() callers during recycle,
-        and sets _recycling inside _contexts_lock to prevent race conditions.
+        New requests automatically get a new browser (via new signature).
+        Old browser drains naturally and gets cleaned up when refcount hits 0.
         """
-        if self._recycling:
+        if not self._should_recycle():
             return

-        async with self._recycle_lock:
-            if self._recycling:
-                return
+        # Safety cap: wait if too many old browsers are draining
+        while True:
+            async with self._pending_cleanup_lock:
+                # Re-check threshold under lock (another request may have bumped already)
+                if not self._should_recycle():
+                    return

-            # Set _recycling and check refcounts under the SAME lock
-            # to prevent a new crawl slipping in between check and flag set
+                # Check safety cap
+                if len(self._pending_cleanup) >= self._max_pending_browsers:
+                    if self.logger:
+                        self.logger.debug(
+                            message="Waiting for old browser to drain (pending: {count})",
+                            tag="BROWSER",
+                            params={"count": len(self._pending_cleanup)},
+                        )
+                    self._cleanup_slot_available.clear()
+                    # Release lock and wait
+                else:
+                    # We have a slot — do the bump inside this lock hold
+                    old_version = self._browser_version
+                    old_sigs = []
+                    async with self._contexts_lock:
+                        for sig in list(self._context_refcounts.keys()):
+                            old_sigs.append(sig)
+
+                    if self.logger:
+                        self.logger.info(
+                            message="Bumping browser version {old} -> {new} after {count} pages",
+                            tag="BROWSER",
+                            params={
+                                "old": old_version,
+                                "new": old_version + 1,
+                                "count": self._pages_served,
+                            },
+                        )
+
+                    # Mark old signatures for cleanup when their refcount hits 0
+                    done_event = asyncio.Event()
+                    for sig in old_sigs:
+                        self._pending_cleanup[sig] = {
+                            "version": old_version,
+                            "done": done_event,
+                        }
+
+                    # Bump version — new get_page() calls will create new contexts
+                    self._browser_version += 1
+                    self._pages_served = 0
+                    return  # Done!
+
+            # If we get here, we need to wait for a cleanup slot
+            await self._cleanup_slot_available.wait()
+
+    async def _maybe_cleanup_old_browser(self, sig: str):
+        """Clean up an old browser's context if its refcount hit 0 and it's pending cleanup."""
+        async with self._pending_cleanup_lock:
+            if sig not in self._pending_cleanup:
+                return  # Not an old browser signature
+
+            cleanup_info = self._pending_cleanup.pop(sig)
+            old_version = cleanup_info["version"]
+
+            if self.logger:
+                self.logger.debug(
+                    message="Cleaning up context from browser version {version} (sig: {sig})",
+                    tag="BROWSER",
+                    params={"version": old_version, "sig": sig[:12]},
+                )
+
+            # Remove context from tracking
             async with self._contexts_lock:
-                total_active = sum(self._context_refcounts.values())
-                if total_active > 0:
-                    return  # active crawls running, next release will re-check
-                self._recycling = True
-                self._recycle_done.clear()  # block new get_page() callers
+                context = self.contexts_by_config.pop(sig, None)
+                self._context_refcounts.pop(sig, None)
+                self._context_last_used.pop(sig, None)

-            try:
+            # Close context outside locks
+            if context is not None:
+                try:
+                    await context.close()
+                except Exception:
+                    pass
+
+            # Check if any signatures from this old version remain
+            remaining_old = [
+                s for s, info in self._pending_cleanup.items()
+                if info["version"] == old_version
+            ]
+
+            if not remaining_old:
                 if self.logger:
                     self.logger.info(
-                        message="Recycling browser after {count} pages to reclaim memory",
+                        message="All contexts from browser version {version} cleaned up",
                         tag="BROWSER",
-                        params={"count": self._pages_served},
+                        params={"version": old_version},
                     )

-                # Force full cleanup to kill the browser process and reclaim
-                # memory. For external CDP (cdp_url without cache), temporarily
-                # enable cdp_cleanup_on_close so close() actually disconnects.
-                # For cached CDP, close() already handles release correctly.
-                saved_cdp_cleanup = self.config.cdp_cleanup_on_close
-                if self.config.cdp_url and not self._using_cached_cdp:
-                    self.config.cdp_cleanup_on_close = True
-                try:
-                    await self.close()
-                finally:
-                    self.config.cdp_cleanup_on_close = saved_cdp_cleanup
-
-                # close() already clears most tracking dicts, but ensure
-                # everything is reset for the fresh browser
-                self.contexts_by_config.clear()
-                self._context_refcounts.clear()
-                self._context_last_used.clear()
-                self._page_to_sig.clear()
-                self.sessions.clear()
-                # Clear global page tracking for this endpoint — old pages are dead
-                if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use:
-                    BrowserManager._global_pages_in_use[self._browser_endpoint_key].clear()
-
-                # Re-create ManagedBrowser if needed — close() sets it to None,
-                # but start() expects it to exist for the managed browser path.
-                if self.config.use_managed_browser and self.managed_browser is None:
-                    self.managed_browser = ManagedBrowser(
-                        browser_type=self.config.browser_type,
-                        user_data_dir=self.config.user_data_dir,
-                        headless=self.config.headless,
-                        logger=self.logger,
-                        debugging_port=self.config.debugging_port,
-                        cdp_url=self.config.cdp_url,
-                        browser_config=self.config,
-                    )
-
-                await self.start()
-                self._pages_served = 0
-            finally:
-                self._recycling = False
-                self._recycle_done.set()  # wake ALL waiting get_page() callers
+            # Open a cleanup slot if we're below the cap
+            if len(self._pending_cleanup) < self._max_pending_browsers:
+                self._cleanup_slot_available.set()

     def _cleanup_expired_sessions(self):
         """Clean up expired sessions based on TTL."""
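The safety cap above relies on a clear-then-wait handshake: a bumper that finds all slots full clears _cleanup_slot_available under the lock and waits outside it; whoever finishes draining sets the event, waking the bumper to retry. The sketch below isolates that pattern with hypothetical names (bump, drain_one) and omits the lock the real code holds while checking the cap:

import asyncio

MAX_PENDING = 3
pending = {f"sig{i}": object() for i in range(MAX_PENDING)}  # cap already reached
slot_available = asyncio.Event()

async def bump() -> None:
    while True:
        if len(pending) >= MAX_PENDING:
            slot_available.clear()  # close the gate before waiting
        else:
            print("slot free -> version bumped")
            return
        await slot_available.wait()  # woken when a drain frees a slot

async def drain_one() -> None:
    await asyncio.sleep(0.05)  # stand-in for an old context finishing its last crawl
    pending.popitem()
    if len(pending) < MAX_PENDING:
        slot_available.set()  # reopen the gate, waking the bumper

async def main() -> None:
    await asyncio.gather(bump(), drain_one())

asyncio.run(main())

If a drain completes between the clear() and the wait(), the event is already set again when the bumper reaches wait(), so the wakeup is not lost and the bumper simply re-checks the cap.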
tests/async/test_browser_recycle_v2.py (new file, 386 lines)
@@ -0,0 +1,386 @@
"""
Tests for version-based browser recycling.

The new recycle approach:
1. When pages_served hits threshold, bump _browser_version
2. Old signatures go to _pending_cleanup
3. New requests get new contexts (different version = different signature)
4. When old context's refcount hits 0, it gets cleaned up
5. No blocking — old and new browsers coexist during transition

These tests use small thresholds (3-5 pages) to verify the mechanics.
"""

import asyncio
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler

import pytest

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode


# ---------------------------------------------------------------------------
# Local test server
# ---------------------------------------------------------------------------

PAGES = {}
for i in range(100):
    PAGES[f"/page{i}"] = (
        f"<!DOCTYPE html><html><head><title>Page {i}</title></head>"
        f"<body><h1>Page {i}</h1><p>Content for page {i}.</p></body></html>"
    ).encode()


class Handler(SimpleHTTPRequestHandler):
    def log_message(self, *a):
        pass

    def do_GET(self):
        body = PAGES.get(self.path, PAGES["/page0"])
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(body)


class _Server(HTTPServer):
    allow_reuse_address = True


@pytest.fixture(scope="module")
def srv():
    s = _Server(("127.0.0.1", 0), Handler)
    port = s.server_address[1]
    t = threading.Thread(target=s.serve_forever, daemon=True)
    t.start()
    yield f"http://127.0.0.1:{port}"
    s.shutdown()


def _u(base, i):
    return f"{base}/page{i}"


def _bm(c):
    return c.crawler_strategy.browser_manager


# ===================================================================
# SECTION A — Version bump mechanics
# ===================================================================

@pytest.mark.asyncio
async def test_version_bump_on_threshold(srv):
    """Browser version should bump when threshold is reached."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        assert bm._browser_version == 1

        # Crawl 2 pages — no bump yet
        for i in range(2):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success

        assert bm._browser_version == 1, "Version should still be 1 after 2 pages"
        assert bm._pages_served == 2

        # 3rd page hits threshold (3) and triggers bump AFTER the page is served
        r = await c.arun(url=_u(srv, 2), config=run)
        assert r.success
        assert bm._browser_version == 2, "Version should bump after 3rd page"
        assert bm._pages_served == 0, "Counter resets after bump"

        # 4th page is first page of version 2
        r = await c.arun(url=_u(srv, 3), config=run)
        assert r.success
        assert bm._pages_served == 1


@pytest.mark.asyncio
async def test_signature_changes_after_version_bump(srv):
    """Same CrawlerRunConfig should produce different signatures after version bump."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        # Get signature before bump
        sig_v1 = bm._make_config_signature(run)

        # Crawl 2 pages
        for i in range(2):
            await c.arun(url=_u(srv, i), config=run)

        # Bump already happened during the 2nd request (threshold=2); this crawl runs on the new version
        await c.arun(url=_u(srv, 2), config=run)

        # Signature should be different now
        sig_v2 = bm._make_config_signature(run)
        assert sig_v1 != sig_v2, "Signature should change after version bump"


@pytest.mark.asyncio
async def test_no_version_bump_when_disabled(srv):
    """Version should stay at 1 when max_pages_before_recycle=0."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=0,  # Disabled
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        for i in range(20):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success

        assert bm._browser_version == 1, "Version should not bump when disabled"
        assert bm._pages_served == 20


# ===================================================================
# SECTION B — Pending cleanup mechanics
# ===================================================================

@pytest.mark.asyncio
async def test_old_signature_goes_to_pending_cleanup(srv):
    """Version bump works and old contexts get cleaned up."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        # Crawl 2 pages — creates signature for version 1, bumps on 2nd
        for i in range(2):
            await c.arun(url=_u(srv, i), config=run)

        # After 2 pages with threshold=2, version should have bumped
        assert bm._browser_version == 2

        # Since sequential crawls release pages immediately (refcount=0),
        # old contexts get cleaned up right away. Pending cleanup should be empty.
        # This is correct behavior — cleanup is eager when possible.
        assert len(bm._pending_cleanup) == 0


@pytest.mark.asyncio
async def test_cleanup_happens_when_refcount_hits_zero(srv):
    """Old context should be closed when its refcount drops to 0."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        # Sequential crawls: each page is released before next request
        # So refcount is always 0 between requests, and cleanup happens immediately
        for i in range(10):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success

        # Should have bumped three times (at pages 3, 6, and 9) with version now at 4
        # But since refcount=0 immediately, pending_cleanup should be empty
        assert len(bm._pending_cleanup) == 0, "All old contexts should be cleaned up"


# ===================================================================
# SECTION C — Concurrent crawls with recycling
# ===================================================================

@pytest.mark.asyncio
async def test_concurrent_crawls_dont_block_on_recycle(srv):
    """Concurrent crawls should not block — old browser drains while new one serves."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=5,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        # Launch 20 concurrent crawls
        tasks = [c.arun(url=_u(srv, i), config=run) for i in range(20)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # All should succeed — no blocking, no errors
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"

        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 20, f"Only {len(successes)} succeeded"

        # Version should have bumped multiple times
        assert bm._browser_version >= 2, "Should have recycled at least once"


@pytest.mark.asyncio
async def test_high_concurrency_with_small_threshold(srv):
    """Stress test: 50 concurrent crawls with threshold=3."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        # 50 concurrent crawls with threshold of 3 — many version bumps
        tasks = [c.arun(url=_u(srv, i % 100), config=run) for i in range(50)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"

        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 50


# ===================================================================
# SECTION D — Safety cap (max pending browsers)
# ===================================================================

@pytest.mark.asyncio
async def test_safety_cap_limits_pending_browsers(srv):
    """Should not exceed _max_pending_browsers old browsers draining."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        bm._max_pending_browsers = 2  # Lower cap for testing

        # Run enough crawls to potentially exceed the cap
        for i in range(15):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success

        # Pending cleanup should never have exceeded the cap
        # (We can't directly test this during execution, but if it works without
        # deadlock/timeout, the cap logic is functioning)
        assert len(bm._pending_cleanup) <= bm._max_pending_browsers


# ===================================================================
# SECTION E — Managed browser mode
# ===================================================================

@pytest.mark.asyncio
async def test_managed_browser_recycle(srv):
    """Recycling should work with managed browser mode."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        for i in range(10):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success, f"Page {i} failed"

        # Version should have bumped
        assert bm._browser_version >= 2


@pytest.mark.asyncio
async def test_managed_browser_isolated_context_recycle(srv):
    """Recycling with managed browser + isolated contexts."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        create_isolated_context=True,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        for i in range(10):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success, f"Page {i} failed"

        assert bm._browser_version >= 2


# ===================================================================
# SECTION F — Edge cases
# ===================================================================

@pytest.mark.asyncio
async def test_threshold_of_one(srv):
    """Edge case: threshold=1 means version bump after every page."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=1,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        for i in range(5):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success

        # With threshold=1, each page triggers a bump after being served:
        # Page 0: served, counter=1 >= 1, bump -> version=2, counter=0
        # Page 1: served, counter=1 >= 1, bump -> version=3, counter=0
        # ... etc.
        # After 5 pages, should have bumped 5 times
        assert bm._browser_version == 6  # Started at 1, bumped 5 times


@pytest.mark.asyncio
async def test_different_configs_get_separate_cleanup_tracking(srv):
    """Different CrawlerRunConfigs should track separately in pending cleanup."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )

    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)

        run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        run_b = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, verbose=False,
            override_navigator=True,  # Different config
        )

        # Alternate between configs
        for i in range(6):
            cfg_to_use = run_a if i % 2 == 0 else run_b
            r = await c.arun(url=_u(srv, i), config=cfg_to_use)
            assert r.success

        # Both configs should work fine
        assert bm._browser_version >= 2