Fix browser recycling under high concurrency — version-based approach

The previous recycle logic waited for all refcounts to hit 0 before
recycling, which never happened under sustained concurrent load (20+
crawls always had at least one active).

New approach:
- Add _browser_version to config signature — bump it to force new contexts
- When threshold is hit: bump version, move old sigs to _pending_cleanup
- New requests get new contexts automatically (different signature)
- Old contexts drain naturally and get cleaned up when refcount hits 0
- Safety cap: max 3 pending browsers draining at once

This means recycling now works under any load pattern — no blocking,
no waiting for quiet moments. Old and new browsers coexist briefly
during transitions.

Includes 12 new tests covering version bumps, concurrent recycling,
safety cap, and edge cases.
This commit is contained in:
unclecode
2026-02-05 07:48:12 +00:00
parent c046918bb4
commit 3401dd1620
2 changed files with 502 additions and 70 deletions

View File

@@ -734,12 +734,14 @@ class BrowserManager:
# Browser endpoint key for global page tracking (set after browser starts) # Browser endpoint key for global page tracking (set after browser starts)
self._browser_endpoint_key: Optional[str] = None self._browser_endpoint_key: Optional[str] = None
# Browser recycling state # Browser recycling state (version-based approach)
self._pages_served = 0 self._pages_served = 0
self._recycling = False self._browser_version = 1 # included in signature, bump to create new browser
self._recycle_lock = asyncio.Lock() self._pending_cleanup = {} # old_sig -> {"browser": browser, "contexts": [...], "done": Event}
self._recycle_done = asyncio.Event() self._pending_cleanup_lock = asyncio.Lock()
self._recycle_done.set() # starts "open" — not recycling self._max_pending_browsers = 3 # safety cap — block if too many draining
self._cleanup_slot_available = asyncio.Event()
self._cleanup_slot_available.set() # starts open
# Stealth adapter for stealth mode # Stealth adapter for stealth mode
self._stealth_adapter = None self._stealth_adapter = None
@@ -1318,6 +1320,9 @@ class BrowserManager:
sig_dict["simulate_user"] = crawlerRunConfig.simulate_user sig_dict["simulate_user"] = crawlerRunConfig.simulate_user
sig_dict["magic"] = crawlerRunConfig.magic sig_dict["magic"] = crawlerRunConfig.magic
# Browser version — bumped on recycle to force new browser instance
sig_dict["_browser_version"] = self._browser_version
signature_json = json.dumps(sig_dict, sort_keys=True, default=str) signature_json = json.dumps(sig_dict, sort_keys=True, default=str)
return hashlib.sha256(signature_json.encode("utf-8")).hexdigest() return hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
@@ -1434,9 +1439,6 @@ class BrowserManager:
Returns: Returns:
(page, context): The Page and its BrowserContext (page, context): The Page and its BrowserContext
""" """
# Block if browser is being recycled; wakes instantly when done
await self._recycle_done.wait()
self._cleanup_expired_sessions() self._cleanup_expired_sessions()
# If a session_id is provided and we already have it, reuse that page + context # If a session_id is provided and we already have it, reuse that page + context
@@ -1597,6 +1599,11 @@ class BrowserManager:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
self._pages_served += 1 self._pages_served += 1
# Check if browser recycle threshold is hit — bump version for next requests
# This happens AFTER incrementing counter so concurrent requests see correct count
await self._maybe_bump_browser_version()
return page, context return page, context
async def kill_session(self, session_id: str): async def kill_session(self, session_id: str):
@@ -1645,16 +1652,19 @@ class BrowserManager:
release_page() so the context lifecycle is properly tracked. release_page() so the context lifecycle is properly tracked.
""" """
self._release_page_from_use(page) self._release_page_from_use(page)
sig = None
refcount = -1
async with self._contexts_lock: async with self._contexts_lock:
sig = self._page_to_sig.pop(page, None) sig = self._page_to_sig.pop(page, None)
if sig is not None and sig in self._context_refcounts: if sig is not None and sig in self._context_refcounts:
self._context_refcounts[sig] = max( self._context_refcounts[sig] = max(
0, self._context_refcounts[sig] - 1 0, self._context_refcounts[sig] - 1
) )
refcount = self._context_refcounts[sig]
# Check if browser recycle is needed # Check if this signature belongs to an old browser waiting to be cleaned up
if self._should_recycle(): if sig is not None and refcount == 0:
await self._maybe_recycle_browser() await self._maybe_cleanup_old_browser(sig)
def _should_recycle(self) -> bool: def _should_recycle(self) -> bool:
"""Check if page threshold reached for browser recycling.""" """Check if page threshold reached for browser recycling."""
@@ -1663,77 +1673,113 @@ class BrowserManager:
return False return False
return self._pages_served >= limit return self._pages_served >= limit
async def _maybe_recycle_browser(self): async def _maybe_bump_browser_version(self):
"""Recycle browser if no active crawls are in-flight. """Bump browser version if threshold reached, moving old browser to pending cleanup.
Uses asyncio.Event to block new get_page() callers during recycle, New requests automatically get a new browser (via new signature).
and sets _recycling inside _contexts_lock to prevent race conditions. Old browser drains naturally and gets cleaned up when refcount hits 0.
""" """
if self._recycling: if not self._should_recycle():
return return
async with self._recycle_lock: # Safety cap: wait if too many old browsers are draining
if self._recycling: while True:
return async with self._pending_cleanup_lock:
# Re-check threshold under lock (another request may have bumped already)
if not self._should_recycle():
return
# Set _recycling and check refcounts under the SAME lock # Check safety cap
# to prevent a new crawl slipping in between check and flag set if len(self._pending_cleanup) >= self._max_pending_browsers:
if self.logger:
self.logger.debug(
message="Waiting for old browser to drain (pending: {count})",
tag="BROWSER",
params={"count": len(self._pending_cleanup)},
)
self._cleanup_slot_available.clear()
# Release lock and wait
else:
# We have a slot — do the bump inside this lock hold
old_version = self._browser_version
old_sigs = []
async with self._contexts_lock:
for sig in list(self._context_refcounts.keys()):
old_sigs.append(sig)
if self.logger:
self.logger.info(
message="Bumping browser version {old} -> {new} after {count} pages",
tag="BROWSER",
params={
"old": old_version,
"new": old_version + 1,
"count": self._pages_served,
},
)
# Mark old signatures for cleanup when their refcount hits 0
done_event = asyncio.Event()
for sig in old_sigs:
self._pending_cleanup[sig] = {
"version": old_version,
"done": done_event,
}
# Bump version — new get_page() calls will create new contexts
self._browser_version += 1
self._pages_served = 0
return # Done!
# If we get here, we need to wait for a cleanup slot
await self._cleanup_slot_available.wait()
async def _maybe_cleanup_old_browser(self, sig: str):
"""Clean up an old browser's context if its refcount hit 0 and it's pending cleanup."""
async with self._pending_cleanup_lock:
if sig not in self._pending_cleanup:
return # Not an old browser signature
cleanup_info = self._pending_cleanup.pop(sig)
old_version = cleanup_info["version"]
if self.logger:
self.logger.debug(
message="Cleaning up context from browser version {version} (sig: {sig})",
tag="BROWSER",
params={"version": old_version, "sig": sig[:12]},
)
# Remove context from tracking
async with self._contexts_lock: async with self._contexts_lock:
total_active = sum(self._context_refcounts.values()) context = self.contexts_by_config.pop(sig, None)
if total_active > 0: self._context_refcounts.pop(sig, None)
return # active crawls running, next release will re-check self._context_last_used.pop(sig, None)
self._recycling = True
self._recycle_done.clear() # block new get_page() callers
try: # Close context outside locks
if context is not None:
try:
await context.close()
except Exception:
pass
# Check if any signatures from this old version remain
remaining_old = [
s for s, info in self._pending_cleanup.items()
if info["version"] == old_version
]
if not remaining_old:
if self.logger: if self.logger:
self.logger.info( self.logger.info(
message="Recycling browser after {count} pages to reclaim memory", message="All contexts from browser version {version} cleaned up",
tag="BROWSER", tag="BROWSER",
params={"count": self._pages_served}, params={"version": old_version},
) )
# Force full cleanup to kill the browser process and reclaim # Open a cleanup slot if we're below the cap
# memory. For external CDP (cdp_url without cache), temporarily if len(self._pending_cleanup) < self._max_pending_browsers:
# enable cdp_cleanup_on_close so close() actually disconnects. self._cleanup_slot_available.set()
# For cached CDP, close() already handles release correctly.
saved_cdp_cleanup = self.config.cdp_cleanup_on_close
if self.config.cdp_url and not self._using_cached_cdp:
self.config.cdp_cleanup_on_close = True
try:
await self.close()
finally:
self.config.cdp_cleanup_on_close = saved_cdp_cleanup
# close() already clears most tracking dicts, but ensure
# everything is reset for the fresh browser
self.contexts_by_config.clear()
self._context_refcounts.clear()
self._context_last_used.clear()
self._page_to_sig.clear()
self.sessions.clear()
# Clear global page tracking for this endpoint — old pages are dead
if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use:
BrowserManager._global_pages_in_use[self._browser_endpoint_key].clear()
# Re-create ManagedBrowser if needed — close() sets it to None,
# but start() expects it to exist for the managed browser path.
if self.config.use_managed_browser and self.managed_browser is None:
self.managed_browser = ManagedBrowser(
browser_type=self.config.browser_type,
user_data_dir=self.config.user_data_dir,
headless=self.config.headless,
logger=self.logger,
debugging_port=self.config.debugging_port,
cdp_url=self.config.cdp_url,
browser_config=self.config,
)
await self.start()
self._pages_served = 0
finally:
self._recycling = False
self._recycle_done.set() # wake ALL waiting get_page() callers
def _cleanup_expired_sessions(self): def _cleanup_expired_sessions(self):
"""Clean up expired sessions based on TTL.""" """Clean up expired sessions based on TTL."""

View File

@@ -0,0 +1,386 @@
"""
Tests for version-based browser recycling.
The new recycle approach:
1. When pages_served hits threshold, bump _browser_version
2. Old signatures go to _pending_cleanup
3. New requests get new contexts (different version = different signature)
4. When old context's refcount hits 0, it gets cleaned up
5. No blocking — old and new browsers coexist during transition
These tests use small thresholds (3-5 pages) to verify the mechanics.
"""
import asyncio
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# ---------------------------------------------------------------------------
# Local test server
# ---------------------------------------------------------------------------
# Pre-render 100 static HTML pages keyed by URL path ("/page0" .. "/page99").
PAGES = {
    f"/page{i}": (
        f"<!DOCTYPE html><html><head><title>Page {i}</title></head>"
        f"<body><h1>Page {i}</h1><p>Content for page {i}.</p></body></html>"
    ).encode()
    for i in range(100)
}
class Handler(SimpleHTTPRequestHandler):
    """Serves the pre-rendered PAGES; unknown paths fall back to /page0."""

    def log_message(self, *a):
        # Silence per-request logging so test output stays readable.
        pass

    def do_GET(self):
        payload = PAGES.get(self.path, PAGES["/page0"])
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(payload)
class _Server(HTTPServer):
    # Allow immediate rebinding of the port between test runs — avoids
    # "Address already in use" while a prior socket lingers in TIME_WAIT.
    allow_reuse_address = True
@pytest.fixture(scope="module")
def srv():
    """Run a local HTTP server on a random free port for the whole module.

    Yields the base URL (e.g. "http://127.0.0.1:54321") and shuts the
    server down after the last test in the module finishes.
    """
    server = _Server(("127.0.0.1", 0), Handler)
    port = server.server_address[1]
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    yield f"http://127.0.0.1:{port}"
    server.shutdown()
def _u(base, i):
return f"{base}/page{i}"
def _bm(c):
return c.crawler_strategy.browser_manager
# ===================================================================
# SECTION A — Version bump mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_version_bump_on_threshold(srv):
    """The version bump must occur exactly when _pages_served reaches the threshold."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        assert manager._browser_version == 1
        # Two pages stay below the threshold of 3 — no bump expected yet.
        for page_idx in range(2):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        assert manager._browser_version == 1, "Version should still be 1 after 2 pages"
        assert manager._pages_served == 2
        # The third page reaches the threshold; the bump happens after it is served.
        result = await crawler.arun(url=_u(srv, 2), config=run_cfg)
        assert result.success
        assert manager._browser_version == 2, "Version should bump after 3rd page"
        assert manager._pages_served == 0, "Counter resets after bump"
        # The next page is the first one counted against version 2.
        result = await crawler.arun(url=_u(srv, 3), config=run_cfg)
        assert result.success
        assert manager._pages_served == 1
@pytest.mark.asyncio
async def test_signature_changes_after_version_bump(srv):
    """The same CrawlerRunConfig must hash to a new signature once the version bumps."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Capture the signature while still on the initial version.
        sig_before = manager._make_config_signature(run_cfg)
        # Serve enough pages to hit the threshold of 2...
        for page_idx in range(2):
            await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
        # ...then one more request on the bumped version.
        await crawler.arun(url=_u(srv, 2), config=run_cfg)
        # The version component of the signature should have changed it.
        sig_after = manager._make_config_signature(run_cfg)
        assert sig_before != sig_after, "Signature should change after version bump"
@pytest.mark.asyncio
async def test_no_version_bump_when_disabled(srv):
    """With max_pages_before_recycle=0 recycling is off: version never moves."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=0,  # recycling disabled
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(20):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Neither the version nor the counter should have been reset.
        assert manager._browser_version == 1, "Version should not bump when disabled"
        assert manager._pages_served == 20
# ===================================================================
# SECTION B — Pending cleanup mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_old_signature_goes_to_pending_cleanup(srv):
    """Sequential crawls: the bump happens and old contexts are reclaimed eagerly."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Two pages on version 1 — the threshold of 2 triggers the bump.
        for page_idx in range(2):
            await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
        assert manager._browser_version == 2
        # Sequential crawls release each page before the next request starts,
        # so the old context's refcount is already 0 and cleanup runs right
        # away — nothing should linger in _pending_cleanup. Eager cleanup
        # when possible is the intended behavior.
        assert len(manager._pending_cleanup) == 0
@pytest.mark.asyncio
async def test_cleanup_happens_when_refcount_hits_zero(srv):
    """Old contexts must be closed as soon as their refcount drops to 0."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Sequential crawls mean every page is released before the next one
        # starts, so each old context is cleaned up the moment it drains.
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Multiple bumps happened along the way, but because refcounts hit 0
        # immediately, no stale entries should remain pending.
        assert len(manager._pending_cleanup) == 0, "All old contexts should be cleaned up"
# ===================================================================
# SECTION C — Concurrent crawls with recycling
# ===================================================================
@pytest.mark.asyncio
async def test_concurrent_crawls_dont_block_on_recycle(srv):
    """Recycling must not stall live traffic — old browser drains while the new one serves."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=5,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Fire 20 crawls at once so version bumps overlap active requests.
        tasks = [crawler.arun(url=_u(srv, n), config=run_cfg) for n in range(20)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Every crawl must complete without raising or failing.
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 20, f"Only {len(successes)} succeeded"
        # 20 pages with a threshold of 5 guarantees at least one bump.
        assert manager._browser_version >= 2, "Should have recycled at least once"
@pytest.mark.asyncio
async def test_high_concurrency_with_small_threshold(srv):
    """Stress: 50 concurrent crawls against a threshold of 3 (many bumps)."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # 50 simultaneous crawls; the tiny threshold forces frequent recycling.
        tasks = [crawler.arun(url=_u(srv, n % 100), config=run_cfg) for n in range(50)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 50
# ===================================================================
# SECTION D — Safety cap (max pending browsers)
# ===================================================================
@pytest.mark.asyncio
async def test_safety_cap_limits_pending_browsers(srv):
    """The number of old browsers draining must never exceed _max_pending_browsers."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        manager._max_pending_browsers = 2  # tighten the cap for the test
        # Enough sequential crawls to trigger several bumps under the low cap.
        for page_idx in range(15):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # We can't observe the cap mid-flight, but completing without a
        # deadlock or timeout shows the slot-waiting logic works; the final
        # pending count must still respect the cap.
        assert len(manager._pending_cleanup) <= manager._max_pending_browsers
# ===================================================================
# SECTION E — Managed browser mode
# ===================================================================
@pytest.mark.asyncio
async def test_managed_browser_recycle(srv):
    """Version-based recycling must also work in managed browser mode."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success, f"Page {page_idx} failed"
        # 10 pages over a threshold of 3 guarantees at least one bump.
        assert manager._browser_version >= 2
@pytest.mark.asyncio
async def test_managed_browser_isolated_context_recycle(srv):
    """Recycling must work with managed browser mode plus isolated contexts."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        create_isolated_context=True,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success, f"Page {page_idx} failed"
        # At least one bump must have happened across 10 pages.
        assert manager._browser_version >= 2
# ===================================================================
# SECTION F — Edge cases
# ===================================================================
@pytest.mark.asyncio
async def test_threshold_of_one(srv):
    """Edge case: threshold=1 bumps the version after every single page."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=1,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(5):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Each served page takes the counter to 1 >= 1, so the version is
        # bumped and the counter reset immediately after every crawl:
        # 5 pages -> 5 bumps on top of the starting version of 1.
        assert manager._browser_version == 6  # Started at 1, bumped 5 times
@pytest.mark.asyncio
async def test_different_configs_get_separate_cleanup_tracking(srv):
    """Two distinct CrawlerRunConfigs must coexist across version bumps."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        run_b = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, verbose=False,
            override_navigator=True,  # Different config
        )
        # Alternate configs so both signatures live through the bumps.
        configs = (run_a, run_b)
        for page_idx in range(6):
            result = await crawler.arun(url=_u(srv, page_idx), config=configs[page_idx % 2])
            assert result.success
        # Both configs crawled successfully and at least one bump happened.
        assert manager._browser_version >= 2