Fix browser recycling deadlock under sustained concurrent load (#1640)

Three bugs in the version-based browser recycling caused requests to
hang after ~80-130 pages under concurrent load:

1. Race condition: _maybe_bump_browser_version() added ALL context
   signatures to _pending_cleanup, including those with refcount 0.
   Since no future release would trigger cleanup for idle sigs, they
   stayed in _pending_cleanup permanently. Fix: split sigs into
   active (refcount > 0, go to pending) and idle (refcount == 0,
   cleaned up immediately).

2. Finally block fragility: the first line of _crawl_web's finally
   block accessed page.context.browser.contexts, which throws if the
   browser crashed. This prevented release_page_with_context() from
   ever being called, permanently leaking the refcount. Fix: call
   release_page_with_context() first in its own try/except, then do
   best-effort cleanup of listeners and page.

3. Safety cap deadlock: when _pending_cleanup accumulated >= 3 stuck
   entries, _maybe_bump_browser_version() blocked get_page() forever
   with no timeout. Fix: 30-second timeout on the wait, after which
   stuck entries (refcount 0) are force-cleaned.

Includes regression test covering all three bugs plus multi-config
concurrent crawl scenarios.
This commit is contained in:
unclecode
2026-02-19 06:27:25 +00:00
parent 13048a106b
commit 2060c7e965
3 changed files with 520 additions and 34 deletions

View File

@@ -1135,34 +1135,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
raise e
finally:
# If no session_id is given we should close the page
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)
if config.session_id:
# Session keeps exclusive access to the page - don't release
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
# Keep the page open but release it for reuse by next crawl
await self.browser_manager.release_page_with_context(page)
else:
# Detach listeners before closing to prevent potential errors during close
if config.capture_network_requests:
page.remove_listener("request", handle_request_capture)
page.remove_listener("response", handle_response_capture)
page.remove_listener("requestfailed", handle_request_failed_capture)
if config.capture_console_messages:
# Retrieve any final console messages for undetected browsers
if hasattr(self.adapter, 'retrieve_console_messages'):
final_messages = await self.adapter.retrieve_console_messages(page)
captured_console.extend(final_messages)
if not config.session_id:
# ALWAYS decrement refcount first — must succeed even if
# the browser crashed or the page is in a bad state.
try:
await self.browser_manager.release_page_with_context(page)
except Exception:
pass
# Clean up console capture
await self.adapter.cleanup_console_capture(page, handle_console, handle_error)
# Best-effort cleanup: detach listeners and close the page
try:
if config.capture_network_requests:
page.remove_listener("request", handle_request_capture)
page.remove_listener("response", handle_response_capture)
page.remove_listener("requestfailed", handle_request_failed_capture)
if config.capture_console_messages:
if hasattr(self.adapter, 'retrieve_console_messages'):
final_messages = await self.adapter.retrieve_console_messages(page)
captured_console.extend(final_messages)
await self.adapter.cleanup_console_capture(page, handle_console, handle_error)
# Release page and decrement context refcount before closing
await self.browser_manager.release_page_with_context(page)
# Close the page
await page.close()
# Close the page unless it's the last one in a headless/managed browser
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)
if not (total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless)):
await page.close()
except Exception:
pass
# async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None):

View File

@@ -1792,25 +1792,34 @@ class BrowserManager:
else:
# We have a slot — do the bump inside this lock hold
old_version = self._browser_version
old_sigs = []
active_sigs = []
idle_sigs = []
async with self._contexts_lock:
for sig in list(self._context_refcounts.keys()):
old_sigs.append(sig)
if self._context_refcounts.get(sig, 0) > 0:
active_sigs.append(sig)
else:
idle_sigs.append(sig)
if self.logger:
self.logger.info(
message="Bumping browser version {old} -> {new} after {count} pages",
message="Bumping browser version {old} -> {new} after {count} pages ({active} active, {idle} idle sigs)",
tag="BROWSER",
params={
"old": old_version,
"new": old_version + 1,
"count": self._pages_served,
"active": len(active_sigs),
"idle": len(idle_sigs),
},
)
# Mark old signatures for cleanup when their refcount hits 0
# Only add sigs with active crawls to pending cleanup.
# Sigs with refcount 0 are cleaned up immediately below
# to avoid them being stuck in _pending_cleanup forever
# (no future release would trigger their cleanup).
done_event = asyncio.Event()
for sig in old_sigs:
for sig in active_sigs:
self._pending_cleanup[sig] = {
"version": old_version,
"done": done_event,
@@ -1819,10 +1828,64 @@ class BrowserManager:
# Bump version — new get_page() calls will create new contexts
self._browser_version += 1
self._pages_served = 0
return # Done!
# If we get here, we need to wait for a cleanup slot
await self._cleanup_slot_available.wait()
# Clean up idle sigs immediately (outside pending_cleanup_lock below)
break # exit while loop to do cleanup outside locks
# Safety cap path: wait for a cleanup slot, then retry.
# Timeout prevents permanent deadlock if stuck entries never drain.
try:
await asyncio.wait_for(
self._cleanup_slot_available.wait(), timeout=30.0
)
except asyncio.TimeoutError:
# Force-clean any pending entries that have refcount 0
# (they're stuck and will never drain naturally)
async with self._pending_cleanup_lock:
stuck_sigs = [
s for s in list(self._pending_cleanup.keys())
if self._context_refcounts.get(s, 0) == 0
]
for sig in stuck_sigs:
self._pending_cleanup.pop(sig, None)
if stuck_sigs:
if self.logger:
self.logger.warning(
message="Force-cleaned {count} stuck pending entries after timeout",
tag="BROWSER",
params={"count": len(stuck_sigs)},
)
# Clean up the stuck contexts
for sig in stuck_sigs:
async with self._contexts_lock:
context = self.contexts_by_config.pop(sig, None)
self._context_refcounts.pop(sig, None)
self._context_last_used.pop(sig, None)
if context is not None:
try:
await context.close()
except Exception:
pass
if len(self._pending_cleanup) < self._max_pending_browsers:
self._cleanup_slot_available.set()
# Reached via break — clean up idle sigs immediately (outside locks)
for sig in idle_sigs:
async with self._contexts_lock:
context = self.contexts_by_config.pop(sig, None)
self._context_refcounts.pop(sig, None)
self._context_last_used.pop(sig, None)
if context is not None:
try:
await context.close()
except Exception:
pass
if idle_sigs and self.logger:
self.logger.debug(
message="Immediately cleaned up {count} idle contexts from version {version}",
tag="BROWSER",
params={"count": len(idle_sigs), "version": old_version},
)
async def _maybe_cleanup_old_browser(self, sig: str):
"""Clean up an old browser's context if its refcount hit 0 and it's pending cleanup."""

424
test_repro_1640.py Normal file
View File

@@ -0,0 +1,424 @@
"""
Regression tests for PR #1640 — memory leak / hang under high concurrency
with max_pages_before_recycle enabled.
Tests three bugs that were fixed:
Bug 1: Race condition — release_page_with_context() runs BEFORE
_maybe_bump_browser_version() adds the sig to _pending_cleanup.
FIX: Don't add refcount-0 sigs to pending; clean them up immediately.
Bug 2: The finally block in _crawl_web can fail before calling
release_page_with_context(), leaking the refcount permanently.
FIX: Call release_page_with_context() FIRST in the finally block.
Bug 3: Accumulated pending_cleanup entries hit _max_pending_browsers cap,
blocking ALL get_page() calls → system-wide deadlock.
FIX: 30s timeout on safety cap wait + force-clean stuck entries.
Exit code 0 = all tests pass. Exit code 1 = regression found.
"""
import asyncio
import sys
import os
import time
sys.path.insert(0, os.path.dirname(__file__))
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.browser_manager import BrowserManager
PASS = 0
FAIL = 0
def check(name, condition):
global PASS, FAIL
if condition:
PASS += 1
print(f" PASS: {name}")
else:
FAIL += 1
print(f" FAIL: {name}")
async def test_bug1_multi_config_race():
"""
Bug 1 fix: idle sigs (refcount=0) must NOT be added to _pending_cleanup.
They should be cleaned up immediately during the version bump.
"""
print("\n" + "="*70)
print("TEST: Bug 1 — idle sig must not get stuck in _pending_cleanup")
print("="*70)
config = BrowserConfig(
headless=True,
extra_args=['--no-sandbox', '--disable-gpu'],
max_pages_before_recycle=3,
)
bm = BrowserManager(config)
await bm.start()
try:
config_a = CrawlerRunConfig(magic=True, cache_mode="bypass")
config_b = CrawlerRunConfig(magic=False, cache_mode="bypass")
# Use config A, then release → refcount 0
page_a, _ = await bm.get_page(config_a)
sig_a = bm._page_to_sig.get(page_a)
await bm.release_page_with_context(page_a)
await page_a.close()
print(f" sig_a refcount after release: {bm._context_refcounts.get(sig_a)}")
# Use config B twice → pages_served hits threshold → version bump
page_b1, _ = await bm.get_page(config_b)
page_b2, _ = await bm.get_page(config_b)
sig_b = bm._page_to_sig.get(page_b1)
# At this point the version should have bumped (3 pages served >= threshold 3)
print(f" _browser_version: {bm._browser_version}")
print(f" _pending_cleanup sigs: {list(bm._pending_cleanup.keys())}")
# sig_a (refcount=0) must NOT be in _pending_cleanup
check("sig_a NOT in _pending_cleanup",
sig_a not in bm._pending_cleanup)
# sig_a should have been cleaned up from _context_refcounts
check("sig_a cleaned from _context_refcounts",
sig_a not in bm._context_refcounts)
# sig_b (refcount>0) SHOULD be in _pending_cleanup (it will drain naturally)
check("sig_b IS in _pending_cleanup (active, will drain)",
sig_b in bm._pending_cleanup)
# Release B pages → sig_b drains → cleaned up
await bm.release_page_with_context(page_b1)
await page_b1.close()
await bm.release_page_with_context(page_b2)
await page_b2.close()
check("sig_b cleaned after release",
sig_b not in bm._pending_cleanup)
check("_pending_cleanup is empty",
len(bm._pending_cleanup) == 0)
finally:
await bm.close()
async def test_bug2_release_always_called():
"""
Bug 2 fix: release_page_with_context() must be called even when
the browser is in a bad state.
The fix moves release_page_with_context() to the FIRST line of
the finally block in _crawl_web, wrapped in try/except.
Here we verify that release_page_with_context itself works even
after browser crash, and that the fixed finally block pattern
always decrements the refcount.
"""
print("\n" + "="*70)
print("TEST: Bug 2 — release_page_with_context must work after browser crash")
print("="*70)
config = BrowserConfig(
headless=True,
extra_args=['--no-sandbox', '--disable-gpu'],
max_pages_before_recycle=5,
)
bm = BrowserManager(config)
await bm.start()
try:
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
page, ctx = await bm.get_page(crawl_config)
sig = bm._page_to_sig.get(page)
print(f" sig refcount before crash: {bm._context_refcounts.get(sig)}")
check("refcount is 1 before crash",
bm._context_refcounts.get(sig) == 1)
# Simulate browser crash
if bm.browser:
await bm.browser.close()
bm.browser = None
# The FIX: call release_page_with_context even after crash
# (simulating what the fixed finally block does)
try:
await bm.release_page_with_context(page)
except Exception:
pass
refcount_after = bm._context_refcounts.get(sig, 0)
print(f" sig refcount after crash + release: {refcount_after}")
check("refcount decremented to 0 after crash + release",
refcount_after == 0)
check("page removed from _page_to_sig",
page not in bm._page_to_sig)
finally:
bm.browser = None
bm.contexts_by_config.clear()
bm._context_refcounts.clear()
bm._context_last_used.clear()
bm._page_to_sig.clear()
if bm.playwright:
await bm.playwright.stop()
async def test_bug3_safety_cap_timeout():
"""
Bug 3 fix: the safety cap wait must have a timeout.
When stuck entries accumulate, the timeout fires and force-cleans
entries with refcount 0, preventing permanent deadlock.
"""
print("\n" + "="*70)
print("TEST: Bug 3 — safety cap wait must not block forever")
print("="*70)
config = BrowserConfig(
headless=True,
extra_args=['--no-sandbox', '--disable-gpu'],
max_pages_before_recycle=2,
)
bm = BrowserManager(config)
await bm.start()
try:
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
# Inject stuck entries WITH refcount 0 (simulating leaked refcounts
# that were later force-decremented or never properly tracked)
print(f" Safety cap: {bm._max_pending_browsers}")
for i in range(bm._max_pending_browsers):
fake_sig = f"stuck_sig_{i}"
bm._pending_cleanup[fake_sig] = {"version": i, "done": asyncio.Event()}
# refcount 0 = stuck (no future release will clean these up)
bm._context_refcounts[fake_sig] = 0
print(f" Injected {len(bm._pending_cleanup)} stuck entries (refcount=0)")
bm._pages_served = bm.config.max_pages_before_recycle
# The fix: get_page should NOT block forever.
# The 30s timeout will fire, force-clean stuck entries, and proceed.
# We use a 35s test timeout to allow the 30s internal timeout to fire.
print(f" Calling get_page() — should unblock after ~30s timeout...")
start = time.monotonic()
try:
page, ctx = await asyncio.wait_for(
bm.get_page(crawl_config),
timeout=35.0
)
elapsed = time.monotonic() - start
print(f" get_page() returned after {elapsed:.1f}s")
check("get_page() did NOT deadlock (returned within 35s)", True)
check("stuck entries were force-cleaned",
len(bm._pending_cleanup) < bm._max_pending_browsers)
await bm.release_page_with_context(page)
await page.close()
except asyncio.TimeoutError:
elapsed = time.monotonic() - start
print(f" get_page() STILL blocked after {elapsed:.1f}s")
check("get_page() did NOT deadlock", False)
finally:
bm._pending_cleanup.clear()
bm._context_refcounts.clear()
await bm.close()
async def test_real_concurrent_crawl():
"""
Integration test: run many concurrent crawls with recycling
and verify no stuck entries or deadlocks.
"""
print("\n" + "="*70)
print("TEST: Real concurrent crawls with recycling")
print("="*70)
config = BrowserConfig(
headless=True,
extra_args=['--no-sandbox', '--disable-gpu'],
max_pages_before_recycle=10,
)
bm = BrowserManager(config)
await bm.start()
TOTAL = 80
CONCURRENT = 8
completed = 0
errors = 0
sem = asyncio.Semaphore(CONCURRENT)
async def do_crawl(i):
nonlocal completed, errors
async with sem:
try:
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
page, ctx = await asyncio.wait_for(
bm.get_page(crawl_config),
timeout=30.0
)
try:
await page.goto("https://example.com", timeout=15000)
except Exception:
pass
# Use the FIXED finally pattern: release first, then close
try:
await bm.release_page_with_context(page)
except Exception:
pass
try:
await page.close()
except Exception:
pass
completed += 1
if completed % 20 == 0:
print(f" [{completed}/{TOTAL}] version={bm._browser_version} "
f"pending={len(bm._pending_cleanup)} "
f"pages_served={bm._pages_served}")
except asyncio.TimeoutError:
errors += 1
print(f" [{i}] TIMEOUT in get_page()!")
except Exception as e:
errors += 1
if errors <= 3:
print(f" [{i}] Error: {e}")
start = time.monotonic()
tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)]
await asyncio.gather(*tasks)
elapsed = time.monotonic() - start
print(f"\n Results: {completed}/{TOTAL} completed, {errors} errors, {elapsed:.1f}s")
stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0]
check(f"all {TOTAL} crawls completed", completed == TOTAL)
check("no errors", errors == 0)
check("no stuck entries in _pending_cleanup", len(stuck) == 0)
check("no timeouts (no deadlock)", errors == 0)
await bm.close()
async def test_multi_config_concurrent():
"""
Integration test: concurrent crawls with DIFFERENT configs to
exercise the multi-sig path that triggered Bug 1.
"""
print("\n" + "="*70)
print("TEST: Multi-config concurrent crawls")
print("="*70)
config = BrowserConfig(
headless=True,
extra_args=['--no-sandbox', '--disable-gpu'],
max_pages_before_recycle=5,
)
bm = BrowserManager(config)
await bm.start()
TOTAL = 40
CONCURRENT = 6
completed = 0
errors = 0
sem = asyncio.Semaphore(CONCURRENT)
configs = [
CrawlerRunConfig(magic=True, cache_mode="bypass"),
CrawlerRunConfig(magic=False, cache_mode="bypass"),
CrawlerRunConfig(magic=True, simulate_user=True, cache_mode="bypass"),
]
async def do_crawl(i):
nonlocal completed, errors
async with sem:
try:
crawl_config = configs[i % len(configs)]
page, ctx = await asyncio.wait_for(
bm.get_page(crawl_config),
timeout=30.0
)
try:
await page.goto("https://example.com", timeout=15000)
except Exception:
pass
try:
await bm.release_page_with_context(page)
except Exception:
pass
try:
await page.close()
except Exception:
pass
completed += 1
except asyncio.TimeoutError:
errors += 1
print(f" [{i}] TIMEOUT!")
print(f" pending={len(bm._pending_cleanup)}")
except Exception as e:
errors += 1
if errors <= 3:
print(f" [{i}] Error: {e}")
start = time.monotonic()
tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)]
await asyncio.gather(*tasks)
elapsed = time.monotonic() - start
stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0]
print(f"\n Results: {completed}/{TOTAL}, {errors} errors, {elapsed:.1f}s")
print(f" Final: version={bm._browser_version} pending={len(bm._pending_cleanup)} stuck={len(stuck)}")
check(f"all {TOTAL} multi-config crawls completed", completed == TOTAL)
check("no stuck entries", len(stuck) == 0)
check("no timeouts", errors == 0)
await bm.close()
async def main():
print("="*70)
print("PR #1640 Regression Tests")
print("="*70)
await test_bug2_release_always_called()
await test_bug1_multi_config_race()
await test_bug3_safety_cap_timeout()
await test_real_concurrent_crawl()
await test_multi_config_concurrent()
print("\n" + "="*70)
if FAIL == 0:
print(f"ALL {PASS} CHECKS PASSED")
else:
print(f"FAILED: {FAIL} checks failed, {PASS} passed")
print("="*70)
sys.exit(1 if FAIL > 0 else 0)
if __name__ == "__main__":
asyncio.run(main())