From 2060c7e96575263132fb832fb5b37ce9c4b9a3c0 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 19 Feb 2026 06:27:25 +0000 Subject: [PATCH] Fix browser recycling deadlock under sustained concurrent load (#1640) Three bugs in the version-based browser recycling caused requests to hang after ~80-130 pages under concurrent load: 1. Race condition: _maybe_bump_browser_version() added ALL context signatures to _pending_cleanup, including those with refcount 0. Since no future release would trigger cleanup for idle sigs, they stayed in _pending_cleanup permanently. Fix: split sigs into active (refcount > 0, go to pending) and idle (refcount == 0, cleaned up immediately). 2. Finally block fragility: the first line of _crawl_web's finally block accessed page.context.browser.contexts, which throws if the browser crashed. This prevented release_page_with_context() from ever being called, permanently leaking the refcount. Fix: call release_page_with_context() first in its own try/except, then do best-effort cleanup of listeners and page. 3. Safety cap deadlock: when _pending_cleanup accumulated >= 3 stuck entries, _maybe_bump_browser_version() blocked get_page() forever with no timeout. Fix: 30-second timeout on the wait, after which stuck entries (refcount 0) are force-cleaned. Includes regression test covering all three bugs plus multi-config concurrent crawl scenarios. --- crawl4ai/async_crawler_strategy.py | 51 ++-- crawl4ai/browser_manager.py | 79 +++++- test_repro_1640.py | 424 +++++++++++++++++++++++++++++ 3 files changed, 520 insertions(+), 34 deletions(-) create mode 100644 test_repro_1640.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 198a62b7..7e5299d8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1135,34 +1135,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): raise e finally: - # If no session_id is given we should close the page - all_contexts = page.context.browser.contexts - total_pages = sum(len(context.pages) for context in all_contexts) - if config.session_id: - # Session keeps exclusive access to the page - don't release - pass - elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless): - # Keep the page open but release it for reuse by next crawl - await self.browser_manager.release_page_with_context(page) - else: - # Detach listeners before closing to prevent potential errors during close - if config.capture_network_requests: - page.remove_listener("request", handle_request_capture) - page.remove_listener("response", handle_response_capture) - page.remove_listener("requestfailed", handle_request_failed_capture) - if config.capture_console_messages: - # Retrieve any final console messages for undetected browsers - if hasattr(self.adapter, 'retrieve_console_messages'): - final_messages = await self.adapter.retrieve_console_messages(page) - captured_console.extend(final_messages) + if not config.session_id: + # ALWAYS decrement refcount first — must succeed even if + # the browser crashed or the page is in a bad state. + try: + await self.browser_manager.release_page_with_context(page) + except Exception: + pass - # Clean up console capture - await self.adapter.cleanup_console_capture(page, handle_console, handle_error) + # Best-effort cleanup: detach listeners and close the page + try: + if config.capture_network_requests: + page.remove_listener("request", handle_request_capture) + page.remove_listener("response", handle_response_capture) + page.remove_listener("requestfailed", handle_request_failed_capture) + if config.capture_console_messages: + if hasattr(self.adapter, 'retrieve_console_messages'): + final_messages = await self.adapter.retrieve_console_messages(page) + captured_console.extend(final_messages) + await self.adapter.cleanup_console_capture(page, handle_console, handle_error) - # Release page and decrement context refcount before closing - await self.browser_manager.release_page_with_context(page) - # Close the page - await page.close() + # Close the page unless it's the last one in a headless/managed browser + all_contexts = page.context.browser.contexts + total_pages = sum(len(context.pages) for context in all_contexts) + if not (total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless)): + await page.close() + except Exception: + pass # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None): diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 438c81ab..09f67356 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -1792,25 +1792,34 @@ class BrowserManager: else: # We have a slot — do the bump inside this lock hold old_version = self._browser_version - old_sigs = [] + active_sigs = [] + idle_sigs = [] async with self._contexts_lock: for sig in list(self._context_refcounts.keys()): - old_sigs.append(sig) + if self._context_refcounts.get(sig, 0) > 0: + active_sigs.append(sig) + else: + idle_sigs.append(sig) if self.logger: self.logger.info( - message="Bumping browser version {old} -> {new} after {count} pages", + message="Bumping browser version {old} -> {new} after {count} pages ({active} active, {idle} idle sigs)", tag="BROWSER", params={ "old": old_version, "new": old_version + 1, "count": self._pages_served, + "active": len(active_sigs), + "idle": len(idle_sigs), }, ) - # Mark old signatures for cleanup when their refcount hits 0 + # Only add sigs with active crawls to pending cleanup. + # Sigs with refcount 0 are cleaned up immediately below + # to avoid them being stuck in _pending_cleanup forever + # (no future release would trigger their cleanup). done_event = asyncio.Event() - for sig in old_sigs: + for sig in active_sigs: self._pending_cleanup[sig] = { "version": old_version, "done": done_event, @@ -1819,10 +1828,64 @@ class BrowserManager: # Bump version — new get_page() calls will create new contexts self._browser_version += 1 self._pages_served = 0 - return # Done! - # If we get here, we need to wait for a cleanup slot - await self._cleanup_slot_available.wait() + # Clean up idle sigs immediately (outside pending_cleanup_lock below) + break # exit while loop to do cleanup outside locks + + # Safety cap path: wait for a cleanup slot, then retry. + # Timeout prevents permanent deadlock if stuck entries never drain. + try: + await asyncio.wait_for( + self._cleanup_slot_available.wait(), timeout=30.0 + ) + except asyncio.TimeoutError: + # Force-clean any pending entries that have refcount 0 + # (they're stuck and will never drain naturally) + async with self._pending_cleanup_lock: + stuck_sigs = [ + s for s in list(self._pending_cleanup.keys()) + if self._context_refcounts.get(s, 0) == 0 + ] + for sig in stuck_sigs: + self._pending_cleanup.pop(sig, None) + if stuck_sigs: + if self.logger: + self.logger.warning( + message="Force-cleaned {count} stuck pending entries after timeout", + tag="BROWSER", + params={"count": len(stuck_sigs)}, + ) + # Clean up the stuck contexts + for sig in stuck_sigs: + async with self._contexts_lock: + context = self.contexts_by_config.pop(sig, None) + self._context_refcounts.pop(sig, None) + self._context_last_used.pop(sig, None) + if context is not None: + try: + await context.close() + except Exception: + pass + if len(self._pending_cleanup) < self._max_pending_browsers: + self._cleanup_slot_available.set() + + # Reached via break — clean up idle sigs immediately (outside locks) + for sig in idle_sigs: + async with self._contexts_lock: + context = self.contexts_by_config.pop(sig, None) + self._context_refcounts.pop(sig, None) + self._context_last_used.pop(sig, None) + if context is not None: + try: + await context.close() + except Exception: + pass + if idle_sigs and self.logger: + self.logger.debug( + message="Immediately cleaned up {count} idle contexts from version {version}", + tag="BROWSER", + params={"count": len(idle_sigs), "version": old_version}, + ) async def _maybe_cleanup_old_browser(self, sig: str): """Clean up an old browser's context if its refcount hit 0 and it's pending cleanup.""" diff --git a/test_repro_1640.py b/test_repro_1640.py new file mode 100644 index 00000000..ee333818 --- /dev/null +++ b/test_repro_1640.py @@ -0,0 +1,424 @@ +""" +Regression tests for PR #1640 — memory leak / hang under high concurrency +with max_pages_before_recycle enabled. + +Tests three bugs that were fixed: + +Bug 1: Race condition — release_page_with_context() runs BEFORE + _maybe_bump_browser_version() adds the sig to _pending_cleanup. + FIX: Don't add refcount-0 sigs to pending; clean them up immediately. + +Bug 2: The finally block in _crawl_web can fail before calling + release_page_with_context(), leaking the refcount permanently. + FIX: Call release_page_with_context() FIRST in the finally block. + +Bug 3: Accumulated pending_cleanup entries hit _max_pending_browsers cap, + blocking ALL get_page() calls → system-wide deadlock. + FIX: 30s timeout on safety cap wait + force-clean stuck entries. + +Exit code 0 = all tests pass. Exit code 1 = regression found. +""" + +import asyncio +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.browser_manager import BrowserManager + +PASS = 0 +FAIL = 0 + + +def check(name, condition): + global PASS, FAIL + if condition: + PASS += 1 + print(f" PASS: {name}") + else: + FAIL += 1 + print(f" FAIL: {name}") + + +async def test_bug1_multi_config_race(): + """ + Bug 1 fix: idle sigs (refcount=0) must NOT be added to _pending_cleanup. + They should be cleaned up immediately during the version bump. + """ + print("\n" + "="*70) + print("TEST: Bug 1 — idle sig must not get stuck in _pending_cleanup") + print("="*70) + + config = BrowserConfig( + headless=True, + extra_args=['--no-sandbox', '--disable-gpu'], + max_pages_before_recycle=3, + ) + bm = BrowserManager(config) + await bm.start() + + try: + config_a = CrawlerRunConfig(magic=True, cache_mode="bypass") + config_b = CrawlerRunConfig(magic=False, cache_mode="bypass") + + # Use config A, then release → refcount 0 + page_a, _ = await bm.get_page(config_a) + sig_a = bm._page_to_sig.get(page_a) + await bm.release_page_with_context(page_a) + await page_a.close() + + print(f" sig_a refcount after release: {bm._context_refcounts.get(sig_a)}") + + # Use config B twice → pages_served hits threshold → version bump + page_b1, _ = await bm.get_page(config_b) + page_b2, _ = await bm.get_page(config_b) + sig_b = bm._page_to_sig.get(page_b1) + + # At this point the version should have bumped (3 pages served >= threshold 3) + print(f" _browser_version: {bm._browser_version}") + print(f" _pending_cleanup sigs: {list(bm._pending_cleanup.keys())}") + + # sig_a (refcount=0) must NOT be in _pending_cleanup + check("sig_a NOT in _pending_cleanup", + sig_a not in bm._pending_cleanup) + + # sig_a should have been cleaned up from _context_refcounts + check("sig_a cleaned from _context_refcounts", + sig_a not in bm._context_refcounts) + + # sig_b (refcount>0) SHOULD be in _pending_cleanup (it will drain naturally) + check("sig_b IS in _pending_cleanup (active, will drain)", + sig_b in bm._pending_cleanup) + + # Release B pages → sig_b drains → cleaned up + await bm.release_page_with_context(page_b1) + await page_b1.close() + await bm.release_page_with_context(page_b2) + await page_b2.close() + + check("sig_b cleaned after release", + sig_b not in bm._pending_cleanup) + + check("_pending_cleanup is empty", + len(bm._pending_cleanup) == 0) + + finally: + await bm.close() + + +async def test_bug2_release_always_called(): + """ + Bug 2 fix: release_page_with_context() must be called even when + the browser is in a bad state. + + The fix moves release_page_with_context() to the FIRST line of + the finally block in _crawl_web, wrapped in try/except. + Here we verify that release_page_with_context itself works even + after browser crash, and that the fixed finally block pattern + always decrements the refcount. + """ + print("\n" + "="*70) + print("TEST: Bug 2 — release_page_with_context must work after browser crash") + print("="*70) + + config = BrowserConfig( + headless=True, + extra_args=['--no-sandbox', '--disable-gpu'], + max_pages_before_recycle=5, + ) + bm = BrowserManager(config) + await bm.start() + + try: + crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass") + + page, ctx = await bm.get_page(crawl_config) + sig = bm._page_to_sig.get(page) + print(f" sig refcount before crash: {bm._context_refcounts.get(sig)}") + + check("refcount is 1 before crash", + bm._context_refcounts.get(sig) == 1) + + # Simulate browser crash + if bm.browser: + await bm.browser.close() + bm.browser = None + + # The FIX: call release_page_with_context even after crash + # (simulating what the fixed finally block does) + try: + await bm.release_page_with_context(page) + except Exception: + pass + + refcount_after = bm._context_refcounts.get(sig, 0) + print(f" sig refcount after crash + release: {refcount_after}") + + check("refcount decremented to 0 after crash + release", + refcount_after == 0) + + check("page removed from _page_to_sig", + page not in bm._page_to_sig) + + finally: + bm.browser = None + bm.contexts_by_config.clear() + bm._context_refcounts.clear() + bm._context_last_used.clear() + bm._page_to_sig.clear() + if bm.playwright: + await bm.playwright.stop() + + +async def test_bug3_safety_cap_timeout(): + """ + Bug 3 fix: the safety cap wait must have a timeout. + When stuck entries accumulate, the timeout fires and force-cleans + entries with refcount 0, preventing permanent deadlock. + """ + print("\n" + "="*70) + print("TEST: Bug 3 — safety cap wait must not block forever") + print("="*70) + + config = BrowserConfig( + headless=True, + extra_args=['--no-sandbox', '--disable-gpu'], + max_pages_before_recycle=2, + ) + bm = BrowserManager(config) + await bm.start() + + try: + crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass") + + # Inject stuck entries WITH refcount 0 (simulating leaked refcounts + # that were later force-decremented or never properly tracked) + print(f" Safety cap: {bm._max_pending_browsers}") + for i in range(bm._max_pending_browsers): + fake_sig = f"stuck_sig_{i}" + bm._pending_cleanup[fake_sig] = {"version": i, "done": asyncio.Event()} + # refcount 0 = stuck (no future release will clean these up) + bm._context_refcounts[fake_sig] = 0 + + print(f" Injected {len(bm._pending_cleanup)} stuck entries (refcount=0)") + + bm._pages_served = bm.config.max_pages_before_recycle + + # The fix: get_page should NOT block forever. + # The 30s timeout will fire, force-clean stuck entries, and proceed. + # We use a 35s test timeout to allow the 30s internal timeout to fire. + print(f" Calling get_page() — should unblock after ~30s timeout...") + start = time.monotonic() + try: + page, ctx = await asyncio.wait_for( + bm.get_page(crawl_config), + timeout=35.0 + ) + elapsed = time.monotonic() - start + print(f" get_page() returned after {elapsed:.1f}s") + + check("get_page() did NOT deadlock (returned within 35s)", True) + check("stuck entries were force-cleaned", + len(bm._pending_cleanup) < bm._max_pending_browsers) + + await bm.release_page_with_context(page) + await page.close() + + except asyncio.TimeoutError: + elapsed = time.monotonic() - start + print(f" get_page() STILL blocked after {elapsed:.1f}s") + check("get_page() did NOT deadlock", False) + + finally: + bm._pending_cleanup.clear() + bm._context_refcounts.clear() + await bm.close() + + +async def test_real_concurrent_crawl(): + """ + Integration test: run many concurrent crawls with recycling + and verify no stuck entries or deadlocks. + """ + print("\n" + "="*70) + print("TEST: Real concurrent crawls with recycling") + print("="*70) + + config = BrowserConfig( + headless=True, + extra_args=['--no-sandbox', '--disable-gpu'], + max_pages_before_recycle=10, + ) + bm = BrowserManager(config) + await bm.start() + + TOTAL = 80 + CONCURRENT = 8 + completed = 0 + errors = 0 + + sem = asyncio.Semaphore(CONCURRENT) + + async def do_crawl(i): + nonlocal completed, errors + async with sem: + try: + crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass") + page, ctx = await asyncio.wait_for( + bm.get_page(crawl_config), + timeout=30.0 + ) + + try: + await page.goto("https://example.com", timeout=15000) + except Exception: + pass + + # Use the FIXED finally pattern: release first, then close + try: + await bm.release_page_with_context(page) + except Exception: + pass + try: + await page.close() + except Exception: + pass + + completed += 1 + if completed % 20 == 0: + print(f" [{completed}/{TOTAL}] version={bm._browser_version} " + f"pending={len(bm._pending_cleanup)} " + f"pages_served={bm._pages_served}") + + except asyncio.TimeoutError: + errors += 1 + print(f" [{i}] TIMEOUT in get_page()!") + except Exception as e: + errors += 1 + if errors <= 3: + print(f" [{i}] Error: {e}") + + start = time.monotonic() + tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)] + await asyncio.gather(*tasks) + elapsed = time.monotonic() - start + + print(f"\n Results: {completed}/{TOTAL} completed, {errors} errors, {elapsed:.1f}s") + + stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0] + + check(f"all {TOTAL} crawls completed", completed == TOTAL) + check("no errors", errors == 0) + check("no stuck entries in _pending_cleanup", len(stuck) == 0) + check("no timeouts (no deadlock)", errors == 0) + + await bm.close() + + +async def test_multi_config_concurrent(): + """ + Integration test: concurrent crawls with DIFFERENT configs to + exercise the multi-sig path that triggered Bug 1. + """ + print("\n" + "="*70) + print("TEST: Multi-config concurrent crawls") + print("="*70) + + config = BrowserConfig( + headless=True, + extra_args=['--no-sandbox', '--disable-gpu'], + max_pages_before_recycle=5, + ) + bm = BrowserManager(config) + await bm.start() + + TOTAL = 40 + CONCURRENT = 6 + completed = 0 + errors = 0 + + sem = asyncio.Semaphore(CONCURRENT) + configs = [ + CrawlerRunConfig(magic=True, cache_mode="bypass"), + CrawlerRunConfig(magic=False, cache_mode="bypass"), + CrawlerRunConfig(magic=True, simulate_user=True, cache_mode="bypass"), + ] + + async def do_crawl(i): + nonlocal completed, errors + async with sem: + try: + crawl_config = configs[i % len(configs)] + page, ctx = await asyncio.wait_for( + bm.get_page(crawl_config), + timeout=30.0 + ) + + try: + await page.goto("https://example.com", timeout=15000) + except Exception: + pass + + try: + await bm.release_page_with_context(page) + except Exception: + pass + try: + await page.close() + except Exception: + pass + + completed += 1 + + except asyncio.TimeoutError: + errors += 1 + print(f" [{i}] TIMEOUT!") + print(f" pending={len(bm._pending_cleanup)}") + except Exception as e: + errors += 1 + if errors <= 3: + print(f" [{i}] Error: {e}") + + start = time.monotonic() + tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)] + await asyncio.gather(*tasks) + elapsed = time.monotonic() - start + + stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0] + + print(f"\n Results: {completed}/{TOTAL}, {errors} errors, {elapsed:.1f}s") + print(f" Final: version={bm._browser_version} pending={len(bm._pending_cleanup)} stuck={len(stuck)}") + + check(f"all {TOTAL} multi-config crawls completed", completed == TOTAL) + check("no stuck entries", len(stuck) == 0) + check("no timeouts", errors == 0) + + await bm.close() + + +async def main(): + print("="*70) + print("PR #1640 Regression Tests") + print("="*70) + + await test_bug2_release_always_called() + await test_bug1_multi_config_race() + await test_bug3_safety_cap_timeout() + await test_real_concurrent_crawl() + await test_multi_config_concurrent() + + print("\n" + "="*70) + if FAIL == 0: + print(f"ALL {PASS} CHECKS PASSED") + else: + print(f"FAILED: {FAIL} checks failed, {PASS} passed") + print("="*70) + + sys.exit(1 if FAIL > 0 else 0) + + +if __name__ == "__main__": + asyncio.run(main())