Three bugs in the version-based browser recycling caused requests to hang after ~80-130 pages under concurrent load: 1. Race condition: _maybe_bump_browser_version() added ALL context signatures to _pending_cleanup, including those with refcount 0. Since no future release would trigger cleanup for idle sigs, they stayed in _pending_cleanup permanently. Fix: split sigs into active (refcount > 0, go to pending) and idle (refcount == 0, cleaned up immediately). 2. Finally block fragility: the first line of _crawl_web's finally block accessed page.context.browser.contexts, which throws if the browser crashed. This prevented release_page_with_context() from ever being called, permanently leaking the refcount. Fix: call release_page_with_context() first in its own try/except, then do best-effort cleanup of listeners and page. 3. Safety cap deadlock: when _pending_cleanup accumulated >= 3 stuck entries, _maybe_bump_browser_version() blocked get_page() forever with no timeout. Fix: 30-second timeout on the wait, after which stuck entries (refcount 0) are force-cleaned. Includes regression test covering all three bugs plus multi-config concurrent crawl scenarios.
425 lines
14 KiB
Python
425 lines
14 KiB
Python
"""
|
|
Regression tests for PR #1640 — memory leak / hang under high concurrency
|
|
with max_pages_before_recycle enabled.
|
|
|
|
Tests three bugs that were fixed:
|
|
|
|
Bug 1: Race condition — release_page_with_context() runs BEFORE
|
|
_maybe_bump_browser_version() adds the sig to _pending_cleanup.
|
|
FIX: Don't add refcount-0 sigs to pending; clean them up immediately.
|
|
|
|
Bug 2: The finally block in _crawl_web can fail before calling
|
|
release_page_with_context(), leaking the refcount permanently.
|
|
FIX: Call release_page_with_context() FIRST in the finally block.
|
|
|
|
Bug 3: Accumulated pending_cleanup entries hit _max_pending_browsers cap,
|
|
blocking ALL get_page() calls → system-wide deadlock.
|
|
FIX: 30s timeout on safety cap wait + force-clean stuck entries.
|
|
|
|
Exit code 0 = all tests pass. Exit code 1 = regression found.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
import time
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.browser_manager import BrowserManager
|
|
|
|
PASS = 0
|
|
FAIL = 0
|
|
|
|
|
|
def check(name, condition):
|
|
global PASS, FAIL
|
|
if condition:
|
|
PASS += 1
|
|
print(f" PASS: {name}")
|
|
else:
|
|
FAIL += 1
|
|
print(f" FAIL: {name}")
|
|
|
|
|
|
async def test_bug1_multi_config_race():
|
|
"""
|
|
Bug 1 fix: idle sigs (refcount=0) must NOT be added to _pending_cleanup.
|
|
They should be cleaned up immediately during the version bump.
|
|
"""
|
|
print("\n" + "="*70)
|
|
print("TEST: Bug 1 — idle sig must not get stuck in _pending_cleanup")
|
|
print("="*70)
|
|
|
|
config = BrowserConfig(
|
|
headless=True,
|
|
extra_args=['--no-sandbox', '--disable-gpu'],
|
|
max_pages_before_recycle=3,
|
|
)
|
|
bm = BrowserManager(config)
|
|
await bm.start()
|
|
|
|
try:
|
|
config_a = CrawlerRunConfig(magic=True, cache_mode="bypass")
|
|
config_b = CrawlerRunConfig(magic=False, cache_mode="bypass")
|
|
|
|
# Use config A, then release → refcount 0
|
|
page_a, _ = await bm.get_page(config_a)
|
|
sig_a = bm._page_to_sig.get(page_a)
|
|
await bm.release_page_with_context(page_a)
|
|
await page_a.close()
|
|
|
|
print(f" sig_a refcount after release: {bm._context_refcounts.get(sig_a)}")
|
|
|
|
# Use config B twice → pages_served hits threshold → version bump
|
|
page_b1, _ = await bm.get_page(config_b)
|
|
page_b2, _ = await bm.get_page(config_b)
|
|
sig_b = bm._page_to_sig.get(page_b1)
|
|
|
|
# At this point the version should have bumped (3 pages served >= threshold 3)
|
|
print(f" _browser_version: {bm._browser_version}")
|
|
print(f" _pending_cleanup sigs: {list(bm._pending_cleanup.keys())}")
|
|
|
|
# sig_a (refcount=0) must NOT be in _pending_cleanup
|
|
check("sig_a NOT in _pending_cleanup",
|
|
sig_a not in bm._pending_cleanup)
|
|
|
|
# sig_a should have been cleaned up from _context_refcounts
|
|
check("sig_a cleaned from _context_refcounts",
|
|
sig_a not in bm._context_refcounts)
|
|
|
|
# sig_b (refcount>0) SHOULD be in _pending_cleanup (it will drain naturally)
|
|
check("sig_b IS in _pending_cleanup (active, will drain)",
|
|
sig_b in bm._pending_cleanup)
|
|
|
|
# Release B pages → sig_b drains → cleaned up
|
|
await bm.release_page_with_context(page_b1)
|
|
await page_b1.close()
|
|
await bm.release_page_with_context(page_b2)
|
|
await page_b2.close()
|
|
|
|
check("sig_b cleaned after release",
|
|
sig_b not in bm._pending_cleanup)
|
|
|
|
check("_pending_cleanup is empty",
|
|
len(bm._pending_cleanup) == 0)
|
|
|
|
finally:
|
|
await bm.close()
|
|
|
|
|
|
async def test_bug2_release_always_called():
|
|
"""
|
|
Bug 2 fix: release_page_with_context() must be called even when
|
|
the browser is in a bad state.
|
|
|
|
The fix moves release_page_with_context() to the FIRST line of
|
|
the finally block in _crawl_web, wrapped in try/except.
|
|
Here we verify that release_page_with_context itself works even
|
|
after browser crash, and that the fixed finally block pattern
|
|
always decrements the refcount.
|
|
"""
|
|
print("\n" + "="*70)
|
|
print("TEST: Bug 2 — release_page_with_context must work after browser crash")
|
|
print("="*70)
|
|
|
|
config = BrowserConfig(
|
|
headless=True,
|
|
extra_args=['--no-sandbox', '--disable-gpu'],
|
|
max_pages_before_recycle=5,
|
|
)
|
|
bm = BrowserManager(config)
|
|
await bm.start()
|
|
|
|
try:
|
|
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
|
|
|
|
page, ctx = await bm.get_page(crawl_config)
|
|
sig = bm._page_to_sig.get(page)
|
|
print(f" sig refcount before crash: {bm._context_refcounts.get(sig)}")
|
|
|
|
check("refcount is 1 before crash",
|
|
bm._context_refcounts.get(sig) == 1)
|
|
|
|
# Simulate browser crash
|
|
if bm.browser:
|
|
await bm.browser.close()
|
|
bm.browser = None
|
|
|
|
# The FIX: call release_page_with_context even after crash
|
|
# (simulating what the fixed finally block does)
|
|
try:
|
|
await bm.release_page_with_context(page)
|
|
except Exception:
|
|
pass
|
|
|
|
refcount_after = bm._context_refcounts.get(sig, 0)
|
|
print(f" sig refcount after crash + release: {refcount_after}")
|
|
|
|
check("refcount decremented to 0 after crash + release",
|
|
refcount_after == 0)
|
|
|
|
check("page removed from _page_to_sig",
|
|
page not in bm._page_to_sig)
|
|
|
|
finally:
|
|
bm.browser = None
|
|
bm.contexts_by_config.clear()
|
|
bm._context_refcounts.clear()
|
|
bm._context_last_used.clear()
|
|
bm._page_to_sig.clear()
|
|
if bm.playwright:
|
|
await bm.playwright.stop()
|
|
|
|
|
|
async def test_bug3_safety_cap_timeout():
|
|
"""
|
|
Bug 3 fix: the safety cap wait must have a timeout.
|
|
When stuck entries accumulate, the timeout fires and force-cleans
|
|
entries with refcount 0, preventing permanent deadlock.
|
|
"""
|
|
print("\n" + "="*70)
|
|
print("TEST: Bug 3 — safety cap wait must not block forever")
|
|
print("="*70)
|
|
|
|
config = BrowserConfig(
|
|
headless=True,
|
|
extra_args=['--no-sandbox', '--disable-gpu'],
|
|
max_pages_before_recycle=2,
|
|
)
|
|
bm = BrowserManager(config)
|
|
await bm.start()
|
|
|
|
try:
|
|
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
|
|
|
|
# Inject stuck entries WITH refcount 0 (simulating leaked refcounts
|
|
# that were later force-decremented or never properly tracked)
|
|
print(f" Safety cap: {bm._max_pending_browsers}")
|
|
for i in range(bm._max_pending_browsers):
|
|
fake_sig = f"stuck_sig_{i}"
|
|
bm._pending_cleanup[fake_sig] = {"version": i, "done": asyncio.Event()}
|
|
# refcount 0 = stuck (no future release will clean these up)
|
|
bm._context_refcounts[fake_sig] = 0
|
|
|
|
print(f" Injected {len(bm._pending_cleanup)} stuck entries (refcount=0)")
|
|
|
|
bm._pages_served = bm.config.max_pages_before_recycle
|
|
|
|
# The fix: get_page should NOT block forever.
|
|
# The 30s timeout will fire, force-clean stuck entries, and proceed.
|
|
# We use a 35s test timeout to allow the 30s internal timeout to fire.
|
|
print(f" Calling get_page() — should unblock after ~30s timeout...")
|
|
start = time.monotonic()
|
|
try:
|
|
page, ctx = await asyncio.wait_for(
|
|
bm.get_page(crawl_config),
|
|
timeout=35.0
|
|
)
|
|
elapsed = time.monotonic() - start
|
|
print(f" get_page() returned after {elapsed:.1f}s")
|
|
|
|
check("get_page() did NOT deadlock (returned within 35s)", True)
|
|
check("stuck entries were force-cleaned",
|
|
len(bm._pending_cleanup) < bm._max_pending_browsers)
|
|
|
|
await bm.release_page_with_context(page)
|
|
await page.close()
|
|
|
|
except asyncio.TimeoutError:
|
|
elapsed = time.monotonic() - start
|
|
print(f" get_page() STILL blocked after {elapsed:.1f}s")
|
|
check("get_page() did NOT deadlock", False)
|
|
|
|
finally:
|
|
bm._pending_cleanup.clear()
|
|
bm._context_refcounts.clear()
|
|
await bm.close()
|
|
|
|
|
|
async def test_real_concurrent_crawl():
|
|
"""
|
|
Integration test: run many concurrent crawls with recycling
|
|
and verify no stuck entries or deadlocks.
|
|
"""
|
|
print("\n" + "="*70)
|
|
print("TEST: Real concurrent crawls with recycling")
|
|
print("="*70)
|
|
|
|
config = BrowserConfig(
|
|
headless=True,
|
|
extra_args=['--no-sandbox', '--disable-gpu'],
|
|
max_pages_before_recycle=10,
|
|
)
|
|
bm = BrowserManager(config)
|
|
await bm.start()
|
|
|
|
TOTAL = 80
|
|
CONCURRENT = 8
|
|
completed = 0
|
|
errors = 0
|
|
|
|
sem = asyncio.Semaphore(CONCURRENT)
|
|
|
|
async def do_crawl(i):
|
|
nonlocal completed, errors
|
|
async with sem:
|
|
try:
|
|
crawl_config = CrawlerRunConfig(magic=True, cache_mode="bypass")
|
|
page, ctx = await asyncio.wait_for(
|
|
bm.get_page(crawl_config),
|
|
timeout=30.0
|
|
)
|
|
|
|
try:
|
|
await page.goto("https://example.com", timeout=15000)
|
|
except Exception:
|
|
pass
|
|
|
|
# Use the FIXED finally pattern: release first, then close
|
|
try:
|
|
await bm.release_page_with_context(page)
|
|
except Exception:
|
|
pass
|
|
try:
|
|
await page.close()
|
|
except Exception:
|
|
pass
|
|
|
|
completed += 1
|
|
if completed % 20 == 0:
|
|
print(f" [{completed}/{TOTAL}] version={bm._browser_version} "
|
|
f"pending={len(bm._pending_cleanup)} "
|
|
f"pages_served={bm._pages_served}")
|
|
|
|
except asyncio.TimeoutError:
|
|
errors += 1
|
|
print(f" [{i}] TIMEOUT in get_page()!")
|
|
except Exception as e:
|
|
errors += 1
|
|
if errors <= 3:
|
|
print(f" [{i}] Error: {e}")
|
|
|
|
start = time.monotonic()
|
|
tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)]
|
|
await asyncio.gather(*tasks)
|
|
elapsed = time.monotonic() - start
|
|
|
|
print(f"\n Results: {completed}/{TOTAL} completed, {errors} errors, {elapsed:.1f}s")
|
|
|
|
stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0]
|
|
|
|
check(f"all {TOTAL} crawls completed", completed == TOTAL)
|
|
check("no errors", errors == 0)
|
|
check("no stuck entries in _pending_cleanup", len(stuck) == 0)
|
|
check("no timeouts (no deadlock)", errors == 0)
|
|
|
|
await bm.close()
|
|
|
|
|
|
async def test_multi_config_concurrent():
|
|
"""
|
|
Integration test: concurrent crawls with DIFFERENT configs to
|
|
exercise the multi-sig path that triggered Bug 1.
|
|
"""
|
|
print("\n" + "="*70)
|
|
print("TEST: Multi-config concurrent crawls")
|
|
print("="*70)
|
|
|
|
config = BrowserConfig(
|
|
headless=True,
|
|
extra_args=['--no-sandbox', '--disable-gpu'],
|
|
max_pages_before_recycle=5,
|
|
)
|
|
bm = BrowserManager(config)
|
|
await bm.start()
|
|
|
|
TOTAL = 40
|
|
CONCURRENT = 6
|
|
completed = 0
|
|
errors = 0
|
|
|
|
sem = asyncio.Semaphore(CONCURRENT)
|
|
configs = [
|
|
CrawlerRunConfig(magic=True, cache_mode="bypass"),
|
|
CrawlerRunConfig(magic=False, cache_mode="bypass"),
|
|
CrawlerRunConfig(magic=True, simulate_user=True, cache_mode="bypass"),
|
|
]
|
|
|
|
async def do_crawl(i):
|
|
nonlocal completed, errors
|
|
async with sem:
|
|
try:
|
|
crawl_config = configs[i % len(configs)]
|
|
page, ctx = await asyncio.wait_for(
|
|
bm.get_page(crawl_config),
|
|
timeout=30.0
|
|
)
|
|
|
|
try:
|
|
await page.goto("https://example.com", timeout=15000)
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
await bm.release_page_with_context(page)
|
|
except Exception:
|
|
pass
|
|
try:
|
|
await page.close()
|
|
except Exception:
|
|
pass
|
|
|
|
completed += 1
|
|
|
|
except asyncio.TimeoutError:
|
|
errors += 1
|
|
print(f" [{i}] TIMEOUT!")
|
|
print(f" pending={len(bm._pending_cleanup)}")
|
|
except Exception as e:
|
|
errors += 1
|
|
if errors <= 3:
|
|
print(f" [{i}] Error: {e}")
|
|
|
|
start = time.monotonic()
|
|
tasks = [asyncio.create_task(do_crawl(i)) for i in range(TOTAL)]
|
|
await asyncio.gather(*tasks)
|
|
elapsed = time.monotonic() - start
|
|
|
|
stuck = [s for s in bm._pending_cleanup if bm._context_refcounts.get(s, 0) == 0]
|
|
|
|
print(f"\n Results: {completed}/{TOTAL}, {errors} errors, {elapsed:.1f}s")
|
|
print(f" Final: version={bm._browser_version} pending={len(bm._pending_cleanup)} stuck={len(stuck)}")
|
|
|
|
check(f"all {TOTAL} multi-config crawls completed", completed == TOTAL)
|
|
check("no stuck entries", len(stuck) == 0)
|
|
check("no timeouts", errors == 0)
|
|
|
|
await bm.close()
|
|
|
|
|
|
async def main():
|
|
print("="*70)
|
|
print("PR #1640 Regression Tests")
|
|
print("="*70)
|
|
|
|
await test_bug2_release_always_called()
|
|
await test_bug1_multi_config_race()
|
|
await test_bug3_safety_cap_timeout()
|
|
await test_real_concurrent_crawl()
|
|
await test_multi_config_concurrent()
|
|
|
|
print("\n" + "="*70)
|
|
if FAIL == 0:
|
|
print(f"ALL {PASS} CHECKS PASSED")
|
|
else:
|
|
print(f"FAILED: {FAIL} checks failed, {PASS} passed")
|
|
print("="*70)
|
|
|
|
sys.exit(1 if FAIL > 0 else 0)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|