Fix browser recycling under high concurrency — version-based approach

The previous recycle logic waited for all refcounts to hit 0 before
recycling, which never happened under sustained concurrent load (with
20+ concurrent crawls, at least one context was always active).

New approach:
- Add _browser_version to config signature — bump it to force new contexts
- When threshold is hit: bump version, move old sigs to _pending_cleanup
- New requests get new contexts automatically (different signature)
- Old contexts drain naturally and get cleaned up when refcount hits 0
- Safety cap: max 3 pending browsers draining at once

This means recycling now works under any load pattern — no blocking,
no waiting for quiet moments. Old and new browsers coexist briefly
during transitions.

Includes 12 new tests covering version bumps, concurrent recycling,
safety cap, and edge cases.
This commit is contained in:
unclecode
2026-02-05 07:48:12 +00:00
parent c046918bb4
commit 3401dd1620
2 changed files with 502 additions and 70 deletions

View File

@@ -0,0 +1,386 @@
"""
Tests for version-based browser recycling.
The new recycle approach:
1. When pages_served hits threshold, bump _browser_version
2. Old signatures go to _pending_cleanup
3. New requests get new contexts (different version = different signature)
4. When old context's refcount hits 0, it gets cleaned up
5. No blocking — old and new browsers coexist during transition
These tests use small thresholds (3-5 pages) to verify the mechanics.
"""
import asyncio
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# ---------------------------------------------------------------------------
# Local test server
# ---------------------------------------------------------------------------
# Pre-render 100 small HTML pages, keyed by request path ("/page0".."/page99").
PAGES = {
    f"/page{n}": (
        f"<!DOCTYPE html><html><head><title>Page {n}</title></head>"
        f"<body><h1>Page {n}</h1><p>Content for page {n}.</p></body></html>"
    ).encode()
    for n in range(100)
}
class Handler(SimpleHTTPRequestHandler):
    """Serves the pre-built PAGES bodies; unknown paths fall back to /page0."""

    def log_message(self, *a):
        # Silence per-request logging so test output stays readable.
        pass

    def do_GET(self):
        # Unknown paths deliberately serve page0 instead of a 404 so a
        # typo'd URL never fails a crawl for the wrong reason.
        body = PAGES.get(self.path, PAGES["/page0"])
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        # Fix: advertise the body length so clients do not have to rely on
        # connection close to delimit the response body.
        self.send_header("Content-length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
class _Server(HTTPServer):
    # SO_REUSEADDR: allow immediate rebinding of the port between test
    # runs, avoiding "address already in use" on fast restarts.
    allow_reuse_address = True
@pytest.fixture(scope="module")
def srv():
    """Start a throwaway local HTTP server on an OS-chosen port; yield its base URL."""
    server = _Server(("127.0.0.1", 0), Handler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    base_url = f"http://127.0.0.1:{server.server_address[1]}"
    yield base_url
    server.shutdown()
def _u(base, i):
return f"{base}/page{i}"
def _bm(c):
return c.crawler_strategy.browser_manager
# ===================================================================
# SECTION A — Version bump mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_version_bump_on_threshold(srv):
    """Browser version should bump when threshold is reached."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        assert bm._browser_version == 1
        # First two crawls stay below the threshold of 3.
        for page_no in (0, 1):
            result = await c.arun(url=_u(srv, page_no), config=run)
            assert result.success
        assert bm._browser_version == 1, "Version should still be 1 after 2 pages"
        assert bm._pages_served == 2
        # Serving the 3rd page reaches the threshold; the bump fires only
        # AFTER the page has been delivered.
        result = await c.arun(url=_u(srv, 2), config=run)
        assert result.success
        assert bm._browser_version == 2, "Version should bump after 3rd page"
        assert bm._pages_served == 0, "Counter resets after bump"
        # The next crawl is the first one served by version 2.
        result = await c.arun(url=_u(srv, 3), config=run)
        assert result.success
        assert bm._pages_served == 1
@pytest.mark.asyncio
async def test_signature_changes_after_version_bump(srv):
    """Same CrawlerRunConfig should produce different signatures after version bump."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        # Snapshot the signature while still on version 1.
        before = bm._make_config_signature(run)
        # Two pages hit the threshold; the third request lands after the bump.
        for page_no in range(3):
            await c.arun(url=_u(srv, page_no), config=run)
        after = bm._make_config_signature(run)
        assert before != after, "Signature should change after version bump"
@pytest.mark.asyncio
async def test_no_version_bump_when_disabled(srv):
    """Version should stay at 1 when max_pages_before_recycle=0."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=0,  # Disabled
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        # Even 20 sequential pages must never trigger a recycle.
        page_no = 0
        while page_no < 20:
            outcome = await c.arun(url=_u(srv, page_no), config=run)
            assert outcome.success
            page_no += 1
        assert bm._browser_version == 1, "Version should not bump when disabled"
        assert bm._pages_served == 20
# ===================================================================
# SECTION B — Pending cleanup mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_old_signature_goes_to_pending_cleanup(srv):
    """Version bump works and old contexts get cleaned up."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        # Two sequential crawls: the signature belongs to version 1, and
        # the threshold (2) is hit on the second page.
        for page_no in range(2):
            await c.arun(url=_u(srv, page_no), config=run)
        assert bm._browser_version == 2
        # Sequential crawls release each page before the next request, so
        # refcounts are already 0 and old contexts are closed eagerly —
        # nothing should be left sitting in _pending_cleanup. That eager
        # cleanup is the intended behavior.
        assert len(bm._pending_cleanup) == 0
@pytest.mark.asyncio
async def test_cleanup_happens_when_refcount_hits_zero(srv):
    """Old context should be closed when its refcount drops to 0."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        # Sequential crawls: each page is released before the next request,
        # so refcount is always 0 between requests and cleanup happens
        # immediately.
        for i in range(10):
            r = await c.arun(url=_u(srv, i), config=run)
            assert r.success
        # With threshold=3 over 10 pages, the version bumps at pages 3, 6
        # and 9 — three times, leaving _browser_version at 4. Since the
        # refcount hits 0 right away each time, nothing should linger in
        # pending cleanup.
        assert len(bm._pending_cleanup) == 0, "All old contexts should be cleaned up"
# ===================================================================
# SECTION C — Concurrent crawls with recycling
# ===================================================================
@pytest.mark.asyncio
async def test_concurrent_crawls_dont_block_on_recycle(srv):
    """Concurrent crawls should not block — old browser drains while new one serves."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=5,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        # Fire 20 crawls simultaneously.
        results = await asyncio.gather(
            *(c.arun(url=_u(srv, n), config=run) for n in range(20)),
            return_exceptions=True,
        )
        # Every crawl must complete without blocking or raising.
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 20, f"Only {len(successes)} succeeded"
        # At least one recycle must have happened along the way.
        assert bm._browser_version >= 2, "Should have recycled at least once"
@pytest.mark.asyncio
async def test_high_concurrency_with_small_threshold(srv):
    """Stress test: 50 concurrent crawls with threshold=3."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        # NOTE: the unused `bm = _bm(c)` local was removed — this test only
        # checks that all crawls complete, not the manager's internals.
        # 50 concurrent crawls with threshold of 3 — many version bumps
        # happen while requests are in flight.
        tasks = [c.arun(url=_u(srv, i % 100), config=run) for i in range(50)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 50
# ===================================================================
# SECTION D — Safety cap (max pending browsers)
# ===================================================================
@pytest.mark.asyncio
async def test_safety_cap_limits_pending_browsers(srv):
    """Should not exceed _max_pending_browsers old browsers draining."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        bm._max_pending_browsers = 2  # Lower cap for testing
        # Enough sequential crawls to push past the cap if it were broken.
        for page_no in range(15):
            outcome = await c.arun(url=_u(srv, page_no), config=run)
            assert outcome.success
        # The cap cannot be observed mid-flight from here, but completing
        # without a deadlock/timeout shows the cap logic functions; the
        # final pending count must still respect it.
        assert len(bm._pending_cleanup) <= bm._max_pending_browsers
# ===================================================================
# SECTION E — Managed browser mode
# ===================================================================
@pytest.mark.asyncio
async def test_managed_browser_recycle(srv):
    """Recycling should work with managed browser mode."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        for page_no in range(10):
            outcome = await c.arun(url=_u(srv, page_no), config=run)
            assert outcome.success, f"Page {page_no} failed"
        # Ten pages at threshold 3 must have bumped the version at least once.
        assert bm._browser_version >= 2
@pytest.mark.asyncio
async def test_managed_browser_isolated_context_recycle(srv):
    """Recycling with managed browser + isolated contexts."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        create_isolated_context=True,
        max_pages_before_recycle=3,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        page_no = 0
        while page_no < 10:
            outcome = await c.arun(url=_u(srv, page_no), config=run)
            assert outcome.success, f"Page {page_no} failed"
            page_no += 1
        assert bm._browser_version >= 2
# ===================================================================
# SECTION F — Edge cases
# ===================================================================
@pytest.mark.asyncio
async def test_threshold_of_one(srv):
    """Edge case: threshold=1 means version bump after every page."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=1,
    )
    run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        for page_no in range(5):
            outcome = await c.arun(url=_u(srv, page_no), config=run)
            assert outcome.success
        # Every served page pushes the counter to 1 >= threshold, so the
        # version bumps once per page (and the counter resets to 0):
        # five pages on top of the initial version leaves it at 6.
        assert bm._browser_version == 6  # Started at 1, bumped 5 times
@pytest.mark.asyncio
async def test_different_configs_get_separate_cleanup_tracking(srv):
    """Different CrawlerRunConfigs should track separately in pending cleanup."""
    cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    async with AsyncWebCrawler(config=cfg) as c:
        bm = _bm(c)
        run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        run_b = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, verbose=False,
            override_navigator=True,  # Different config
        )
        # Interleave the two configs across six crawls.
        configs = (run_a, run_b)
        for page_no in range(6):
            outcome = await c.arun(url=_u(srv, page_no), config=configs[page_no % 2])
            assert outcome.success
        # Neither config tripped over the other's recycling.
        assert bm._browser_version >= 2