Files
crawl4ai/tests/browser/test_context_leak_fix.py
unclecode c790231aba Fix browser context memory leak — signature shrink + LRU eviction (#943)
contexts_by_config accumulated browser contexts unboundedly in long-running
crawlers (Docker API). Two root causes fixed:

1. _make_config_signature() hashed ~60 CrawlerRunConfig fields but only 7
   affect the browser context (proxy_config, locale, timezone_id, geolocation,
   override_navigator, simulate_user, magic). Switched from blacklist to
   whitelist — non-context fields like word_count_threshold, css_selector,
   screenshot, verbose no longer cause unnecessary context creation.

2. No eviction mechanism existed between close() calls. Added refcount
   tracking (_context_refcounts, incremented under _contexts_lock in
   get_page, decremented in release_page_with_context) and LRU eviction
   (_evict_lru_context_locked) that caps contexts at _max_contexts=20,
   evicting only idle contexts (refcount==0) oldest-first.

Also fixed: storage_state path leaked a temporary context every request
(now explicitly closed after clone_runtime_state).

Closes #943. Credit to @Martichou for the investigation in #1640.
2026-02-01 14:23:04 +00:00

359 lines
15 KiB
Python

"""
Integration tests for the browser context memory leak fix.
Tests:
1. Signature shrink: non-context fields produce same hash
2. Signature correctness: context-affecting fields produce different hashes
3. Refcount lifecycle: increment on get_page, decrement on release
4. LRU eviction: oldest idle context is evicted when over limit
5. Eviction respects active refcounts
6. Real browser: contexts don't leak under varying configs
7. Real browser: batch crawl reuses same context
8. Storage state path: temporary context is closed
"""
import asyncio
import time
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig, GeolocationConfig
from crawl4ai.browser_manager import BrowserManager
# ── Unit tests (no browser needed) ──────────────────────────────────────
class TestSignatureShrink:
    """Verify the whitelist signature only considers context-affecting fields."""

    def _bm(self):
        # Fresh manager with a default browser config; unit tests need no logger.
        return BrowserManager(BrowserConfig(), logger=None)

    def test_non_context_fields_same_signature(self):
        """Fields that don't affect browser context must produce identical sigs."""
        bm = self._bm()
        # Each kwargs dict varies only fields that should NOT affect the context.
        non_context_variants = (
            dict(word_count_threshold=200),
            dict(word_count_threshold=50),
            dict(css_selector=".main"),
            dict(screenshot=True),
            dict(pdf=True, verbose=False),
            dict(scan_full_page=True, scroll_delay=0.5),
            dict(only_text=True),
            dict(wait_until="networkidle", page_timeout=30000),
            dict(capture_network_requests=True),
            dict(exclude_external_links=True),
        )
        sigs = [
            bm._make_config_signature(CrawlerRunConfig(**kwargs))
            for kwargs in non_context_variants
        ]
        assert len(set(sigs)) == 1, (
            f"Expected all same sig, got {len(set(sigs))} unique: {sigs[:3]}"
        )

    def test_proxy_changes_signature(self):
        bm = self._bm()
        no_proxy = bm._make_config_signature(CrawlerRunConfig())
        proxy_a = bm._make_config_signature(
            CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p1:8080"))
        )
        proxy_b = bm._make_config_signature(
            CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p2:8080"))
        )
        assert no_proxy != proxy_a, "proxy vs no-proxy should differ"
        assert proxy_a != proxy_b, "different proxies should differ"

    def test_locale_changes_signature(self):
        bm = self._bm()
        default_sig = bm._make_config_signature(CrawlerRunConfig())
        en_sig = bm._make_config_signature(CrawlerRunConfig(locale="en-US"))
        fr_sig = bm._make_config_signature(CrawlerRunConfig(locale="fr-FR"))
        assert default_sig != en_sig
        assert en_sig != fr_sig

    def test_timezone_changes_signature(self):
        bm = self._bm()
        plain = bm._make_config_signature(CrawlerRunConfig())
        with_tz = bm._make_config_signature(
            CrawlerRunConfig(timezone_id="America/New_York")
        )
        assert plain != with_tz

    def test_geolocation_changes_signature(self):
        bm = self._bm()
        plain = bm._make_config_signature(CrawlerRunConfig())
        geo_cfg = CrawlerRunConfig(
            geolocation=GeolocationConfig(latitude=40.7, longitude=-74.0)
        )
        assert plain != bm._make_config_signature(geo_cfg)

    def test_navigator_overrides_change_signature(self):
        bm = self._bm()
        base = bm._make_config_signature(CrawlerRunConfig())
        # Each stealth/override flag must individually change the signature.
        for flag in ("override_navigator", "simulate_user", "magic"):
            flagged = bm._make_config_signature(CrawlerRunConfig(**{flag: True}))
            assert base != flagged

    def test_signature_stability(self):
        """Same config always produces the same hash."""
        bm = self._bm()
        cfg = CrawlerRunConfig(locale="ja-JP", override_navigator=True)
        first = bm._make_config_signature(cfg)
        second = bm._make_config_signature(cfg)
        assert first == second

    def test_proxy_config_with_credentials(self):
        """ProxyConfig with username/password produces distinct stable sigs."""
        bm = self._bm()
        creds_one = CrawlerRunConfig(proxy_config=ProxyConfig(
            server="http://proxy:8080", username="user1", password="pass1"
        ))
        creds_two = CrawlerRunConfig(proxy_config=ProxyConfig(
            server="http://proxy:8080", username="user2", password="pass2"
        ))
        sig_one = bm._make_config_signature(creds_one)
        sig_two = bm._make_config_signature(creds_two)
        assert sig_one != sig_two, "different credentials should differ"
        assert sig_one == bm._make_config_signature(creds_one), "should be stable"
class TestLRUEviction:
    """Verify eviction logic (no browser needed).

    These tests populate the manager's tracking dicts directly with fake
    sig/ctx strings; _evict_lru_context_locked only reads and mutates those
    dicts, so no real browser context is required.
    """

    def _bm(self, max_ctx=3):
        bm = BrowserManager(BrowserConfig(), logger=None)
        bm._max_contexts = max_ctx
        return bm

    def _add(self, bm, sig, ctx, refcount=0, last_used=0.0):
        """Register a fake context with an explicit refcount and last-used time."""
        bm.contexts_by_config[sig] = ctx
        bm._context_refcounts[sig] = refcount
        bm._context_last_used[sig] = last_used

    def test_no_eviction_under_limit(self):
        bm = self._bm(max_ctx=5)
        for i in range(5):
            self._add(bm, f"sig_{i}", f"ctx_{i}", refcount=0, last_used=float(i))
        # Exactly at the limit — nothing should be evicted.
        assert bm._evict_lru_context_locked() is None

    def test_evicts_oldest_idle(self):
        bm = self._bm(max_ctx=3)
        # FIX: use explicit, strictly increasing timestamps instead of
        # time.monotonic() + time.sleep(0.002) — deterministic ordering that
        # is faster and immune to coarse clock resolution on some platforms.
        for i in range(5):
            self._add(bm, f"sig_{i}", f"ctx_{i}", refcount=0, last_used=float(i))
        evicted = bm._evict_lru_context_locked()
        assert evicted == "ctx_0", f"expected oldest ctx_0, got {evicted}"
        # Eviction must clean every tracking dict for the evicted signature.
        assert "sig_0" not in bm.contexts_by_config
        assert "sig_0" not in bm._context_refcounts
        assert "sig_0" not in bm._context_last_used

    def test_skips_active_contexts(self):
        bm = self._bm(max_ctx=2)
        self._add(bm, "sig_0", "ctx_0", refcount=3, last_used=0.0)  # very old but active
        self._add(bm, "sig_1", "ctx_1", refcount=0, last_used=1.0)  # newer, idle
        self._add(bm, "sig_2", "ctx_2", refcount=0, last_used=2.0)  # newest, idle
        evicted = bm._evict_lru_context_locked()
        # sig_0 is oldest but active (refcount=3) — must skip it
        assert evicted == "ctx_1", f"expected ctx_1 (oldest idle), got {evicted}"
        assert "sig_0" in bm.contexts_by_config, "active context must NOT be evicted"

    def test_all_active_no_eviction(self):
        bm = self._bm(max_ctx=1)
        for i in range(3):
            # refcount=1 → all contexts are in use.
            self._add(bm, f"sig_{i}", f"ctx_{i}", refcount=1, last_used=float(i))
        evicted = bm._evict_lru_context_locked()
        assert evicted is None, "cannot evict when all are active"
        assert len(bm.contexts_by_config) == 3, "all contexts should remain"

    def test_eviction_cleans_page_to_sig(self):
        bm = self._bm(max_ctx=1)
        self._add(bm, "sig_old", "ctx_old", refcount=0, last_used=0.0)
        self._add(bm, "sig_new", "ctx_new", refcount=0, last_used=1.0)
        # Simulate a stale page mapping for the old context
        mock_page = object()
        bm._page_to_sig[mock_page] = "sig_old"
        evicted = bm._evict_lru_context_locked()
        assert evicted == "ctx_old"
        assert mock_page not in bm._page_to_sig, "stale page mapping should be cleaned"
# ── Integration tests (real browser) ────────────────────────────────────
@pytest.fixture
def event_loop():
    """Provide a fresh event loop per test, installed as the current loop.

    FIX: the original created a loop but never called asyncio.set_event_loop,
    so code invoking asyncio.get_event_loop() during a test would not see it,
    and a closed loop could linger as the thread's current loop. We install
    the loop for the test's duration and clear it on teardown.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    yield loop
    loop.close()
    asyncio.set_event_loop(None)
def run(coro):
    """Run an async function synchronously.

    Uses asyncio.run(), which — beyond the original new_event_loop /
    run_until_complete / close sequence — also cancels leftover tasks and
    shuts down async generators before closing the loop, so a test that
    spawns background tasks cannot leak them into the next test.
    """
    return asyncio.run(coro)
class TestRealBrowserContextLifecycle:
    """Real browser tests — verify contexts aren't leaked.

    Each test drives AsyncWebCrawler against raw: HTML (no network access)
    and then inspects the BrowserManager's context-tracking dicts.
    """

    def test_varying_configs_same_context(self):
        """Different non-context fields should reuse the same context."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                # Crawl with different non-context configs
                html = "<html><body><p>Hello World with enough words to pass threshold</p></body></html>"
                for wct in [10, 50, 200]:
                    config = CrawlerRunConfig(word_count_threshold=wct)
                    result = await crawler.arun(f"raw:{html}", config=config)
                    assert result.success
                # Should have at most 1 context (all configs hash the same)
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 1, (
                    f"Expected 1 context for identical browser config, got {ctx_count}"
                )
        run(_test())

    def test_batch_crawl_reuses_context(self):
        """Multiple URLs with same config should reuse a single context."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html1 = "<html><body><p>Page one content here</p></body></html>"
                html2 = "<html><body><p>Page two content here</p></body></html>"
                html3 = "<html><body><p>Page three content here</p></body></html>"
                config = CrawlerRunConfig()
                for h in [html1, html2, html3]:
                    result = await crawler.arun(f"raw:{h}", config=config)
                    assert result.success
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 1, f"Batch should reuse context, got {ctx_count}"
        run(_test())

    def test_refcount_drops_to_zero_after_crawl(self):
        """After a crawl completes, the context refcount should be 0."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html = "<html><body><p>Test content</p></body></html>"
                config = CrawlerRunConfig()
                result = await crawler.arun(f"raw:{html}", config=config)
                assert result.success
                # All refcounts should be 0 after crawl completes
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, (
                        f"Refcount for {sig[:8]} should be 0 after crawl, got {count}"
                    )
        run(_test())

    def test_page_to_sig_cleaned_after_crawl(self):
        """After crawl, the page->sig mapping should be empty (pages released)."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html = "<html><body><p>Test</p></body></html>"
                result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig())
                assert result.success
                assert len(bm._page_to_sig) == 0, (
                    f"Expected empty _page_to_sig after crawl, got {len(bm._page_to_sig)} entries"
                )
        run(_test())

    def test_concurrent_crawls_refcount_tracking(self):
        """Concurrent crawls should all properly increment/decrement refcounts."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                config = CrawlerRunConfig()
                htmls = [
                    f"raw:<html><body><p>Concurrent page {i}</p></body></html>"
                    for i in range(5)
                ]
                tasks = [crawler.arun(h, config=config) for h in htmls]
                results = await asyncio.gather(*tasks)
                for r in results:
                    assert r.success
                # All done — refcounts should be 0
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, (
                        f"After concurrent crawls, refcount for {sig[:8]} = {count}"
                    )
                assert len(bm._page_to_sig) == 0
        run(_test())

    def test_lru_eviction_real_browser(self):
        """Verify LRU eviction actually closes contexts when limit exceeded."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                bm._max_contexts = 2  # Low limit to trigger eviction
                html = "<html><body><p>Test</p></body></html>"
                # Crawl with 4 different locales → 4 different context signatures
                for locale in ["en-US", "fr-FR", "de-DE", "ja-JP"]:
                    config = CrawlerRunConfig(locale=locale)
                    result = await crawler.arun(f"raw:{html}", config=config)
                    assert result.success
                # Should have at most 2 contexts (limit)
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 2, (
                    f"Expected <= 2 contexts (limit), got {ctx_count}"
                )
                # Refcounts should all be 0
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, f"refcount {sig[:8]} = {count}"
        run(_test())

    def test_close_clears_everything(self):
        """close() should clear all tracking dicts."""
        async def _test():
            crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
            await crawler.start()
            bm = crawler.crawler_strategy.browser_manager
            try:
                html = "<html><body><p>Test</p></body></html>"
                result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig())
                assert result.success
            finally:
                # FIX: always close — previously a failing assert above skipped
                # close() entirely, leaking the browser process into later tests.
                await crawler.close()
            # After close(), every tracking structure must be empty.
            assert len(bm.contexts_by_config) == 0
            assert len(bm._context_refcounts) == 0
            assert len(bm._context_last_used) == 0
            assert len(bm._page_to_sig) == 0
        run(_test())