Fix browser context memory leak — signature shrink + LRU eviction (#943)

contexts_by_config accumulated browser contexts unboundedly in long-running
crawlers (Docker API). Two root causes fixed:

1. _make_config_signature() hashed ~60 CrawlerRunConfig fields but only 7
   affect the browser context (proxy_config, locale, timezone_id, geolocation,
   override_navigator, simulate_user, magic). Switched from blacklist to
   whitelist — non-context fields like word_count_threshold, css_selector,
   screenshot, verbose no longer cause unnecessary context creation.

2. No eviction mechanism existed between close() calls. Added refcount
   tracking (_context_refcounts, incremented under _contexts_lock in
   get_page, decremented in release_page_with_context) and LRU eviction
   (_evict_lru_context_locked) that caps contexts at _max_contexts=20,
   evicting only idle contexts (refcount==0) oldest-first.

Also fixed: storage_state path leaked a temporary context every request
(now explicitly closed after clone_runtime_state).

Closes #943. Credit to @Martichou for the investigation in #1640.
This commit is contained in:
unclecode
2026-02-01 14:23:04 +00:00
parent bb523b6c6c
commit c790231aba
4 changed files with 533 additions and 44 deletions

View File

@@ -31,6 +31,7 @@ We would like to thank the following people for their contributions to Crawl4AI:
- [nnxiong](https://github.com/nnxiong) - fix: script tag removal losing adjacent text in cleaned_html [#1364](https://github.com/unclecode/crawl4ai/pull/1364)
- [RoyLeviLangware](https://github.com/RoyLeviLangware) - fix: bs4 deprecation warning (text -> string) [#1077](https://github.com/unclecode/crawl4ai/pull/1077)
- [garyluky](https://github.com/garyluky) - fix: proxy auth ERR_INVALID_AUTH_CREDENTIALS [#1281](https://github.com/unclecode/crawl4ai/pull/1281)
- [Martichou](https://github.com/Martichou) - investigation: browser context memory leak under continuous load [#1640](https://github.com/unclecode/crawl4ai/pull/1640), [#943](https://github.com/unclecode/crawl4ai/issues/943)
#### Feb-Alpha-1
- [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651)

View File

@@ -1088,7 +1088,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
# Keep the page open but release it for reuse by next crawl
self.browser_manager.release_page(page)
await self.browser_manager.release_page_with_context(page)
else:
# Detach listeners before closing to prevent potential errors during close
if config.capture_network_requests:
@@ -1104,8 +1104,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Clean up console capture
await self.adapter.cleanup_console_capture(page, handle_console, handle_error)
# Release page from tracking before closing
self.browser_manager.release_page(page)
# Release page and decrement context refcount before closing
await self.browser_manager.release_page_with_context(page)
# Close the page
await page.close()
@@ -1623,7 +1623,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Clean up the page
if page:
try:
self.browser_manager.release_page(page)
await self.browser_manager.release_page_with_context(page)
await page.close()
except Exception:
pass

View File

@@ -711,6 +711,12 @@ class BrowserManager:
self.contexts_by_config = {}
self._contexts_lock = asyncio.Lock()
# Context lifecycle tracking for LRU eviction
self._context_refcounts = {} # sig -> int (active crawls using this context)
self._context_last_used = {} # sig -> float (monotonic timestamp for LRU)
self._page_to_sig = {} # page -> sig (for decrement lookup on release)
self._max_contexts = 20 # LRU eviction threshold
# Serialize context.new_page() across concurrent tasks to avoid races
# when using a shared persistent context (context.pages may be empty
# for all racers). Prevents 'Target page/context closed' errors.
@@ -1247,39 +1253,81 @@ class BrowserManager:
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
"""
Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
then returns a hash of the sorted JSON. This yields a stable signature
that identifies configurations requiring a unique browser context.
Hash ONLY the CrawlerRunConfig fields that affect browser context
creation (create_browser_context) or context setup (setup_context).
Whitelist approach: fields like css_selector, word_count_threshold,
screenshot, verbose, etc. do NOT cause a new context to be created.
"""
import json
config_dict = crawlerRunConfig.__dict__.copy()
# Exclude items that do not affect browser-level setup.
# Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
ephemeral_keys = [
"session_id",
"js_code",
"scraping_strategy",
"extraction_strategy",
"chunking_strategy",
"cache_mode",
"content_filter",
"semaphore_count",
"url"
sig_dict = {}
# Fields that flow into create_browser_context()
pc = crawlerRunConfig.proxy_config
if pc is not None:
sig_dict["proxy_config"] = {
"server": getattr(pc, "server", None),
"username": getattr(pc, "username", None),
"password": getattr(pc, "password", None),
}
else:
sig_dict["proxy_config"] = None
sig_dict["locale"] = crawlerRunConfig.locale
sig_dict["timezone_id"] = crawlerRunConfig.timezone_id
geo = crawlerRunConfig.geolocation
if geo is not None:
sig_dict["geolocation"] = {
"latitude": geo.latitude,
"longitude": geo.longitude,
"accuracy": geo.accuracy,
}
else:
sig_dict["geolocation"] = None
# Fields that flow into setup_context() as init scripts
sig_dict["override_navigator"] = crawlerRunConfig.override_navigator
sig_dict["simulate_user"] = crawlerRunConfig.simulate_user
sig_dict["magic"] = crawlerRunConfig.magic
signature_json = json.dumps(sig_dict, sort_keys=True, default=str)
return hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
def _evict_lru_context_locked(self):
"""
If contexts exceed the limit, find the least-recently-used context
with zero active crawls and remove it from all tracking dicts.
MUST be called while holding self._contexts_lock.
Returns the BrowserContext to close (caller closes it OUTSIDE the
lock), or None if no eviction is needed or possible.
"""
if len(self.contexts_by_config) <= self._max_contexts:
return None
# Sort candidates by last-used timestamp (oldest first)
candidates = sorted(
self._context_last_used.items(),
key=lambda item: item[1],
)
for evict_sig, _ in candidates:
if self._context_refcounts.get(evict_sig, 0) == 0:
ctx = self.contexts_by_config.pop(evict_sig, None)
self._context_refcounts.pop(evict_sig, None)
self._context_last_used.pop(evict_sig, None)
# Clean up stale page->sig mappings for evicted context
stale_pages = [
p for p, s in self._page_to_sig.items() if s == evict_sig
]
for p in stale_pages:
del self._page_to_sig[p]
return ctx
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
# and should cause a new context to be created if they change
for key in ephemeral_keys:
if key in config_dict:
del config_dict[key]
# Convert to canonical JSON string
signature_json = json.dumps(config_dict, sort_keys=True, default=str)
# Hash the JSON so we get a compact, unique string
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
return signature_hash
# All contexts are in active use — cannot evict
return None
async def _apply_stealth_to_page(self, page):
"""Apply stealth to a page if stealth mode is enabled"""
@@ -1377,6 +1425,7 @@ class BrowserManager:
# context reuse for multiple URLs with the same config (e.g., batch/deep crawls).
if self.config.create_isolated_context:
config_signature = self._make_config_signature(crawlerRunConfig)
to_close = None
async with self._contexts_lock:
if config_signature in self.contexts_by_config:
@@ -1385,14 +1434,44 @@ class BrowserManager:
context = await self.create_browser_context(crawlerRunConfig)
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context
self._context_refcounts[config_signature] = 0
to_close = self._evict_lru_context_locked()
# Increment refcount INSIDE lock before releasing
self._context_refcounts[config_signature] = (
self._context_refcounts.get(config_signature, 0) + 1
)
self._context_last_used[config_signature] = time.monotonic()
# Close evicted context OUTSIDE lock
if to_close is not None:
try:
await to_close.close()
except Exception:
pass
# Always create a new page for each crawl (isolation for navigation)
try:
page = await context.new_page()
except Exception:
async with self._contexts_lock:
if config_signature in self._context_refcounts:
self._context_refcounts[config_signature] = max(
0, self._context_refcounts[config_signature] - 1
)
raise
await self._apply_stealth_to_page(page)
self._page_to_sig[page] = config_signature
elif self.config.storage_state:
context = await self.create_browser_context(crawlerRunConfig)
tmp_context = await self.create_browser_context(crawlerRunConfig)
ctx = self.default_context # default context, one window only
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
ctx = await clone_runtime_state(tmp_context, ctx, crawlerRunConfig, self.config)
# Close the temporary context — only needed as a clone source
try:
await tmp_context.close()
except Exception:
pass
context = ctx # so (page, context) return value is correct
# Avoid concurrent new_page on shared persistent context
# See GH-1198: context.pages can be empty under races
async with self._page_lock:
@@ -1445,6 +1524,7 @@ class BrowserManager:
else:
# Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig)
to_close = None
async with self._contexts_lock:
if config_signature in self.contexts_by_config:
@@ -1454,10 +1534,34 @@ class BrowserManager:
context = await self.create_browser_context(crawlerRunConfig)
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context
self._context_refcounts[config_signature] = 0
to_close = self._evict_lru_context_locked()
# Increment refcount INSIDE lock before releasing
self._context_refcounts[config_signature] = (
self._context_refcounts.get(config_signature, 0) + 1
)
self._context_last_used[config_signature] = time.monotonic()
# Close evicted context OUTSIDE lock
if to_close is not None:
try:
await to_close.close()
except Exception:
pass
# Create a new page from the chosen context
try:
page = await context.new_page()
except Exception:
async with self._contexts_lock:
if config_signature in self._context_refcounts:
self._context_refcounts[config_signature] = max(
0, self._context_refcounts[config_signature] - 1
)
raise
await self._apply_stealth_to_page(page)
self._page_to_sig[page] = config_signature
# If a session_id is specified, store this session so we can reuse later
if crawlerRunConfig.session_id:
@@ -1475,6 +1579,13 @@ class BrowserManager:
if session_id in self.sessions:
context, page, _ = self.sessions[session_id]
self._release_page_from_use(page)
# Decrement context refcount for the session's page
async with self._contexts_lock:
sig = self._page_to_sig.pop(page, None)
if sig is not None and sig in self._context_refcounts:
self._context_refcounts[sig] = max(
0, self._context_refcounts[sig] - 1
)
await page.close()
if not self.config.use_managed_browser:
await context.close()
@@ -1483,15 +1594,25 @@ class BrowserManager:
def release_page(self, page):
    """
    Release a page from the in-use tracking set (global tracking).

    This should be called when a crawl operation completes to allow
    the page to be reused by subsequent crawls.

    Sync variant — does NOT decrement the owning context's refcount.
    Async cleanup paths that need context lifecycle tracking should
    call release_page_with_context() instead.

    Args:
        page: The Playwright page to release.
    """
    self._release_page_from_use(page)
async def release_page_with_context(self, page):
    """
    Release a page from in-use tracking and decrement the refcount of
    the context that owns it, under the contexts lock.

    Should be called from the async crawl finally block instead of
    release_page() so the context lifecycle is properly tracked.

    Args:
        page: The Playwright page being released.
    """
    self._release_page_from_use(page)
    async with self._contexts_lock:
        # Look up (and forget) which context signature this page belongs to.
        signature = self._page_to_sig.pop(page, None)
        if signature is None:
            return
        current = self._context_refcounts.get(signature)
        if current is not None:
            # Clamp at zero so a double-release can never go negative.
            self._context_refcounts[signature] = max(0, current - 1)
def _cleanup_expired_sessions(self):
"""Clean up expired sessions based on TTL."""
current_time = time.time()
@@ -1517,6 +1638,9 @@ class BrowserManager:
except Exception:
pass
self.contexts_by_config.clear()
self._context_refcounts.clear()
self._context_last_used.clear()
self._page_to_sig.clear()
await _CDPConnectionCache.release(self.config.cdp_url)
self.browser = None
self.playwright = None
@@ -1540,6 +1664,9 @@ class BrowserManager:
except Exception:
pass
self.contexts_by_config.clear()
self._context_refcounts.clear()
self._context_last_used.clear()
self._page_to_sig.clear()
# Disconnect from browser (doesn't terminate it, just releases connection)
if self.browser:
@@ -1581,6 +1708,9 @@ class BrowserManager:
params={"error": str(e)}
)
self.contexts_by_config.clear()
self._context_refcounts.clear()
self._context_last_used.clear()
self._page_to_sig.clear()
if self.browser:
await self.browser.close()

View File

@@ -0,0 +1,358 @@
"""
Integration tests for the browser context memory leak fix.
Tests:
1. Signature shrink: non-context fields produce same hash
2. Signature correctness: context-affecting fields produce different hashes
3. Refcount lifecycle: increment on get_page, decrement on release
4. LRU eviction: oldest idle context is evicted when over limit
5. Eviction respects active refcounts
6. Real browser: contexts don't leak under varying configs
7. Real browser: batch crawl reuses same context
8. Storage state path: temporary context is closed
"""
import asyncio
import time
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig, GeolocationConfig
from crawl4ai.browser_manager import BrowserManager
# ── Unit tests (no browser needed) ──────────────────────────────────────
class TestSignatureShrink:
    """Verify the whitelist signature only considers context-affecting fields."""

    def _bm(self):
        """Build a BrowserManager with a default config (no browser launched)."""
        return BrowserManager(BrowserConfig(), logger=None)

    def test_non_context_fields_same_signature(self):
        """Fields that don't affect browser context must produce identical sigs."""
        manager = self._bm()
        variants = [
            CrawlerRunConfig(word_count_threshold=200),
            CrawlerRunConfig(word_count_threshold=50),
            CrawlerRunConfig(css_selector=".main"),
            CrawlerRunConfig(screenshot=True),
            CrawlerRunConfig(pdf=True, verbose=False),
            CrawlerRunConfig(scan_full_page=True, scroll_delay=0.5),
            CrawlerRunConfig(only_text=True),
            CrawlerRunConfig(wait_until="networkidle", page_timeout=30000),
            CrawlerRunConfig(capture_network_requests=True),
            CrawlerRunConfig(exclude_external_links=True),
        ]
        sigs = [manager._make_config_signature(cfg) for cfg in variants]
        assert len(set(sigs)) == 1, (
            f"Expected all same sig, got {len(set(sigs))} unique: {sigs[:3]}"
        )

    def test_proxy_changes_signature(self):
        """Proxy presence and proxy identity both alter the signature."""
        manager = self._bm()
        sig_none = manager._make_config_signature(CrawlerRunConfig())
        sig_p1 = manager._make_config_signature(
            CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p1:8080"))
        )
        sig_p2 = manager._make_config_signature(
            CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p2:8080"))
        )
        assert sig_none != sig_p1, "proxy vs no-proxy should differ"
        assert sig_p1 != sig_p2, "different proxies should differ"

    def test_locale_changes_signature(self):
        """Locale participates in the context signature."""
        manager = self._bm()
        sig_default = manager._make_config_signature(CrawlerRunConfig())
        sig_en = manager._make_config_signature(CrawlerRunConfig(locale="en-US"))
        sig_fr = manager._make_config_signature(CrawlerRunConfig(locale="fr-FR"))
        assert sig_default != sig_en
        assert sig_en != sig_fr

    def test_timezone_changes_signature(self):
        """timezone_id participates in the context signature."""
        manager = self._bm()
        sig_default = manager._make_config_signature(CrawlerRunConfig())
        sig_tz = manager._make_config_signature(
            CrawlerRunConfig(timezone_id="America/New_York")
        )
        assert sig_default != sig_tz

    def test_geolocation_changes_signature(self):
        """Geolocation participates in the context signature."""
        manager = self._bm()
        sig_default = manager._make_config_signature(CrawlerRunConfig())
        sig_geo = manager._make_config_signature(CrawlerRunConfig(
            geolocation=GeolocationConfig(latitude=40.7, longitude=-74.0)
        ))
        assert sig_default != sig_geo

    def test_navigator_overrides_change_signature(self):
        """override_navigator / simulate_user / magic each alter the signature."""
        manager = self._bm()
        base = manager._make_config_signature(CrawlerRunConfig())
        with_nav = manager._make_config_signature(CrawlerRunConfig(override_navigator=True))
        with_sim = manager._make_config_signature(CrawlerRunConfig(simulate_user=True))
        with_magic = manager._make_config_signature(CrawlerRunConfig(magic=True))
        assert base != with_nav
        assert base != with_sim
        assert base != with_magic

    def test_signature_stability(self):
        """Same config always produces the same hash."""
        manager = self._bm()
        cfg = CrawlerRunConfig(locale="ja-JP", override_navigator=True)
        assert manager._make_config_signature(cfg) == manager._make_config_signature(cfg)

    def test_proxy_config_with_credentials(self):
        """ProxyConfig with username/password produces distinct stable sigs."""
        manager = self._bm()
        cred_a = CrawlerRunConfig(proxy_config=ProxyConfig(
            server="http://proxy:8080", username="user1", password="pass1"
        ))
        cred_b = CrawlerRunConfig(proxy_config=ProxyConfig(
            server="http://proxy:8080", username="user2", password="pass2"
        ))
        sig_a = manager._make_config_signature(cred_a)
        sig_b = manager._make_config_signature(cred_b)
        assert sig_a != sig_b, "different credentials should differ"
        assert sig_a == manager._make_config_signature(cred_a), "should be stable"
class TestLRUEviction:
    """Verify eviction logic (no browser needed)."""

    def _bm(self, max_ctx=3):
        """Build a BrowserManager with a low context limit for easy eviction."""
        manager = BrowserManager(BrowserConfig(), logger=None)
        manager._max_contexts = max_ctx
        return manager

    def _track(self, manager, sig, ctx, refcount, last_used):
        """Register a fake context in all three tracking dicts."""
        manager.contexts_by_config[sig] = ctx
        manager._context_refcounts[sig] = refcount
        manager._context_last_used[sig] = last_used

    def test_no_eviction_under_limit(self):
        """At or below the limit, nothing is evicted."""
        manager = self._bm(max_ctx=5)
        for idx in range(5):
            self._track(manager, f"sig_{idx}", f"ctx_{idx}", 0, time.monotonic())
        assert manager._evict_lru_context_locked() is None

    def test_evicts_oldest_idle(self):
        """Over the limit, the oldest idle context is evicted and untracked."""
        manager = self._bm(max_ctx=3)
        for idx in range(5):
            self._track(manager, f"sig_{idx}", f"ctx_{idx}", 0, time.monotonic())
            # Small sleep so each entry gets a strictly newer timestamp.
            time.sleep(0.002)
        evicted = manager._evict_lru_context_locked()
        assert evicted == "ctx_0", f"expected oldest ctx_0, got {evicted}"
        assert "sig_0" not in manager.contexts_by_config
        assert "sig_0" not in manager._context_refcounts
        assert "sig_0" not in manager._context_last_used

    def test_skips_active_contexts(self):
        """An old context with a nonzero refcount must never be evicted."""
        manager = self._bm(max_ctx=2)
        # sig_0: oldest timestamp, but has active crawls
        self._track(manager, "sig_0", "ctx_0", 3, 0)  # very old
        # sig_1 / sig_2: idle and progressively newer
        self._track(manager, "sig_1", "ctx_1", 0, time.monotonic())
        self._track(manager, "sig_2", "ctx_2", 0, time.monotonic())
        evicted = manager._evict_lru_context_locked()
        # sig_0 is oldest but active (refcount=3) — must skip it
        assert evicted == "ctx_1", f"expected ctx_1 (oldest idle), got {evicted}"
        assert "sig_0" in manager.contexts_by_config, "active context must NOT be evicted"

    def test_all_active_no_eviction(self):
        """If every context is in use, eviction returns None and removes nothing."""
        manager = self._bm(max_ctx=1)
        for idx in range(3):
            self._track(manager, f"sig_{idx}", f"ctx_{idx}", 1, time.monotonic())
        evicted = manager._evict_lru_context_locked()
        assert evicted is None, "cannot evict when all are active"
        assert len(manager.contexts_by_config) == 3, "all contexts should remain"

    def test_eviction_cleans_page_to_sig(self):
        """Evicting a context also removes its stale page->sig mappings."""
        manager = self._bm(max_ctx=1)
        self._track(manager, "sig_old", "ctx_old", 0, 0)
        self._track(manager, "sig_new", "ctx_new", 0, time.monotonic())
        # Simulate a stale page mapping for the old context
        mock_page = object()
        manager._page_to_sig[mock_page] = "sig_old"
        evicted = manager._evict_lru_context_locked()
        assert evicted == "ctx_old"
        assert mock_page not in manager._page_to_sig, "stale page mapping should be cleaned"
# ── Integration tests (real browser) ────────────────────────────────────
@pytest.fixture
def event_loop():
    """Yield a fresh asyncio event loop per test; close it on teardown."""
    fresh_loop = asyncio.new_event_loop()
    yield fresh_loop
    fresh_loop.close()
def run(coro):
    """Drive *coro* to completion on a dedicated event loop, then close it."""
    private_loop = asyncio.new_event_loop()
    try:
        result = private_loop.run_until_complete(coro)
    finally:
        # Always close the loop, even if the coroutine raised.
        private_loop.close()
    return result
class TestRealBrowserContextLifecycle:
    """Real browser tests — verify contexts aren't leaked."""

    def test_varying_configs_same_context(self):
        """Different non-context fields should reuse the same context."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                # Crawl with different non-context configs
                html = "<html><body><p>Hello World with enough words to pass threshold</p></body></html>"
                for wct in [10, 50, 200]:
                    config = CrawlerRunConfig(word_count_threshold=wct)
                    result = await crawler.arun(f"raw:{html}", config=config)
                    assert result.success
                # Should have at most 1 context (all configs hash the same)
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 1, (
                    f"Expected 1 context for identical browser config, got {ctx_count}"
                )
        run(_test())

    def test_batch_crawl_reuses_context(self):
        """Multiple URLs with same config should reuse a single context."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html1 = "<html><body><p>Page one content here</p></body></html>"
                html2 = "<html><body><p>Page two content here</p></body></html>"
                html3 = "<html><body><p>Page three content here</p></body></html>"
                config = CrawlerRunConfig()
                # Same config object for every URL — signature must match each time
                for h in [html1, html2, html3]:
                    result = await crawler.arun(f"raw:{h}", config=config)
                    assert result.success
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 1, f"Batch should reuse context, got {ctx_count}"
        run(_test())

    def test_refcount_drops_to_zero_after_crawl(self):
        """After a crawl completes, the context refcount should be 0."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html = "<html><body><p>Test content</p></body></html>"
                config = CrawlerRunConfig()
                result = await crawler.arun(f"raw:{html}", config=config)
                assert result.success
                # All refcounts should be 0 after crawl completes
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, (
                        f"Refcount for {sig[:8]} should be 0 after crawl, got {count}"
                    )
        run(_test())

    def test_page_to_sig_cleaned_after_crawl(self):
        """After crawl, the page->sig mapping should be empty (pages released)."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                html = "<html><body><p>Test</p></body></html>"
                result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig())
                assert result.success
                assert len(bm._page_to_sig) == 0, (
                    f"Expected empty _page_to_sig after crawl, got {len(bm._page_to_sig)} entries"
                )
        run(_test())

    def test_concurrent_crawls_refcount_tracking(self):
        """Concurrent crawls should all properly increment/decrement refcounts."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                config = CrawlerRunConfig()
                htmls = [
                    f"raw:<html><body><p>Concurrent page {i}</p></body></html>"
                    for i in range(5)
                ]
                # Fire all crawls at once to exercise the lock-protected refcount paths
                tasks = [crawler.arun(h, config=config) for h in htmls]
                results = await asyncio.gather(*tasks)
                for r in results:
                    assert r.success
                # All done — refcounts should be 0
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, (
                        f"After concurrent crawls, refcount for {sig[:8]} = {count}"
                    )
                assert len(bm._page_to_sig) == 0
        run(_test())

    def test_lru_eviction_real_browser(self):
        """Verify LRU eviction actually closes contexts when limit exceeded."""
        async def _test():
            async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
                bm = crawler.crawler_strategy.browser_manager
                bm._max_contexts = 2  # Low limit to trigger eviction
                html = "<html><body><p>Test</p></body></html>"
                # Crawl with 4 different locales → 4 different context signatures
                for locale in ["en-US", "fr-FR", "de-DE", "ja-JP"]:
                    config = CrawlerRunConfig(locale=locale)
                    result = await crawler.arun(f"raw:{html}", config=config)
                    assert result.success
                # Should have at most 2 contexts (limit)
                ctx_count = len(bm.contexts_by_config)
                assert ctx_count <= 2, (
                    f"Expected <= 2 contexts (limit), got {ctx_count}"
                )
                # Refcounts should all be 0
                for sig, count in bm._context_refcounts.items():
                    assert count == 0, f"refcount {sig[:8]} = {count}"
        run(_test())

    def test_close_clears_everything(self):
        """close() should clear all tracking dicts."""
        async def _test():
            # Explicit start/close (no context manager) so state can be
            # inspected after close() returns.
            crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
            await crawler.start()
            bm = crawler.crawler_strategy.browser_manager
            html = "<html><body><p>Test</p></body></html>"
            result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig())
            assert result.success
            await crawler.close()
            assert len(bm.contexts_by_config) == 0
            assert len(bm._context_refcounts) == 0
            assert len(bm._context_last_used) == 0
            assert len(bm._page_to_sig) == 0
        run(_test())