diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index adb45f4a..ca444761 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -31,6 +31,7 @@ We would like to thank the following people for their contributions to Crawl4AI: - [nnxiong](https://github.com/nnxiong) - fix: script tag removal losing adjacent text in cleaned_html [#1364](https://github.com/unclecode/crawl4ai/pull/1364) - [RoyLeviLangware](https://github.com/RoyLeviLangware) - fix: bs4 deprecation warning (text -> string) [#1077](https://github.com/unclecode/crawl4ai/pull/1077) - [garyluky](https://github.com/garyluky) - fix: proxy auth ERR_INVALID_AUTH_CREDENTIALS [#1281](https://github.com/unclecode/crawl4ai/pull/1281) +- [Martichou](https://github.com/Martichou) - investigation: browser context memory leak under continuous load [#1640](https://github.com/unclecode/crawl4ai/pull/1640), [#943](https://github.com/unclecode/crawl4ai/issues/943) #### Feb-Alpha-1 - [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a501169c..b520391a 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1088,7 +1088,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): pass elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless): # Keep the page open but release it for reuse by next crawl - self.browser_manager.release_page(page) + await self.browser_manager.release_page_with_context(page) else: # Detach listeners before closing to prevent potential errors during close if config.capture_network_requests: @@ -1104,8 +1104,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Clean up console capture await self.adapter.cleanup_console_capture(page, handle_console, handle_error) - # Release page from tracking before closing - 
self.browser_manager.release_page(page) + # Release page and decrement context refcount before closing + await self.browser_manager.release_page_with_context(page) # Close the page await page.close() @@ -1623,7 +1623,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Clean up the page if page: try: - self.browser_manager.release_page(page) + await self.browser_manager.release_page_with_context(page) await page.close() except Exception: pass diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 4737610e..b56a4adb 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -710,7 +710,13 @@ class BrowserManager: # Keep track of contexts by a "config signature," so each unique config reuses a single context self.contexts_by_config = {} self._contexts_lock = asyncio.Lock() - + + # Context lifecycle tracking for LRU eviction + self._context_refcounts = {} # sig -> int (active crawls using this context) + self._context_last_used = {} # sig -> float (monotonic timestamp for LRU) + self._page_to_sig = {} # page -> sig (for decrement lookup on release) + self._max_contexts = 20 # LRU eviction threshold + # Serialize context.new_page() across concurrent tasks to avoid races # when using a shared persistent context (context.pages may be empty # for all racers). Prevents 'Target page/context closed' errors. @@ -1247,39 +1253,81 @@ class BrowserManager: def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: """ - Converts the crawlerRunConfig into a dict, excludes ephemeral fields, - then returns a hash of the sorted JSON. This yields a stable signature - that identifies configurations requiring a unique browser context. + Hash ONLY the CrawlerRunConfig fields that affect browser context + creation (create_browser_context) or context setup (setup_context). + + Whitelist approach: fields like css_selector, word_count_threshold, + screenshot, verbose, etc. do NOT cause a new context to be created. 
""" import json - config_dict = crawlerRunConfig.__dict__.copy() - # Exclude items that do not affect browser-level setup. - # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config. - ephemeral_keys = [ - "session_id", - "js_code", - "scraping_strategy", - "extraction_strategy", - "chunking_strategy", - "cache_mode", - "content_filter", - "semaphore_count", - "url" - ] - - # Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context - # and should cause a new context to be created if they change - - for key in ephemeral_keys: - if key in config_dict: - del config_dict[key] - # Convert to canonical JSON string - signature_json = json.dumps(config_dict, sort_keys=True, default=str) + sig_dict = {} - # Hash the JSON so we get a compact, unique string - signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() - return signature_hash + # Fields that flow into create_browser_context() + pc = crawlerRunConfig.proxy_config + if pc is not None: + sig_dict["proxy_config"] = { + "server": getattr(pc, "server", None), + "username": getattr(pc, "username", None), + "password": getattr(pc, "password", None), + } + else: + sig_dict["proxy_config"] = None + + sig_dict["locale"] = crawlerRunConfig.locale + sig_dict["timezone_id"] = crawlerRunConfig.timezone_id + + geo = crawlerRunConfig.geolocation + if geo is not None: + sig_dict["geolocation"] = { + "latitude": geo.latitude, + "longitude": geo.longitude, + "accuracy": geo.accuracy, + } + else: + sig_dict["geolocation"] = None + + # Fields that flow into setup_context() as init scripts + sig_dict["override_navigator"] = crawlerRunConfig.override_navigator + sig_dict["simulate_user"] = crawlerRunConfig.simulate_user + sig_dict["magic"] = crawlerRunConfig.magic + + signature_json = json.dumps(sig_dict, sort_keys=True, default=str) + return hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + + def _evict_lru_context_locked(self): 
+ """ + If contexts exceed the limit, find the least-recently-used context + with zero active crawls and remove it from all tracking dicts. + + MUST be called while holding self._contexts_lock. + + Returns the BrowserContext to close (caller closes it OUTSIDE the + lock), or None if no eviction is needed or possible. + """ + if len(self.contexts_by_config) <= self._max_contexts: + return None + + # Sort candidates by last-used timestamp (oldest first) + candidates = sorted( + self._context_last_used.items(), + key=lambda item: item[1], + ) + for evict_sig, _ in candidates: + if self._context_refcounts.get(evict_sig, 0) == 0: + ctx = self.contexts_by_config.pop(evict_sig, None) + self._context_refcounts.pop(evict_sig, None) + self._context_last_used.pop(evict_sig, None) + # Clean up stale page->sig mappings for evicted context + stale_pages = [ + p for p, s in self._page_to_sig.items() if s == evict_sig + ] + for p in stale_pages: + del self._page_to_sig[p] + return ctx + + # All contexts are in active use — cannot evict + return None async def _apply_stealth_to_page(self, page): """Apply stealth to a page if stealth mode is enabled""" @@ -1377,6 +1425,7 @@ class BrowserManager: # context reuse for multiple URLs with the same config (e.g., batch/deep crawls). 
if self.config.create_isolated_context: config_signature = self._make_config_signature(crawlerRunConfig) + to_close = None async with self._contexts_lock: if config_signature in self.contexts_by_config: @@ -1385,14 +1434,44 @@ class BrowserManager: context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context + self._context_refcounts[config_signature] = 0 + to_close = self._evict_lru_context_locked() + + # Increment refcount INSIDE lock before releasing + self._context_refcounts[config_signature] = ( + self._context_refcounts.get(config_signature, 0) + 1 + ) + self._context_last_used[config_signature] = time.monotonic() + + # Close evicted context OUTSIDE lock + if to_close is not None: + try: + await to_close.close() + except Exception: + pass # Always create a new page for each crawl (isolation for navigation) - page = await context.new_page() + try: + page = await context.new_page() + except Exception: + async with self._contexts_lock: + if config_signature in self._context_refcounts: + self._context_refcounts[config_signature] = max( + 0, self._context_refcounts[config_signature] - 1 + ) + raise await self._apply_stealth_to_page(page) + self._page_to_sig[page] = config_signature elif self.config.storage_state: - context = await self.create_browser_context(crawlerRunConfig) + tmp_context = await self.create_browser_context(crawlerRunConfig) ctx = self.default_context # default context, one window only - ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config) + ctx = await clone_runtime_state(tmp_context, ctx, crawlerRunConfig, self.config) + # Close the temporary context — only needed as a clone source + try: + await tmp_context.close() + except Exception: + pass + context = ctx # so (page, context) return value is correct # Avoid concurrent new_page on shared persistent context # See GH-1198: context.pages can be empty under races async 
with self._page_lock: @@ -1445,6 +1524,7 @@ class BrowserManager: else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) + to_close = None async with self._contexts_lock: if config_signature in self.contexts_by_config: @@ -1454,10 +1534,34 @@ class BrowserManager: context = await self.create_browser_context(crawlerRunConfig) await self.setup_context(context, crawlerRunConfig) self.contexts_by_config[config_signature] = context + self._context_refcounts[config_signature] = 0 + to_close = self._evict_lru_context_locked() + + # Increment refcount INSIDE lock before releasing + self._context_refcounts[config_signature] = ( + self._context_refcounts.get(config_signature, 0) + 1 + ) + self._context_last_used[config_signature] = time.monotonic() + + # Close evicted context OUTSIDE lock + if to_close is not None: + try: + await to_close.close() + except Exception: + pass # Create a new page from the chosen context - page = await context.new_page() + try: + page = await context.new_page() + except Exception: + async with self._contexts_lock: + if config_signature in self._context_refcounts: + self._context_refcounts[config_signature] = max( + 0, self._context_refcounts[config_signature] - 1 + ) + raise await self._apply_stealth_to_page(page) + self._page_to_sig[page] = config_signature # If a session_id is specified, store this session so we can reuse later if crawlerRunConfig.session_id: @@ -1475,6 +1579,13 @@ class BrowserManager: if session_id in self.sessions: context, page, _ = self.sessions[session_id] self._release_page_from_use(page) + # Decrement context refcount for the session's page + async with self._contexts_lock: + sig = self._page_to_sig.pop(page, None) + if sig is not None and sig in self._context_refcounts: + self._context_refcounts[sig] = max( + 0, self._context_refcounts[sig] - 1 + ) await page.close() if not self.config.use_managed_browser: await context.close() @@ 
-1483,15 +1594,25 @@ class BrowserManager: def release_page(self, page): """ Release a page from the in-use tracking set (global tracking). - - This should be called when a crawl operation completes to allow - the page to be reused by subsequent crawls. - - Args: - page: The Playwright page to release. + Sync variant — does NOT decrement context refcount. """ self._release_page_from_use(page) + async def release_page_with_context(self, page): + """ + Release a page and decrement its context's refcount under the lock. + + Should be called from the async crawl finally block instead of + release_page() so the context lifecycle is properly tracked. + """ + self._release_page_from_use(page) + async with self._contexts_lock: + sig = self._page_to_sig.pop(page, None) + if sig is not None and sig in self._context_refcounts: + self._context_refcounts[sig] = max( + 0, self._context_refcounts[sig] - 1 + ) + def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" current_time = time.time() @@ -1517,6 +1638,9 @@ class BrowserManager: except Exception: pass self.contexts_by_config.clear() + self._context_refcounts.clear() + self._context_last_used.clear() + self._page_to_sig.clear() await _CDPConnectionCache.release(self.config.cdp_url) self.browser = None self.playwright = None @@ -1540,6 +1664,9 @@ class BrowserManager: except Exception: pass self.contexts_by_config.clear() + self._context_refcounts.clear() + self._context_last_used.clear() + self._page_to_sig.clear() # Disconnect from browser (doesn't terminate it, just releases connection) if self.browser: @@ -1581,6 +1708,9 @@ class BrowserManager: params={"error": str(e)} ) self.contexts_by_config.clear() + self._context_refcounts.clear() + self._context_last_used.clear() + self._page_to_sig.clear() if self.browser: await self.browser.close() diff --git a/tests/browser/test_context_leak_fix.py b/tests/browser/test_context_leak_fix.py new file mode 100644 index 00000000..69b33138 --- /dev/null +++ 
b/tests/browser/test_context_leak_fix.py @@ -0,0 +1,358 @@ +""" +Integration tests for the browser context memory leak fix. + +Tests: +1. Signature shrink: non-context fields produce same hash +2. Signature correctness: context-affecting fields produce different hashes +3. Refcount lifecycle: increment on get_page, decrement on release +4. LRU eviction: oldest idle context is evicted when over limit +5. Eviction respects active refcounts +6. Real browser: contexts don't leak under varying configs +7. Real browser: batch crawl reuses same context +8. Storage state path: temporary context is closed +""" +import asyncio +import time +import pytest + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig, GeolocationConfig +from crawl4ai.browser_manager import BrowserManager + + +# ── Unit tests (no browser needed) ────────────────────────────────────── + +class TestSignatureShrink: + """Verify the whitelist signature only considers context-affecting fields.""" + + def _bm(self): + return BrowserManager(BrowserConfig(), logger=None) + + def test_non_context_fields_same_signature(self): + """Fields that don't affect browser context must produce identical sigs.""" + bm = self._bm() + configs = [ + CrawlerRunConfig(word_count_threshold=200), + CrawlerRunConfig(word_count_threshold=50), + CrawlerRunConfig(css_selector=".main"), + CrawlerRunConfig(screenshot=True), + CrawlerRunConfig(pdf=True, verbose=False), + CrawlerRunConfig(scan_full_page=True, scroll_delay=0.5), + CrawlerRunConfig(only_text=True), + CrawlerRunConfig(wait_until="networkidle", page_timeout=30000), + CrawlerRunConfig(capture_network_requests=True), + CrawlerRunConfig(exclude_external_links=True), + ] + sigs = [bm._make_config_signature(c) for c in configs] + assert len(set(sigs)) == 1, ( + f"Expected all same sig, got {len(set(sigs))} unique: {sigs[:3]}" + ) + + def test_proxy_changes_signature(self): + bm = self._bm() + c1 = 
CrawlerRunConfig() + c2 = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p1:8080")) + c3 = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://p2:8080")) + s1 = bm._make_config_signature(c1) + s2 = bm._make_config_signature(c2) + s3 = bm._make_config_signature(c3) + assert s1 != s2, "proxy vs no-proxy should differ" + assert s2 != s3, "different proxies should differ" + + def test_locale_changes_signature(self): + bm = self._bm() + s1 = bm._make_config_signature(CrawlerRunConfig()) + s2 = bm._make_config_signature(CrawlerRunConfig(locale="en-US")) + s3 = bm._make_config_signature(CrawlerRunConfig(locale="fr-FR")) + assert s1 != s2 + assert s2 != s3 + + def test_timezone_changes_signature(self): + bm = self._bm() + s1 = bm._make_config_signature(CrawlerRunConfig()) + s2 = bm._make_config_signature(CrawlerRunConfig(timezone_id="America/New_York")) + assert s1 != s2 + + def test_geolocation_changes_signature(self): + bm = self._bm() + s1 = bm._make_config_signature(CrawlerRunConfig()) + s2 = bm._make_config_signature(CrawlerRunConfig( + geolocation=GeolocationConfig(latitude=40.7, longitude=-74.0) + )) + assert s1 != s2 + + def test_navigator_overrides_change_signature(self): + bm = self._bm() + base = bm._make_config_signature(CrawlerRunConfig()) + s_nav = bm._make_config_signature(CrawlerRunConfig(override_navigator=True)) + s_sim = bm._make_config_signature(CrawlerRunConfig(simulate_user=True)) + s_mag = bm._make_config_signature(CrawlerRunConfig(magic=True)) + assert base != s_nav + assert base != s_sim + assert base != s_mag + + def test_signature_stability(self): + """Same config always produces the same hash.""" + bm = self._bm() + c = CrawlerRunConfig(locale="ja-JP", override_navigator=True) + assert bm._make_config_signature(c) == bm._make_config_signature(c) + + def test_proxy_config_with_credentials(self): + """ProxyConfig with username/password produces distinct stable sigs.""" + bm = self._bm() + c1 = 
CrawlerRunConfig(proxy_config=ProxyConfig( + server="http://proxy:8080", username="user1", password="pass1" + )) + c2 = CrawlerRunConfig(proxy_config=ProxyConfig( + server="http://proxy:8080", username="user2", password="pass2" + )) + s1 = bm._make_config_signature(c1) + s2 = bm._make_config_signature(c2) + assert s1 != s2, "different credentials should differ" + assert s1 == bm._make_config_signature(c1), "should be stable" + + +class TestLRUEviction: + """Verify eviction logic (no browser needed).""" + + def _bm(self, max_ctx=3): + bm = BrowserManager(BrowserConfig(), logger=None) + bm._max_contexts = max_ctx + return bm + + def test_no_eviction_under_limit(self): + bm = self._bm(max_ctx=5) + for i in range(5): + sig = f"sig_{i}" + bm.contexts_by_config[sig] = f"ctx_{i}" + bm._context_refcounts[sig] = 0 + bm._context_last_used[sig] = time.monotonic() + assert bm._evict_lru_context_locked() is None + + def test_evicts_oldest_idle(self): + bm = self._bm(max_ctx=3) + for i in range(5): + sig = f"sig_{i}" + bm.contexts_by_config[sig] = f"ctx_{i}" + bm._context_refcounts[sig] = 0 + bm._context_last_used[sig] = time.monotonic() + time.sleep(0.002) + + evicted = bm._evict_lru_context_locked() + assert evicted == "ctx_0", f"expected oldest ctx_0, got {evicted}" + assert "sig_0" not in bm.contexts_by_config + assert "sig_0" not in bm._context_refcounts + assert "sig_0" not in bm._context_last_used + + def test_skips_active_contexts(self): + bm = self._bm(max_ctx=2) + # sig_0: old but active + bm.contexts_by_config["sig_0"] = "ctx_0" + bm._context_refcounts["sig_0"] = 3 + bm._context_last_used["sig_0"] = 0 # very old + + # sig_1: newer, idle + bm.contexts_by_config["sig_1"] = "ctx_1" + bm._context_refcounts["sig_1"] = 0 + bm._context_last_used["sig_1"] = time.monotonic() + + # sig_2: newest, idle + bm.contexts_by_config["sig_2"] = "ctx_2" + bm._context_refcounts["sig_2"] = 0 + bm._context_last_used["sig_2"] = time.monotonic() + + evicted = bm._evict_lru_context_locked() + 
# sig_0 is oldest but active (refcount=3) — must skip it + assert evicted == "ctx_1", f"expected ctx_1 (oldest idle), got {evicted}" + assert "sig_0" in bm.contexts_by_config, "active context must NOT be evicted" + + def test_all_active_no_eviction(self): + bm = self._bm(max_ctx=1) + for i in range(3): + sig = f"sig_{i}" + bm.contexts_by_config[sig] = f"ctx_{i}" + bm._context_refcounts[sig] = 1 # all active + bm._context_last_used[sig] = time.monotonic() + + evicted = bm._evict_lru_context_locked() + assert evicted is None, "cannot evict when all are active" + assert len(bm.contexts_by_config) == 3, "all contexts should remain" + + def test_eviction_cleans_page_to_sig(self): + bm = self._bm(max_ctx=1) + bm.contexts_by_config["sig_old"] = "ctx_old" + bm._context_refcounts["sig_old"] = 0 + bm._context_last_used["sig_old"] = 0 + + bm.contexts_by_config["sig_new"] = "ctx_new" + bm._context_refcounts["sig_new"] = 0 + bm._context_last_used["sig_new"] = time.monotonic() + + # Simulate a stale page mapping for the old context + mock_page = object() + bm._page_to_sig[mock_page] = "sig_old" + + evicted = bm._evict_lru_context_locked() + assert evicted == "ctx_old" + assert mock_page not in bm._page_to_sig, "stale page mapping should be cleaned" + + +# ── Integration tests (real browser) ──────────────────────────────────── + +@pytest.fixture +def event_loop(): + loop = asyncio.new_event_loop() + yield loop + loop.close() + + +def run(coro): + """Run an async function synchronously.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +class TestRealBrowserContextLifecycle: + """Real browser tests — verify contexts aren't leaked.""" + + def test_varying_configs_same_context(self): + """Different non-context fields should reuse the same context.""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + + # Crawl with 
different non-context configs + html = "
<html><body>Hello World with enough words to pass threshold</body></html>
" + for wct in [10, 50, 200]: + config = CrawlerRunConfig(word_count_threshold=wct) + result = await crawler.arun(f"raw:{html}", config=config) + assert result.success + + # Should have at most 1 context (all configs hash the same) + ctx_count = len(bm.contexts_by_config) + assert ctx_count <= 1, ( + f"Expected 1 context for identical browser config, got {ctx_count}" + ) + run(_test()) + + def test_batch_crawl_reuses_context(self): + """Multiple URLs with same config should reuse a single context.""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + + html1 = "
<html><body>Page one content here</body></html>
" + html2 = "
<html><body>Page two content here</body></html>
" + html3 = "
<html><body>Page three content here</body></html>
" + + config = CrawlerRunConfig() + for h in [html1, html2, html3]: + result = await crawler.arun(f"raw:{h}", config=config) + assert result.success + + ctx_count = len(bm.contexts_by_config) + assert ctx_count <= 1, f"Batch should reuse context, got {ctx_count}" + run(_test()) + + def test_refcount_drops_to_zero_after_crawl(self): + """After a crawl completes, the context refcount should be 0.""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + html = "
<html><body>Test content</body></html>
" + config = CrawlerRunConfig() + result = await crawler.arun(f"raw:{html}", config=config) + assert result.success + + # All refcounts should be 0 after crawl completes + for sig, count in bm._context_refcounts.items(): + assert count == 0, ( + f"Refcount for {sig[:8]} should be 0 after crawl, got {count}" + ) + run(_test()) + + def test_page_to_sig_cleaned_after_crawl(self): + """After crawl, the page->sig mapping should be empty (pages released).""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + html = "
<html><body>Test</body></html>
" + result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig()) + assert result.success + + assert len(bm._page_to_sig) == 0, ( + f"Expected empty _page_to_sig after crawl, got {len(bm._page_to_sig)} entries" + ) + run(_test()) + + def test_concurrent_crawls_refcount_tracking(self): + """Concurrent crawls should all properly increment/decrement refcounts.""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + config = CrawlerRunConfig() + + htmls = [ + f"raw:
<html><body>Concurrent page {i}</body></html>
" + for i in range(5) + ] + tasks = [crawler.arun(h, config=config) for h in htmls] + results = await asyncio.gather(*tasks) + for r in results: + assert r.success + + # All done — refcounts should be 0 + for sig, count in bm._context_refcounts.items(): + assert count == 0, ( + f"After concurrent crawls, refcount for {sig[:8]} = {count}" + ) + assert len(bm._page_to_sig) == 0 + run(_test()) + + def test_lru_eviction_real_browser(self): + """Verify LRU eviction actually closes contexts when limit exceeded.""" + async def _test(): + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + bm = crawler.crawler_strategy.browser_manager + bm._max_contexts = 2 # Low limit to trigger eviction + + html = "
<html><body>Test</body></html>
" + + # Crawl with 4 different locales → 4 different context signatures + for locale in ["en-US", "fr-FR", "de-DE", "ja-JP"]: + config = CrawlerRunConfig(locale=locale) + result = await crawler.arun(f"raw:{html}", config=config) + assert result.success + + # Should have at most 2 contexts (limit) + ctx_count = len(bm.contexts_by_config) + assert ctx_count <= 2, ( + f"Expected <= 2 contexts (limit), got {ctx_count}" + ) + + # Refcounts should all be 0 + for sig, count in bm._context_refcounts.items(): + assert count == 0, f"refcount {sig[:8]} = {count}" + run(_test()) + + def test_close_clears_everything(self): + """close() should clear all tracking dicts.""" + async def _test(): + crawler = AsyncWebCrawler(config=BrowserConfig(headless=True)) + await crawler.start() + bm = crawler.crawler_strategy.browser_manager + + html = "
<html><body>Test</body></html>
" + result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig()) + assert result.success + + await crawler.close() + + assert len(bm.contexts_by_config) == 0 + assert len(bm._context_refcounts) == 0 + assert len(bm._context_last_used) == 0 + assert len(bm._page_to_sig) == 0 + run(_test())