diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 53f4aaed..2235020b 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -562,6 +562,14 @@ class BrowserConfig: Default: []. enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection. Cannot be used with use_undetected browser mode. Default: False. + memory_saving_mode (bool): If True, adds aggressive cache discard and V8 heap cap flags + to reduce Chromium memory growth. Recommended for high-volume + crawling (1000+ pages). May slightly reduce performance due to + cache eviction. Default: False. + max_pages_before_recycle (int): Number of pages to crawl before recycling the browser + process to reclaim leaked memory. 0 = disabled. + Recommended: 500-1000 for long-running crawlers. + Default: 0. """ def __init__( @@ -610,6 +618,8 @@ class BrowserConfig: host: str = "localhost", enable_stealth: bool = False, init_scripts: List[str] = None, + memory_saving_mode: bool = False, + max_pages_before_recycle: int = 0, ): self.browser_type = browser_type @@ -672,6 +682,8 @@ class BrowserConfig: self.host = host self.enable_stealth = enable_stealth self.init_scripts = init_scripts if init_scripts is not None else [] + self.memory_saving_mode = memory_saving_mode + self.max_pages_before_recycle = max_pages_before_recycle fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -752,6 +764,8 @@ class BrowserConfig: host=kwargs.get("host", "localhost"), enable_stealth=kwargs.get("enable_stealth", False), init_scripts=kwargs.get("init_scripts", []), + memory_saving_mode=kwargs.get("memory_saving_mode", False), + max_pages_before_recycle=kwargs.get("max_pages_before_recycle", 0), ) def to_dict(self): @@ -792,6 +806,8 @@ class BrowserConfig: "host": self.host, "enable_stealth": self.enable_stealth, "init_scripts": self.init_scripts, + "memory_saving_mode": self.memory_saving_mode, + "max_pages_before_recycle": 
self.max_pages_before_recycle, } diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b520391a..a18264fb 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -881,6 +881,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "scale": scale, }, ) + await cdp.detach() except Exception as e: self.logger.warning( message="Failed to adjust viewport to content: {error}", diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index b56a4adb..b57c06fc 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -88,7 +88,16 @@ class ManagedBrowser: "--force-color-profile=srgb", "--mute-audio", "--disable-background-timer-throttling", + # Memory-saving flags: disable unused Chrome features + "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", + "--disable-component-update", + "--disable-domain-reliability", ] + if config.memory_saving_mode: + flags.extend([ + "--aggressive-cache-discard", + '--js-flags=--max-old-space-size=512', + ]) if config.light_mode: flags.extend(BROWSER_DISABLE_OPTIONS) if config.text_mode: @@ -725,6 +734,13 @@ class BrowserManager: # Browser endpoint key for global page tracking (set after browser starts) self._browser_endpoint_key: Optional[str] = None + # Browser recycling state + self._pages_served = 0 + self._recycling = False + self._recycle_lock = asyncio.Lock() + self._recycle_done = asyncio.Event() + self._recycle_done.set() # starts "open" — not recycling + # Stealth adapter for stealth mode self._stealth_adapter = None if self.config.enable_stealth and not self.use_undetected: @@ -972,10 +988,20 @@ class BrowserManager: "--force-color-profile=srgb", "--mute-audio", "--disable-background-timer-throttling", + # Memory-saving flags: disable unused Chrome features + "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", + "--disable-component-update", + "--disable-domain-reliability", 
# "--single-process", f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ] + if self.config.memory_saving_mode: + args.extend([ + "--aggressive-cache-discard", + '--js-flags=--max-old-space-size=512', + ]) + if self.config.light_mode: args.extend(BROWSER_DISABLE_OPTIONS) @@ -1408,6 +1434,9 @@ class BrowserManager: Returns: (page, context): The Page and its BrowserContext """ + # Block if browser is being recycled; wakes instantly when done + await self._recycle_done.wait() + self._cleanup_expired_sessions() # If a session_id is provided and we already have it, reuse that page + context @@ -1567,6 +1596,7 @@ class BrowserManager: if crawlerRunConfig.session_id: self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + self._pages_served += 1 return page, context async def kill_session(self, session_id: str): @@ -1580,14 +1610,23 @@ class BrowserManager: context, page, _ = self.sessions[session_id] self._release_page_from_use(page) # Decrement context refcount for the session's page + should_close_context = False async with self._contexts_lock: sig = self._page_to_sig.pop(page, None) if sig is not None and sig in self._context_refcounts: self._context_refcounts[sig] = max( 0, self._context_refcounts[sig] - 1 ) + # Only close the context if no other pages are using it + # (refcount dropped to 0) AND we own the context (not managed) + if not self.config.use_managed_browser: + if self._context_refcounts.get(sig, 0) == 0: + self.contexts_by_config.pop(sig, None) + self._context_refcounts.pop(sig, None) + self._context_last_used.pop(sig, None) + should_close_context = True await page.close() - if not self.config.use_managed_browser: + if should_close_context: await context.close() del self.sessions[session_id] @@ -1613,6 +1652,89 @@ class BrowserManager: 0, self._context_refcounts[sig] - 1 ) + # Check if browser recycle is needed + if self._should_recycle(): + await self._maybe_recycle_browser() + + def _should_recycle(self) -> 
bool: + """Check if page threshold reached for browser recycling.""" + limit = self.config.max_pages_before_recycle + if limit <= 0: + return False + return self._pages_served >= limit + + async def _maybe_recycle_browser(self): + """Recycle browser if no active crawls are in-flight. + + Uses asyncio.Event to block new get_page() callers during recycle, + and sets _recycling inside _contexts_lock to prevent race conditions. + """ + if self._recycling: + return + + async with self._recycle_lock: + if self._recycling: + return + + # Set _recycling and check refcounts under the SAME lock + # to prevent a new crawl slipping in between check and flag set + async with self._contexts_lock: + total_active = sum(self._context_refcounts.values()) + if total_active > 0: + return # active crawls running, next release will re-check + self._recycling = True + self._recycle_done.clear() # block new get_page() callers + + try: + if self.logger: + self.logger.info( + message="Recycling browser after {count} pages to reclaim memory", + tag="BROWSER", + params={"count": self._pages_served}, + ) + + # Force full cleanup to kill the browser process and reclaim + # memory. For external CDP (cdp_url without cache), temporarily + # enable cdp_cleanup_on_close so close() actually disconnects. + # For cached CDP, close() already handles release correctly. 
+ saved_cdp_cleanup = self.config.cdp_cleanup_on_close + if self.config.cdp_url and not self._using_cached_cdp: + self.config.cdp_cleanup_on_close = True + try: + await self.close() + finally: + self.config.cdp_cleanup_on_close = saved_cdp_cleanup + + # close() already clears most tracking dicts, but ensure + # everything is reset for the fresh browser + self.contexts_by_config.clear() + self._context_refcounts.clear() + self._context_last_used.clear() + self._page_to_sig.clear() + self.sessions.clear() + # Clear global page tracking for this endpoint — old pages are dead + if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use: + BrowserManager._global_pages_in_use[self._browser_endpoint_key].clear() + + # Re-create ManagedBrowser if needed — close() sets it to None, + # but start() expects it to exist for the managed browser path. + if self.config.use_managed_browser and self.managed_browser is None: + self.managed_browser = ManagedBrowser( + browser_type=self.config.browser_type, + user_data_dir=self.config.user_data_dir, + headless=self.config.headless, + logger=self.logger, + debugging_port=self.config.debugging_port, + cdp_url=self.config.cdp_url, + browser_config=self.config, + ) + + await self.start() + self._pages_served = 0 + finally: + self._recycling = False + self._recycle_done.set() # wake ALL waiting get_page() callers + def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" current_time = time.time() diff --git a/tests/async/test_browser_lifecycle.py b/tests/async/test_browser_lifecycle.py new file mode 100644 index 00000000..b7042cc5 --- /dev/null +++ b/tests/async/test_browser_lifecycle.py @@ -0,0 +1,972 @@ +""" +Browser lifecycle & concurrency tests. 
+ +Covers all the browser launch paths and lock interactions: + - Standalone (playwright.launch) + - Managed browser (subprocess + CDP connect) + - Managed browser with create_isolated_context + - Page reuse on shared default context + - Context caching / LRU eviction + - Session lifecycle across all modes + - Concurrent crawls racing for pages / contexts + - Recycle interacting with managed browser + - Multiple crawlers sharing a managed browser via CDP +""" + +import asyncio +import time +import threading +from http.server import HTTPServer, SimpleHTTPRequestHandler + +import pytest + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + +# --------------------------------------------------------------------------- +# Local test server +# --------------------------------------------------------------------------- + +PAGES = {} +for i in range(100): + PAGES[f"/page{i}"] = ( + f"Page {i}" + f"

Page {i}

Content for page {i}.

" + f"next" + ).encode() + +# Login/dashboard for session tests +PAGES["/login"] = ( + b"Login" + b"

Login

Logged in.

" +) +PAGES["/dashboard"] = ( + b"Dashboard" + b"

Dashboard

Dashboard content.

" +) + + +class Handler(SimpleHTTPRequestHandler): + def log_message(self, *a): + pass + + def do_GET(self): + body = PAGES.get(self.path, PAGES["/page0"]) + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(body) + + +class _Server(HTTPServer): + allow_reuse_address = True + + +@pytest.fixture(scope="module") +def srv(): + s = _Server(("127.0.0.1", 0), Handler) + port = s.server_address[1] + t = threading.Thread(target=s.serve_forever, daemon=True) + t.start() + yield f"http://127.0.0.1:{port}" + s.shutdown() + + +def _u(base, i): + return f"{base}/page{i}" + + +def _bm(c): + return c.crawler_strategy.browser_manager + + +# =================================================================== +# SECTION A — Standalone browser (no CDP, no managed browser) +# =================================================================== + +@pytest.mark.asyncio +async def test_standalone_basic_crawl(srv): + """Standalone browser: launch, crawl, close. 
Baseline correctness.""" + cfg = BrowserConfig(headless=True, verbose=False) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + assert "Page 0" in r.html + + +@pytest.mark.asyncio +async def test_standalone_sequential_crawls(srv): + """10 sequential pages — each gets its own page, context reused by config sig.""" + cfg = BrowserConfig(headless=True, verbose=False) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + for i in range(10): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success, f"Page {i} failed" + assert f"Page {i}" in r.html + + +@pytest.mark.asyncio +async def test_standalone_concurrent_crawls(srv): + """10 concurrent crawls on standalone browser — no crashes, + context lock prevents race conditions.""" + cfg = BrowserConfig(headless=True, verbose=False) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(10)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + assert all(r.success for r in results if not isinstance(r, Exception)) + + +@pytest.mark.asyncio +async def test_standalone_context_reuse(srv): + """Two crawls with identical config should reuse the same context. 
+ Two crawls with different configs should create different contexts.""" + cfg = BrowserConfig(headless=True, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + r1 = await c.arun(url=_u(srv, 0), config=run_a) + assert r1.success + ctx_count_after_first = len(bm.contexts_by_config) + + # Same config → same context + r2 = await c.arun(url=_u(srv, 1), config=run_a) + assert r2.success + assert len(bm.contexts_by_config) == ctx_count_after_first, ( + "Same config should reuse context" + ) + + # Different config → new context + run_b = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, verbose=False, + override_navigator=True, + ) + r3 = await c.arun(url=_u(srv, 2), config=run_b) + assert r3.success + assert len(bm.contexts_by_config) == ctx_count_after_first + 1, ( + "Different config should create new context" + ) + + +@pytest.mark.asyncio +async def test_standalone_session_multistep(srv): + """Session across 3 pages on standalone browser.""" + cfg = BrowserConfig(headless=True, verbose=False) + sess = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="standalone_sess", verbose=False, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + for i in range(3): + r = await c.arun(url=_u(srv, i), config=sess) + assert r.success + assert "standalone_sess" in bm.sessions + + # Refcount should be exactly 1 + _, page, _ = bm.sessions["standalone_sess"] + sig = bm._page_to_sig.get(page) + if sig: + assert bm._context_refcounts.get(sig, 0) == 1 + + # Kill session and verify cleanup + await c.crawler_strategy.kill_session("standalone_sess") + assert "standalone_sess" not in bm.sessions + if sig: + assert bm._context_refcounts.get(sig, 0) == 0 + + +@pytest.mark.asyncio +async def test_standalone_recycle(srv): + """Recycling on standalone browser — close/start cycle.""" + cfg = BrowserConfig( + headless=True, verbose=False, max_pages_before_recycle=5, + ) + 
run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + for i in range(8): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success, f"Page {i} failed" + + # Recycle happened at page 5, pages 6-8 after → counter = 3 + assert bm._pages_served == 3 + + +@pytest.mark.asyncio +async def test_standalone_recycle_with_concurrent_crawls(srv): + """15 concurrent crawls straddling a recycle boundary on standalone.""" + cfg = BrowserConfig( + headless=True, verbose=False, max_pages_before_recycle=5, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(15)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + successes = [r for r in results if not isinstance(r, Exception) and r.success] + assert len(successes) == 15 + + +# =================================================================== +# SECTION B — Managed browser (subprocess + CDP) +# =================================================================== + +@pytest.mark.asyncio +async def test_managed_basic_crawl(srv): + """Managed browser: start subprocess, connect via CDP, crawl, close.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + assert "Page 0" in r.html + + +@pytest.mark.asyncio +async def test_managed_sequential_crawls(srv): + """Sequential crawls on managed browser — pages reused from default context.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) 
+ + async with AsyncWebCrawler(config=cfg) as c: + for i in range(8): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success, f"Page {i} failed" + + +@pytest.mark.asyncio +async def test_managed_concurrent_crawls(srv): + """Concurrent crawls on managed browser — _global_pages_lock prevents + two tasks from grabbing the same page.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(8)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + successes = [r for r in results if not isinstance(r, Exception) and r.success] + assert len(successes) == 8 + + +@pytest.mark.asyncio +async def test_managed_page_reuse(srv): + """On managed browser (non-isolated), pages should be reused when + released back to the pool.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + # Crawl 3 pages sequentially — page should be reused each time + for i in range(3): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success + + # On managed browser, total pages created should be small + # (pages reused, not new ones for each crawl) + default_ctx = bm.default_context + total_pages = len(default_ctx.pages) + assert total_pages <= 3, ( + f"Expected page reuse, but {total_pages} pages exist" + ) + + +@pytest.mark.asyncio +async def test_managed_session_multistep(srv): + """Multi-step session on managed browser — session page stays alive.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + sess = CrawlerRunConfig( + 
cache_mode=CacheMode.BYPASS, session_id="managed_sess", verbose=False, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + r = await c.arun(url=f"{srv}/login", config=sess) + assert r.success + + r = await c.arun(url=f"{srv}/dashboard", config=sess) + assert r.success + + assert "managed_sess" in bm.sessions + + +@pytest.mark.asyncio +async def test_managed_recycle(srv): + """Recycling on managed browser — kills subprocess, restarts, crawls resume.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + max_pages_before_recycle=4, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + for i in range(7): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success, f"Page {i} failed after managed recycle" + + # Recycled at 4 → pages 5,6,7 after → counter = 3 + assert bm._pages_served == 3 + + +# =================================================================== +# SECTION C — Managed browser with create_isolated_context +# =================================================================== + +@pytest.mark.asyncio +async def test_isolated_context_basic(srv): + """Isolated context mode: each config gets its own browser context.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + + +@pytest.mark.asyncio +async def test_isolated_context_concurrent(srv): + """Concurrent crawls with isolated contexts — _contexts_lock prevents + race conditions in context creation.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with 
AsyncWebCrawler(config=cfg) as c: + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(10)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + successes = [r for r in results if not isinstance(r, Exception) and r.success] + assert len(successes) == 10 + + +@pytest.mark.asyncio +async def test_isolated_context_caching(srv): + """Same config signature → same context. Different config → different context.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + await c.arun(url=_u(srv, 0), config=run_a) + count_after_a = len(bm.contexts_by_config) + + # Same config → reuse + await c.arun(url=_u(srv, 1), config=run_a) + assert len(bm.contexts_by_config) == count_after_a + + # Different config → new context + run_b = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, verbose=False, + override_navigator=True, + ) + await c.arun(url=_u(srv, 2), config=run_b) + assert len(bm.contexts_by_config) == count_after_a + 1 + + +@pytest.mark.asyncio +async def test_isolated_context_refcount(srv): + """Refcount increases with concurrent crawls and decreases on release.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + # After a single sequential crawl (page released), refcount should be 0 + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + + # All contexts should have refcount 0 (page was released) + for sig, rc in bm._context_refcounts.items(): + assert rc == 0, f"Refcount for {sig[:8]}... 
should be 0, got {rc}" + + +@pytest.mark.asyncio +async def test_isolated_context_session_with_interleaved(srv): + """Session on isolated context + non-session crawls interleaved.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + sess = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="iso_sess", verbose=False, + ) + r = await c.arun(url=f"{srv}/login", config=sess) + assert r.success + + # Non-session crawls + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + for i in range(5): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success + + # Session still alive + assert "iso_sess" in bm.sessions + r = await c.arun(url=f"{srv}/dashboard", config=sess) + assert r.success + + +@pytest.mark.asyncio +async def test_isolated_context_recycle(srv): + """Recycling with isolated contexts — all contexts cleared, new ones + created fresh on the new browser.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + max_pages_before_recycle=4, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + for i in range(6): + r = await c.arun(url=_u(srv, i), config=run) + assert r.success, f"Page {i} failed" + + # Recycled at 4 → 5,6 after → counter = 2 + assert bm._pages_served == 2 + # Contexts dict should only have entries from after recycle + assert all(rc == 0 for rc in bm._context_refcounts.values()), ( + "All refcounts should be 0 after sequential crawls" + ) + + +# =================================================================== +# SECTION D — Two crawlers sharing one managed browser via CDP URL +# =================================================================== + +@pytest.mark.asyncio +async def test_two_crawlers_share_managed_browser(srv): + """Two 
AsyncWebCrawler instances connect to the same managed browser + via its CDP URL. Both should crawl successfully without interfering.""" + # First crawler owns the managed browser + cfg1 = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + + async with AsyncWebCrawler(config=cfg1) as c1: + bm1 = _bm(c1) + # Grab the CDP URL from the managed browser + cdp_url = f"http://{bm1.managed_browser.host}:{bm1.managed_browser.debugging_port}" + + # Second crawler connects to the same browser via CDP + cfg2 = BrowserConfig( + headless=True, verbose=False, + cdp_url=cdp_url, + cdp_cleanup_on_close=True, + ) + async with AsyncWebCrawler(config=cfg2) as c2: + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + # Crawl sequentially to avoid page contention on shared context + r1 = await c1.arun(url=_u(srv, 0), config=run) + r2 = await c2.arun(url=_u(srv, 1), config=run) + + assert r1.success, f"Crawler 1 failed: {r1.error_message}" + assert r2.success, f"Crawler 2 failed: {r2.error_message}" + assert "Page 0" in r1.html + assert "Page 1" in r2.html + + +@pytest.mark.asyncio +async def test_two_crawlers_concurrent_heavy(srv): + """Two crawlers sharing one managed browser, each doing 5 sequential crawls.""" + cfg1 = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + + async with AsyncWebCrawler(config=cfg1) as c1: + bm1 = _bm(c1) + cdp_url = f"http://{bm1.managed_browser.host}:{bm1.managed_browser.debugging_port}" + + cfg2 = BrowserConfig( + headless=True, verbose=False, + cdp_url=cdp_url, + cdp_cleanup_on_close=True, + ) + async with AsyncWebCrawler(config=cfg2) as c2: + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + # Each crawler does 5 sequential crawls while both are connected + for i in range(5): + r1 = await c1.arun(url=_u(srv, i), config=run) + assert r1.success, f"Crawler 1 page {i} failed: {r1.error_message}" + r2 = await c2.arun(url=_u(srv, i + 50), config=run) + assert 
r2.success, f"Crawler 2 page {i} failed: {r2.error_message}" + + +# =================================================================== +# SECTION E — Session lifecycle edge cases +# =================================================================== + +@pytest.mark.asyncio +async def test_session_then_nonsession_then_session(srv): + """session crawl → non-session crawl → session crawl. + The session should persist across non-session activity.""" + cfg = BrowserConfig(headless=True, verbose=False) + sess = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="interleave_sess", verbose=False, + ) + no_sess = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + r = await c.arun(url=_u(srv, 0), config=sess) + assert r.success + + # Non-session crawls + for i in range(3): + r = await c.arun(url=_u(srv, 10 + i), config=no_sess) + assert r.success + + # Session should still exist and work + assert "interleave_sess" in bm.sessions + r = await c.arun(url=_u(srv, 99), config=sess) + assert r.success + + +@pytest.mark.asyncio +async def test_multiple_sessions_simultaneous(srv): + """3 independent sessions open at the same time, each navigating + different pages. 
They should not interfere.""" + cfg = BrowserConfig(headless=True, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + sessions = [ + CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id=f"sess_{j}", verbose=False, + ) + for j in range(3) + ] + + # Step 1: open all sessions + for j, s in enumerate(sessions): + r = await c.arun(url=_u(srv, j * 10), config=s) + assert r.success, f"Session {j} open failed" + + assert len(bm.sessions) == 3 + + # Step 2: navigate each session to a second page + for j, s in enumerate(sessions): + r = await c.arun(url=_u(srv, j * 10 + 1), config=s) + assert r.success, f"Session {j} step 2 failed" + + # Step 3: kill sessions one by one, verify others unaffected + await c.crawler_strategy.kill_session("sess_0") + assert "sess_0" not in bm.sessions + assert "sess_1" in bm.sessions + assert "sess_2" in bm.sessions + + # Remaining sessions still work + r = await c.arun(url=_u(srv, 99), config=sessions[1]) + assert r.success + + +@pytest.mark.asyncio +async def test_session_kill_then_recreate(srv): + """Kill a session, then create a new session with the same ID. 
+ The new session should work on a fresh page.""" + cfg = BrowserConfig(headless=True, verbose=False) + sess = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="reuse_id", verbose=False, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + r = await c.arun(url=_u(srv, 0), config=sess) + assert r.success + _, page_v1, _ = bm.sessions["reuse_id"] + + await c.crawler_strategy.kill_session("reuse_id") + assert "reuse_id" not in bm.sessions + + # Re-create with same ID + r = await c.arun(url=_u(srv, 50), config=sess) + assert r.success + assert "reuse_id" in bm.sessions + _, page_v2, _ = bm.sessions["reuse_id"] + + # Should be a different page object + assert page_v1 is not page_v2, "Re-created session should have a new page" + + +# =================================================================== +# SECTION F — Concurrent recycle + session stress tests +# =================================================================== + +@pytest.mark.asyncio +async def test_recycle_concurrent_sessions_and_nonsessions(srv): + """Open 2 sessions + fire 10 non-session crawls concurrently with + recycle threshold=5. Sessions should block recycle until they're + done or killed. 
All crawls should succeed.""" + cfg = BrowserConfig( + headless=True, verbose=False, + max_pages_before_recycle=5, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + # Open sessions first + sess_a = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="stress_a", verbose=False, + ) + sess_b = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="stress_b", verbose=False, + ) + r = await c.arun(url=f"{srv}/login", config=sess_a) + assert r.success + r = await c.arun(url=f"{srv}/login", config=sess_b) + assert r.success + + # Fire 10 concurrent non-session crawls + no_sess = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + tasks = [c.arun(url=_u(srv, i), config=no_sess) for i in range(10)] + results = await asyncio.gather(*tasks, return_exceptions=True) + + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + + # Sessions should still be alive (blocking recycle) + assert "stress_a" in bm.sessions + assert "stress_b" in bm.sessions + + # Use sessions again — should work + r = await c.arun(url=f"{srv}/dashboard", config=sess_a) + assert r.success + r = await c.arun(url=f"{srv}/dashboard", config=sess_b) + assert r.success + + +@pytest.mark.asyncio +async def test_arun_many_with_session_open(srv): + """Session open while arun_many batch runs with recycle enabled. 
+ Session survives the batch.""" + cfg = BrowserConfig( + headless=True, verbose=False, + max_pages_before_recycle=5, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + sess = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="batch_guard", verbose=False, + ) + r = await c.arun(url=f"{srv}/login", config=sess) + assert r.success + + no_sess = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + urls = [_u(srv, i) for i in range(12)] + results = await c.arun_many(urls, config=no_sess) + assert all(r.success for r in results) + + # Session still alive + assert "batch_guard" in bm.sessions + + +@pytest.mark.asyncio +async def test_rapid_recycle_stress(srv): + """Recycle threshold=2 with 20 sequential crawls → 10 recycle cycles. + Every crawl must succeed. Proves recycle is stable under rapid cycling.""" + cfg = BrowserConfig( + headless=True, verbose=False, + max_pages_before_recycle=2, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + for i in range(20): + r = await c.arun(url=_u(srv, i % 100), config=run) + assert r.success, f"Page {i} failed during rapid recycle" + + +@pytest.mark.asyncio +async def test_rapid_recycle_concurrent(srv): + """Recycle threshold=3 with 12 concurrent crawls. 
Concurrency + + rapid recycling together.""" + cfg = BrowserConfig( + headless=True, verbose=False, + max_pages_before_recycle=3, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(12)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + successes = [r for r in results if not isinstance(r, Exception) and r.success] + assert len(successes) == 12 + + +# =================================================================== +# SECTION G — Lock correctness under contention +# =================================================================== + +@pytest.mark.asyncio +async def test_context_lock_no_duplicate_contexts(srv): + """Fire 20 concurrent crawls with the same config on isolated context mode. + Despite concurrency, only 1 context should be created (all share the + same config signature).""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(20)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions: {excs[:3]}" + + # All had the same config → only 1 context should exist + assert len(bm.contexts_by_config) == 1, ( + f"Expected 1 context, got {len(bm.contexts_by_config)} — " + f"lock failed to prevent duplicate creation" + ) + + +@pytest.mark.asyncio +async def test_page_lock_no_duplicate_pages_managed(srv): + """On managed browser (shared default context), concurrent crawls should + never get the same page. 
After all complete, pages_in_use should be empty.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(8)] + await asyncio.gather(*tasks) + + # After all crawls complete, no pages should be marked in use + piu = bm._get_pages_in_use() + assert len(piu) == 0, ( + f"After all crawls complete, {len(piu)} pages still marked in use" + ) + + +@pytest.mark.asyncio +async def test_refcount_correctness_under_concurrency(srv): + """Fire 15 concurrent crawls with isolated context. After all complete, + all refcounts should be 0.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + + tasks = [c.arun(url=_u(srv, i), config=run) for i in range(15)] + await asyncio.gather(*tasks) + + for sig, rc in bm._context_refcounts.items(): + assert rc == 0, ( + f"Refcount for context {sig[:8]}... 
is {rc}, expected 0 " + f"after all crawls complete" + ) + + +# =================================================================== +# SECTION H — Close / cleanup correctness +# =================================================================== + +@pytest.mark.asyncio +async def test_close_cleans_up_standalone(srv): + """After closing standalone crawler, browser and playwright are None.""" + cfg = BrowserConfig(headless=True, verbose=False) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + c = AsyncWebCrawler(config=cfg) + await c.start() + bm = _bm(c) + + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + + await c.close() + assert bm.browser is None + assert bm.playwright is None + + +@pytest.mark.asyncio +async def test_close_cleans_up_managed(srv): + """After closing managed crawler, managed_browser is cleaned up.""" + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + ) + run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + c = AsyncWebCrawler(config=cfg) + await c.start() + bm = _bm(c) + + r = await c.arun(url=_u(srv, 0), config=run) + assert r.success + + await c.close() + assert bm.browser is None + assert bm.managed_browser is None + + +@pytest.mark.asyncio +async def test_double_close_safe(srv): + """Calling close() twice should not raise.""" + cfg = BrowserConfig(headless=True, verbose=False) + + c = AsyncWebCrawler(config=cfg) + await c.start() + r = await c.arun(url=_u(srv, 0), config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, verbose=False, + )) + assert r.success + + await c.close() + # Second close should be safe + await c.close() + + +# =================================================================== +# SECTION I — Mixed modes: session + recycle + managed + concurrent +# =================================================================== + +@pytest.mark.asyncio +async def test_managed_isolated_session_recycle_concurrent(srv): + """The ultimate stress 
test: managed browser + isolated contexts + + sessions + recycle + concurrent crawls. + + Flow: + 1. Open session A + 2. Fire 8 concurrent non-session crawls (threshold=5, but session blocks) + 3. Kill session A + 4. Fire 3 more non-session crawls to trigger recycle + 5. Open session B on the fresh browser + 6. Verify session B works + """ + cfg = BrowserConfig( + headless=True, verbose=False, + use_managed_browser=True, + create_isolated_context=True, + max_pages_before_recycle=5, + ) + + async with AsyncWebCrawler(config=cfg) as c: + bm = _bm(c) + no_sess = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + # Step 1: open session + sess_a = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="ultimate_a", verbose=False, + ) + r = await c.arun(url=f"{srv}/login", config=sess_a) + assert r.success + + # Step 2: concurrent non-session crawls + tasks = [c.arun(url=_u(srv, i), config=no_sess) for i in range(8)] + results = await asyncio.gather(*tasks, return_exceptions=True) + excs = [r for r in results if isinstance(r, Exception)] + assert len(excs) == 0, f"Exceptions in step 2: {excs[:3]}" + + # Session blocks recycle + assert "ultimate_a" in bm.sessions + + # Step 3: kill session + await c.crawler_strategy.kill_session("ultimate_a") + + # Step 4: trigger recycle + for i in range(3): + r = await c.arun(url=_u(srv, 80 + i), config=no_sess) + assert r.success + + await asyncio.sleep(0.5) + + # Step 5: new session on fresh browser + sess_b = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, session_id="ultimate_b", verbose=False, + ) + r = await c.arun(url=f"{srv}/login", config=sess_b) + assert r.success + assert "ultimate_b" in bm.sessions + + # Step 6: verify it works + r = await c.arun(url=f"{srv}/dashboard", config=sess_b) + assert r.success diff --git a/tests/async/test_browser_memory.py b/tests/async/test_browser_memory.py new file mode 100644 index 00000000..cd1685d0 --- /dev/null +++ b/tests/async/test_browser_memory.py @@ -0,0 +1,1169 @@ 
+""" +Tests for browser memory management: memory_saving_mode, browser recycling, +and CDP session leak fixes. + +These are integration tests that launch real browsers and crawl real pages. +They verify: + 1. memory_saving_mode Chrome flags are applied + 2. Browser recycling fires at the right threshold and doesn't break crawling + 3. Concurrent crawls survive a recycle boundary without errors + 4. Recycling resets all internal tracking state cleanly + 5. Memory doesn't grow unbounded over many pages + 6. CDP session detach fix doesn't regress viewport adjustment +""" + +import asyncio +import os +import time +import threading +from http.server import HTTPServer, SimpleHTTPRequestHandler + +import psutil +import pytest + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + +# --------------------------------------------------------------------------- +# Local test server — avoids network flakiness +# --------------------------------------------------------------------------- + +PAGES_HTML = {} +for i in range(200): + PAGES_HTML[f"/page{i}"] = f""" +Page {i} + +

Test page {i}

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit. +Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +Paragraph {i} with enough text to exercise the content pipeline.

+Next +""" + + +class MemTestHandler(SimpleHTTPRequestHandler): + """Serves lightweight HTML pages for memory tests. + + Also serves /login and /dashboard for multi-step session tests. + /login sets a cookie, /dashboard checks the cookie to prove session state. + """ + + def log_message(self, *args): + pass # silent + + def do_GET(self): + if self.path == "/login": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.send_header("Set-Cookie", "auth_token=valid123; Path=/") + self.end_headers() + self.wfile.write(b""" +Login +

Login Page

You are now logged in.

+Go to dashboard""") + return + + if self.path == "/dashboard": + cookie = self.headers.get("Cookie", "") + if "auth_token=valid123" in cookie: + body = "

Dashboard

Welcome, authenticated user!

" + else: + body = "

Dashboard

NOT AUTHENTICATED

" + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write( + f"Dashboard" + f"{body}".encode() + ) + return + + if self.path == "/step1": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(b""" +Step 1 +

Step 1

First step complete

""") + return + + if self.path == "/step2": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(b""" +Step 2 +

Step 2

Second step complete

""") + return + + if self.path == "/step3": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(b""" +Step 3 +

Step 3

Third step complete

""") + return + + html = PAGES_HTML.get(self.path) + if html is None: + # Fallback for root and unknown paths + html = PAGES_HTML["/page0"] + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(html.encode()) + + +class ReuseAddrHTTPServer(HTTPServer): + allow_reuse_address = True + + +@pytest.fixture(scope="module") +def test_server(): + """Start a local HTTP server for the test module.""" + server = ReuseAddrHTTPServer(("127.0.0.1", 0), MemTestHandler) + port = server.server_address[1] + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + yield f"http://127.0.0.1:{port}" + server.shutdown() + + +def _url(base, i): + return f"{base}/page{i}" + + +def _get_chromium_rss_mb(): + """Sum RSS of all chromium/chrome child processes in MB.""" + total = 0 + for proc in psutil.process_iter(["name", "cmdline"]): + try: + name = (proc.info["name"] or "").lower() + cmdline = " ".join(proc.info["cmdline"] or []).lower() + if "chrom" in name or "chrom" in cmdline: + total += proc.memory_info().rss + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return total / (1024 * 1024) + + +# --------------------------------------------------------------------------- +# Helpers to reach into BrowserManager internals +# --------------------------------------------------------------------------- + +def _bm(crawler: AsyncWebCrawler): + """Shortcut to get the BrowserManager from a crawler.""" + return crawler.crawler_strategy.browser_manager + + +# =========================================================================== +# Test 1: memory_saving_mode flag propagation +# =========================================================================== + +@pytest.mark.asyncio +async def test_memory_saving_flags_applied(test_server): + """Verify --aggressive-cache-discard and --js-flags are in the launch args + when memory_saving_mode=True, and absent when False.""" + config_on = BrowserConfig( 
+ headless=True, + verbose=False, + memory_saving_mode=True, + ) + config_off = BrowserConfig( + headless=True, + verbose=False, + memory_saving_mode=False, + ) + + async with AsyncWebCrawler(config=config_on) as crawler: + bm = _bm(crawler) + browser_args = bm._build_browser_args() + # _build_browser_args returns a dict with an "args" key + args_list = browser_args.get("args", browser_args) if isinstance(browser_args, dict) else browser_args + assert "--aggressive-cache-discard" in args_list, ( + "memory_saving_mode=True should add --aggressive-cache-discard" + ) + assert any("max-old-space-size" in a for a in args_list), ( + "memory_saving_mode=True should add V8 heap cap" + ) + # Always-on flags should be present regardless + assert any("OptimizationHints" in a for a in args_list) + + async with AsyncWebCrawler(config=config_off) as crawler: + bm = _bm(crawler) + browser_args = bm._build_browser_args() + args_list = browser_args.get("args", browser_args) if isinstance(browser_args, dict) else browser_args + assert "--aggressive-cache-discard" not in args_list, ( + "memory_saving_mode=False should NOT add --aggressive-cache-discard" + ) + assert not any("max-old-space-size" in a for a in args_list), ( + "memory_saving_mode=False should NOT add V8 heap cap" + ) + # Always-on flags should still be there + assert any("OptimizationHints" in a for a in args_list) + + +# =========================================================================== +# Test 2: Always-on flags present in both code paths +# =========================================================================== + +@pytest.mark.asyncio +async def test_always_on_flags_present(test_server): + """The 3 always-on memory flags should appear in _build_browser_args + even with default BrowserConfig.""" + config = BrowserConfig(headless=True, verbose=False) + async with AsyncWebCrawler(config=config) as crawler: + browser_args = _bm(crawler)._build_browser_args() + args_list = browser_args.get("args", 
browser_args) if isinstance(browser_args, dict) else browser_args + assert any("disable-component-update" in a for a in args_list) + assert any("disable-domain-reliability" in a for a in args_list) + assert any("OptimizationHints" in a for a in args_list) + + +# =========================================================================== +# Test 3: Basic recycling — counter increments, recycle fires, crawls resume +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_fires_at_threshold(test_server): + """Set max_pages_before_recycle=5, crawl 8 pages sequentially. + Verify the counter resets after recycle and all crawls succeed.""" + config = BrowserConfig( + headless=True, + verbose=False, + memory_saving_mode=True, + max_pages_before_recycle=5, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + assert bm._pages_served == 0 + + results = [] + for i in range(8): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + results.append(r) + + # All 8 crawls should succeed — recycle happened transparently + assert len(results) == 8 + assert all(r.success for r in results), ( + f"Failed crawls: {[i for i, r in enumerate(results) if not r.success]}" + ) + + # After 8 pages with threshold=5, recycle happened once (at page 5). + # Pages 6,7,8 served after recycle → counter should be 3. 
+ assert bm._pages_served == 3, ( + f"Expected 3 pages after recycle, got {bm._pages_served}" + ) + + +# =========================================================================== +# Test 4: Recycling resets all tracking state +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_clears_tracking_state(test_server): + """After a recycle, internal dicts should be clean.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=3, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + + # Crawl 3 pages → triggers recycle + for i in range(3): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + # Give recycle a moment to complete (it fires in release_page_with_context) + await asyncio.sleep(0.5) + + # Recycle should have reset these + assert bm._pages_served == 0, f"Counter not reset: {bm._pages_served}" + assert sum(bm._context_refcounts.values()) == 0, ( + f"Refcounts not zero after recycle: {bm._context_refcounts}" + ) + + # Crawl one more page to prove browser is alive + r = await crawler.arun(url=_url(test_server, 99), config=run_config) + assert r.success + assert bm._pages_served == 1 + + +# =========================================================================== +# Test 5: Concurrent crawls across a recycle boundary +# =========================================================================== + +@pytest.mark.asyncio +async def test_concurrent_crawls_across_recycle(test_server): + """Launch concurrent crawls that straddle the recycle threshold. 
+ Recycling should wait for in-flight crawls to finish, not crash them.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=5, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + # Fire 10 concurrent crawls with threshold=5 + urls = [_url(test_server, i) for i in range(10)] + tasks = [crawler.arun(url=u, config=run_config) for u in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + + exceptions = [r for r in results if isinstance(r, Exception)] + assert len(exceptions) == 0, ( + f"Got {len(exceptions)} exceptions during concurrent recycle: " + f"{exceptions[:3]}" + ) + successes = [r for r in results if not isinstance(r, Exception) and r.success] + assert len(successes) == 10, ( + f"Only {len(successes)}/10 crawls succeeded" + ) + + +# =========================================================================== +# Test 6: Recycle with sessions — blocked while a session is open; sessions cleared after recycle +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_blocked_by_active_session(test_server): + """An active session holds a context refcount, so the browser should NOT + recycle while the session is open — even if pages_served >= threshold. 
+ This proves recycling is safe around sessions.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=3, + ) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + run_no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + # Crawl 2 non-session pages (released immediately) + for i in range(2): + r = await crawler.arun(url=_url(test_server, i), config=run_no_session) + assert r.success + + # Create a named session on page 3 — hits the threshold + run_with_session = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="test_session", + verbose=False, + ) + r = await crawler.arun(url=_url(test_server, 2), config=run_with_session) + assert r.success + assert "test_session" in bm.sessions + + # We've hit 3 pages (the threshold), but the session holds a refcount + # so recycle must NOT fire + assert bm._pages_served == 3 + assert not bm._recycling, ( + "Recycle should not fire while a session holds a refcount" + ) + + # Browser should still be alive — use the session again + r = await crawler.arun(url=_url(test_server, 50), config=run_with_session) + assert r.success, "Session should still work even past recycle threshold" + + # Session reuses the same page, so counter stays at 3 + # (only get_page increments it, and session reuse skips get_page) + assert bm._pages_served >= 3 + assert not bm._recycling + + +@pytest.mark.asyncio +async def test_sessions_cleared_by_recycle(test_server): + """After a recycle, the sessions dict is empty and new sessions work.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=3, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + + # Crawl 3 non-session pages → recycle fires (all refcounts 0) + for i in range(3): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + 
await asyncio.sleep(0.5) + + # Sessions dict cleared by recycle + assert len(bm.sessions) == 0, ( + f"Sessions should be empty after recycle, got {list(bm.sessions.keys())}" + ) + + # New session should work on the fresh browser + run_with_session = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="post_recycle_session", + verbose=False, + ) + r = await crawler.arun(url=_url(test_server, 99), config=run_with_session) + assert r.success + assert "post_recycle_session" in bm.sessions + + +# =========================================================================== +# Test 7: Multiple recycle cycles — browser survives repeated recycling +# =========================================================================== + +@pytest.mark.asyncio +async def test_multiple_recycle_cycles(test_server): + """Recycle the browser 4 times (threshold=5, crawl 22 pages). + Every single crawl must succeed.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=5, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + all_results = [] + + for i in range(22): + r = await crawler.arun(url=_url(test_server, i % 200), config=run_config) + all_results.append(r) + + assert all(r.success for r in all_results), ( + f"Failed at pages: " + f"{[i for i, r in enumerate(all_results) if not r.success]}" + ) + # 22 pages, threshold 5 → recycles at 5, 10, 15, 20 → 4 recycles + # After last recycle at page 20, pages 21,22 served → counter = 2 + assert bm._pages_served == 2 + + +# =========================================================================== +# Test 8: Recycling disabled by default (max_pages_before_recycle=0) +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_disabled_by_default(test_server): + """With default config (max_pages_before_recycle=0), no recycling 
happens + no matter how many pages are crawled.""" + config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + + for i in range(10): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + # Counter increments but never resets + assert bm._pages_served == 10 + assert not bm._recycling + + +# =========================================================================== +# Test 9: _recycle_done event blocks get_page during recycle +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_event_blocks_new_pages(test_server): + """Simulate a recycle by manually clearing the event, then verify that + get_page blocks until the event is set.""" + config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + + # Manually block the gate + bm._recycle_done.clear() + + got_page = False + + async def try_get_page(): + nonlocal got_page + r = await crawler.arun(url=_url(test_server, 0), config=run_config) + got_page = r.success + + task = asyncio.create_task(try_get_page()) + + # Wait a bit — the crawl should be blocked + await asyncio.sleep(0.5) + assert not got_page, "get_page should block while _recycle_done is cleared" + + # Release the gate + bm._recycle_done.set() + await asyncio.wait_for(task, timeout=15.0) + assert got_page, "Crawl should succeed after recycle_done is set" + + +# =========================================================================== +# Test 10: BrowserConfig serialization round-trip +# =========================================================================== + +@pytest.mark.asyncio +async def test_config_serialization_roundtrip(): + 
"""memory_saving_mode and max_pages_before_recycle survive + to_dict → from_kwargs → clone round-trips.""" + original = BrowserConfig( + headless=True, + memory_saving_mode=True, + max_pages_before_recycle=500, + ) + + # to_dict → from_kwargs + d = original.to_dict() + assert d["memory_saving_mode"] is True + assert d["max_pages_before_recycle"] == 500 + + restored = BrowserConfig.from_kwargs(d) + assert restored.memory_saving_mode is True + assert restored.max_pages_before_recycle == 500 + + # clone with override + cloned = original.clone(max_pages_before_recycle=1000) + assert cloned.memory_saving_mode is True # inherited + assert cloned.max_pages_before_recycle == 1000 # overridden + + # dump / load + dumped = original.dump() + loaded = BrowserConfig.load(dumped) + assert loaded.memory_saving_mode is True + assert loaded.max_pages_before_recycle == 500 + + +# =========================================================================== +# Test 11: Memory stays bounded over many pages with recycling +# =========================================================================== + +@pytest.mark.asyncio +async def test_memory_bounded_with_recycling(test_server): + """Crawl 40 pages with recycling every 10. Measure RSS at page 10 + (just after first recycle) and at page 40. Memory should not grow + significantly — the recycle should keep it bounded. + + This is the core proof that recycling controls memory growth. + Without recycling, Chromium RSS grows ~2-5 MB per page. 
+ With recycling, it should stay roughly flat.""" + config = BrowserConfig( + headless=True, + verbose=False, + memory_saving_mode=True, + max_pages_before_recycle=10, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + rss_samples = [] + + for i in range(40): + r = await crawler.arun(url=_url(test_server, i % 200), config=run_config) + assert r.success, f"Page {i} failed" + + # Sample RSS after each recycle boundary (every 10th page) + if (i + 1) % 10 == 0: + await asyncio.sleep(0.3) # let recycle finish + rss_samples.append(_get_chromium_rss_mb()) + + # We should have 4 samples (at pages 10, 20, 30, 40) + assert len(rss_samples) == 4 + + # The key assertion: RSS at page 40 should not be dramatically larger + # than at page 10. Allow up to 2x growth as tolerance for GC timing etc. + # Without recycling, we'd expect 60-150 MB growth over 30 extra pages. + if rss_samples[0] > 0: # guard against measurement issues + growth_ratio = rss_samples[-1] / rss_samples[0] + assert growth_ratio < 2.0, ( + f"Memory grew {growth_ratio:.1f}x from {rss_samples[0]:.0f}MB " + f"to {rss_samples[-1]:.0f}MB over 30 pages with recycling. " + f"All samples: {[f'{s:.0f}' for s in rss_samples]} MB" + ) + + +# =========================================================================== +# Test 12: Memory grows WITHOUT recycling (control test) +# =========================================================================== + +@pytest.mark.asyncio +async def test_memory_grows_without_recycling(test_server): + """Control test: crawl 30 pages WITHOUT recycling and observe that + chromium RSS is higher at the end than at the start. 
+ Informational only: the RSS delta is printed, not asserted, as a baseline for the recycling test.""" + config = BrowserConfig( + headless=True, + verbose=False, + memory_saving_mode=False, + max_pages_before_recycle=0, # disabled + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + # Warm up — let initial browser memory stabilize + for i in range(3): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + await asyncio.sleep(0.3) + rss_start = _get_chromium_rss_mb() + + # Crawl 30 more pages + for i in range(3, 33): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + await asyncio.sleep(0.3) + rss_end = _get_chromium_rss_mb() + + # RSS is expected to be somewhat higher (chromium leaks), but this is not asserted + # We only report the delta, and only when the start measurement is non-zero + if rss_start > 0: + print( + f"\n[CONTROL] RSS without recycling: " + f"{rss_start:.0f}MB → {rss_end:.0f}MB " + f"(+{rss_end - rss_start:.0f}MB over 30 pages)" + ) + + +# =========================================================================== +# Test 13: Viewport adjustment doesn't leak CDP sessions +# =========================================================================== + +@pytest.mark.asyncio +async def test_viewport_adjustment_no_cdp_leak(test_server): + """Crawl several pages that trigger viewport adjustment (scan_full_page). + If CDP sessions leak, Chromium's DevTools session count grows and + eventually causes slowdowns. 
We just verify all crawls succeed and + the browser stays healthy.""" + config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + scan_full_page=True, # triggers fit_to_viewport_adjustment → CDP session + verbose=False, + ) + + async with AsyncWebCrawler(config=config) as crawler: + for i in range(15): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success, f"Page {i} failed with scan_full_page" + + +# =========================================================================== +# Test 14: Recycle under concurrent load with arun_many +# =========================================================================== + +@pytest.mark.asyncio +async def test_recycle_with_arun_many(test_server): + """Use arun_many to crawl a batch that exceeds the recycle threshold. + This tests the dispatcher + recycling interaction.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=5, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + urls = [_url(test_server, i) for i in range(12)] + results = await crawler.arun_many(urls, config=run_config) + + successes = [r for r in results if r.success] + assert len(successes) == 12, ( + f"Only {len(successes)}/12 succeeded with arun_many + recycling" + ) + + +# =========================================================================== +# Test 15: _global_pages_in_use cleaned after recycle +# =========================================================================== + +@pytest.mark.asyncio +async def test_global_pages_in_use_cleared(test_server): + """After a recycle, the _global_pages_in_use set for this browser's + endpoint should be empty (old pages are dead).""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=3, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, 
verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + bm = _bm(crawler) + + for i in range(3): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + await asyncio.sleep(0.5) + + # After recycle, pages_in_use for old endpoint should be empty + from crawl4ai.browser_manager import BrowserManager + if bm._browser_endpoint_key: + piu = BrowserManager._global_pages_in_use.get( + bm._browser_endpoint_key, set() + ) + assert len(piu) == 0, ( + f"_global_pages_in_use should be empty after recycle, " + f"has {len(piu)} stale entries" + ) + + +# =========================================================================== +# Test 16: Content integrity across recycle — page content is correct +# =========================================================================== + +@pytest.mark.asyncio +async def test_content_integrity_across_recycle(test_server): + """Verify that pages crawled AFTER a recycle return correct content, + not stale data from before the recycle.""" + config = BrowserConfig( + headless=True, + verbose=False, + max_pages_before_recycle=3, + ) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False) + + async with AsyncWebCrawler(config=config) as crawler: + # Crawl pages 0,1,2 → triggers recycle + for i in range(3): + r = await crawler.arun(url=_url(test_server, i), config=run_config) + assert r.success + + await asyncio.sleep(0.5) + + # Crawl page 150 after recycle — content should match page 150 + r = await crawler.arun(url=_url(test_server, 150), config=run_config) + assert r.success + assert "Test page 150" in r.html, ( + "Content after recycle should be from the correct page" + ) + assert "Paragraph 150" in r.html + + +# =========================================================================== +# SESSION + RECYCLE INTERACTION TESTS +# =========================================================================== + + +# 
# ===========================================================================
# Test 17: Multi-step session crawl — login → dashboard with cookie
# ===========================================================================

@pytest.mark.asyncio
async def test_multistep_session_login_flow(test_server):
    """Baseline session behaviour: login, then dashboard, on one session_id.

    No recycle threshold is configured. The dashboard page must show the
    authenticated greeting, i.e. the cookie set by /login carried over.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        login_flow = CrawlerRunConfig(
            session_id="login_flow",
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )

        # First hop: the login page sets the cookie.
        login = await crawler.arun(url=f"{test_server}/login", config=login_flow)
        assert login.success
        assert "Login Page" in login.html

        # Second hop: same session, so the cookie should be sent along.
        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=login_flow)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html, (
            "Session should carry cookies from login to dashboard"
        )


# ===========================================================================
# Test 18: Multi-step session survives non-session crawls past threshold
# ===========================================================================

@pytest.mark.asyncio
async def test_session_survives_threshold_with_interleaved_crawls(test_server):
    """An open session must keep the browser from recycling.

    One session crawl plus eight plain crawls take the counter to 9 with a
    threshold of 5. The counter must not have been reset, no recycle may be
    in progress, and the session must still serve the dashboard.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=5,
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        # Open the session with its first page.
        sticky = CrawlerRunConfig(
            session_id="persistent_session",
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        opened = await crawler.arun(url=f"{test_server}/login", config=sticky)
        assert opened.success
        assert "persistent_session" in manager.sessions

        # Eight session-less crawls: 1 + 8 = 9 pages, well above the threshold.
        plain = CrawlerRunConfig(verbose=False, cache_mode=CacheMode.BYPASS)
        for idx in range(8):
            outcome = await crawler.arun(url=_url(test_server, idx), config=plain)
            assert outcome.success, f"Non-session crawl {idx} failed"

        # The counter was never reset, so no recycle happened.
        served = manager._pages_served
        assert served == 9, (
            f"Expected 9 pages served, got {served}"
        )
        assert not manager._recycling
        assert "persistent_session" in manager.sessions, (
            "Session should still exist — recycle blocked by refcount"
        )

        # The session keeps working, cookies included.
        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=sticky)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html, (
            "Session cookies should still work after interleaved non-session crawls"
        )


# ===========================================================================
# Test 19: 3-step session flow with recycle threshold — recycle blocked
# ===========================================================================

@pytest.mark.asyncio
async def test_three_step_session_blocks_recycle(test_server):
    """Three crawls on one session with a threshold of 2.

    The counter is expected to stay at 1 for the whole flow, no recycle may
    be in progress, and the session must remain registered.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=2,  # deliberately tiny
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        multistep = CrawlerRunConfig(
            session_id="multistep",
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )

        # Step 1 creates the session page.
        first = await crawler.arun(url=f"{test_server}/step1", config=multistep)
        assert first.success
        assert "Step 1" in first.html

        # Step 2 runs on the same session.
        second = await crawler.arun(url=f"{test_server}/step2", config=multistep)
        assert second.success
        assert "Step 2" in second.html

        # Step 3 runs on the same session.
        third = await crawler.arun(url=f"{test_server}/step3", config=multistep)
        assert third.success
        assert "Step 3" in third.html

        # Only the initial page creation is counted; reuse adds nothing.
        assert manager._pages_served == 1
        assert not manager._recycling
        assert "multistep" in manager.sessions


# ===========================================================================
# Test 20: Two concurrent sessions — both survive past threshold
# ===========================================================================

@pytest.mark.asyncio
async def test_two_concurrent_sessions_block_recycle(test_server):
    """Two open sessions plus five plain crawls with a threshold of 3.

    Neither session may disappear, no recycle may be in progress, and both
    sessions must still serve their next page.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=3,
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        cfg_a = CrawlerRunConfig(
            session_id="sess_a", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        cfg_b = CrawlerRunConfig(
            session_id="sess_b", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        plain = CrawlerRunConfig(verbose=False, cache_mode=CacheMode.BYPASS)

        # Session A starts on the login page.
        opened_a = await crawler.arun(url=f"{test_server}/login", config=cfg_a)
        assert opened_a.success

        # Session B starts on step 1.
        opened_b = await crawler.arun(url=f"{test_server}/step1", config=cfg_b)
        assert opened_b.success

        # Five session-less crawls take the total to 7 (2 sessions + 5).
        for idx in range(5):
            outcome = await crawler.arun(url=_url(test_server, idx), config=plain)
            assert outcome.success

        # Both sessions are still registered and nothing is recycling.
        assert not manager._recycling
        assert "sess_a" in manager.sessions
        assert "sess_b" in manager.sessions

        # Each session can continue its own flow.
        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=cfg_a)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html

        next_step = await crawler.arun(url=f"{test_server}/step2", config=cfg_b)
        assert next_step.success
        assert "Step 2" in next_step.html


# ===========================================================================
# Test 21: Session killed, then recycle fires on next non-session crawl
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_fires_after_session_killed(test_server):
    """Killing the only session lets the next crawl trigger a recycle.

    With a threshold of 3, one session crawl and three plain crawls leave
    the counter at 4. After kill_session and one more plain crawl, the
    counter must have dropped below that value.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=3,
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        plain = CrawlerRunConfig(verbose=False, cache_mode=CacheMode.BYPASS)

        # One page for the session.
        temp = CrawlerRunConfig(
            session_id="temp_sess", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        opened = await crawler.arun(url=f"{test_server}/step1", config=temp)
        assert opened.success

        # Three plain crawls: 4 pages total, above the threshold of 3.
        for idx in range(3):
            outcome = await crawler.arun(url=_url(test_server, idx), config=plain)
            assert outcome.success

        pages_before_kill = manager._pages_served
        assert pages_before_kill == 4
        assert not manager._recycling

        # Drop the session.
        await crawler.crawler_strategy.kill_session("temp_sess")
        assert "temp_sess" not in manager.sessions

        # With the session gone, one more crawl should trip the recycle.
        trigger = await crawler.arun(url=_url(test_server, 99), config=plain)
        assert trigger.success

        await asyncio.sleep(0.5)

        # A smaller counter than before the kill means it was reset.
        assert manager._pages_served < pages_before_kill, (
            f"Expected counter reset after recycle, got {manager._pages_served}"
        )


# ===========================================================================
# Test 22: Concurrent session crawls — same session from multiple tasks
# ===========================================================================

@pytest.mark.asyncio
async def test_concurrent_same_session_crawls(test_server):
    """Run five crawls concurrently on one session_id.

    The crawls are gathered with return_exceptions=True and none of them
    may have raised.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        shared = CrawlerRunConfig(
            session_id="shared_session",
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )

        # Establish the session (and its cookie) before fanning out.
        login = await crawler.arun(url=f"{test_server}/login", config=shared)
        assert login.success

        # Five coroutines, all bound to the same session.
        pending = [
            crawler.arun(url=f"{test_server}/page{idx}", config=shared)
            for idx in range(5)
        ]
        gathered = await asyncio.gather(*pending, return_exceptions=True)

        # Individual crawls may hit navigation conflicts (same page, concurrent
        # goto); this check only requires that none of them raised.
        raised = [item for item in gathered if isinstance(item, Exception)]
        assert not raised, (
            f"Exceptions in concurrent same-session crawls: {raised[:3]}"
        )


# ===========================================================================
# Test 23: Session + recycling — session killed mid-batch, recycle fires,
#           new session works after
# ===========================================================================

@pytest.mark.asyncio
async def test_session_lifecycle_across_recycle(test_server):
    """End-to-end lifecycle around a recycle with a threshold of 4.

    Create and use a session, kill it, push past the threshold with plain
    crawls, then open a second session and run the login → dashboard flow
    on it.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=4,
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        plain = CrawlerRunConfig(verbose=False, cache_mode=CacheMode.BYPASS)

        # Phase 1: first session, login followed by dashboard.
        first_cfg = CrawlerRunConfig(
            session_id="lifecycle_sess", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        login = await crawler.arun(url=f"{test_server}/login", config=first_cfg)
        assert login.success

        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=first_cfg)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html

        # Phase 2: end the first session.
        await crawler.crawler_strategy.kill_session("lifecycle_sess")

        # Phase 3: five plain crawls to cross the threshold.
        for idx in range(5):
            outcome = await crawler.arun(url=_url(test_server, idx), config=plain)
            assert outcome.success

        await asyncio.sleep(0.5)

        # Without a reset the counter would be 6 (1 session page + 5 crawls).
        assert manager._pages_served < 6, (
            f"Expected reset after recycle, got {manager._pages_served}"
        )

        # Phase 4: a brand-new session must work as usual.
        second_cfg = CrawlerRunConfig(
            session_id="lifecycle_sess_v2", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        login = await crawler.arun(url=f"{test_server}/login", config=second_cfg)
        assert login.success
        assert "lifecycle_sess_v2" in manager.sessions

        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=second_cfg)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html, (
            "New session after recycle should work with cookies"
        )


# ===========================================================================
# Test 24: Parallel sessions + non-session crawls with arun_many
# ===========================================================================

@pytest.mark.asyncio
async def test_session_with_arun_many_interleaved(test_server):
    """Keep a session open while an arun_many batch runs beside it.

    With a threshold of 10, one session crawl is followed by an 8-URL
    batch. The batch must fully succeed and the session must still be
    registered and able to load the dashboard.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=10,
        headless=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        # Open the session first.
        batch_sess = CrawlerRunConfig(
            session_id="batch_sess", cache_mode=CacheMode.BYPASS, verbose=False,
        )
        opened = await crawler.arun(url=f"{test_server}/login", config=batch_sess)
        assert opened.success

        # Then run a session-less batch.
        plain = CrawlerRunConfig(verbose=False, cache_mode=CacheMode.BYPASS)
        batch = [_url(test_server, idx) for idx in range(8)]
        outcomes = await crawler.arun_many(batch, config=plain)
        assert all(outcome.success for outcome in outcomes), "All batch crawls should succeed"

        # The session is still there and still authenticated.
        assert "batch_sess" in manager.sessions
        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=batch_sess)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html


# ===========================================================================
# Test 25: Session refcount tracking correctness
# ===========================================================================

@pytest.mark.asyncio
async def test_session_refcount_stays_at_one(test_server):
    """Track the context refcount of a session over its lifetime.

    The refcount for the session's context signature must be 1 after
    creation, still 1 after several reuses, and 0 after kill_session. The
    checks are skipped when no signature is recorded for the session page.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        tracked = CrawlerRunConfig(
            session_id="refcount_test", cache_mode=CacheMode.BYPASS, verbose=False,
        )

        # Creating the session.
        opened = await crawler.arun(url=f"{test_server}/step1", config=tracked)
        assert opened.success

        # Look up the context signature recorded for the session's page.
        _, session_page, _ = manager.sessions["refcount_test"]
        signature = manager._page_to_sig.get(session_page)
        if signature:
            held = manager._context_refcounts.get(signature, 0)
            assert held == 1, (
                f"Session should hold exactly 1 refcount, got {held}"
            )

        # Reusing the session must not add further references.
        for path in ("/step2", "/step3", "/dashboard"):
            reused = await crawler.arun(url=f"{test_server}{path}", config=tracked)
            assert reused.success

        if signature:
            held = manager._context_refcounts.get(signature, 0)
            assert held == 1, (
                f"After reuse, refcount should still be 1, got {held}"
            )

        # Killing the session releases its single reference.
        await crawler.crawler_strategy.kill_session("refcount_test")
        if signature:
            held = manager._context_refcounts.get(signature, 0)
            assert held == 0, (
                f"After kill, refcount should be 0, got {held}"
            )