Fix browser recycling under high concurrency — version-based approach

The previous recycle logic waited for all refcounts to hit 0 before
recycling, which never happened under sustained concurrent load (20+
crawls always had at least one active).

New approach:
- Add _browser_version to config signature — bump it to force new contexts
- When threshold is hit: bump version, move old sigs to _pending_cleanup
- New requests get new contexts automatically (different signature)
- Old contexts drain naturally and get cleaned up when refcount hits 0
- Safety cap: max 3 pending browsers draining at once

This means recycling now works under any load pattern — no blocking,
no waiting for quiet moments. Old and new browsers coexist briefly
during transitions.

Includes 12 new tests covering version bumps, concurrent recycling,
safety cap, and edge cases.
This commit is contained in:
unclecode
2026-02-05 07:48:12 +00:00
parent c046918bb4
commit 3401dd1620
2 changed files with 502 additions and 70 deletions

View File

@@ -734,12 +734,14 @@ class BrowserManager:
# Browser endpoint key for global page tracking (set after browser starts) # Browser endpoint key for global page tracking (set after browser starts)
self._browser_endpoint_key: Optional[str] = None self._browser_endpoint_key: Optional[str] = None
# Browser recycling state # Browser recycling state (version-based approach)
self._pages_served = 0 self._pages_served = 0
self._recycling = False self._browser_version = 1 # included in signature, bump to create new browser
self._recycle_lock = asyncio.Lock() self._pending_cleanup = {} # old_sig -> {"browser": browser, "contexts": [...], "done": Event}
self._recycle_done = asyncio.Event() self._pending_cleanup_lock = asyncio.Lock()
self._recycle_done.set() # starts "open" — not recycling self._max_pending_browsers = 3 # safety cap — block if too many draining
self._cleanup_slot_available = asyncio.Event()
self._cleanup_slot_available.set() # starts open
# Stealth adapter for stealth mode # Stealth adapter for stealth mode
self._stealth_adapter = None self._stealth_adapter = None
@@ -1318,6 +1320,9 @@ class BrowserManager:
sig_dict["simulate_user"] = crawlerRunConfig.simulate_user sig_dict["simulate_user"] = crawlerRunConfig.simulate_user
sig_dict["magic"] = crawlerRunConfig.magic sig_dict["magic"] = crawlerRunConfig.magic
# Browser version — bumped on recycle to force new browser instance
sig_dict["_browser_version"] = self._browser_version
signature_json = json.dumps(sig_dict, sort_keys=True, default=str) signature_json = json.dumps(sig_dict, sort_keys=True, default=str)
return hashlib.sha256(signature_json.encode("utf-8")).hexdigest() return hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
@@ -1434,9 +1439,6 @@ class BrowserManager:
Returns: Returns:
(page, context): The Page and its BrowserContext (page, context): The Page and its BrowserContext
""" """
# Block if browser is being recycled; wakes instantly when done
await self._recycle_done.wait()
self._cleanup_expired_sessions() self._cleanup_expired_sessions()
# If a session_id is provided and we already have it, reuse that page + context # If a session_id is provided and we already have it, reuse that page + context
@@ -1597,6 +1599,11 @@ class BrowserManager:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
self._pages_served += 1 self._pages_served += 1
# Check if browser recycle threshold is hit — bump version for next requests
# This happens AFTER incrementing counter so concurrent requests see correct count
await self._maybe_bump_browser_version()
return page, context return page, context
async def kill_session(self, session_id: str): async def kill_session(self, session_id: str):
@@ -1645,16 +1652,19 @@ class BrowserManager:
release_page() so the context lifecycle is properly tracked. release_page() so the context lifecycle is properly tracked.
""" """
self._release_page_from_use(page) self._release_page_from_use(page)
sig = None
refcount = -1
async with self._contexts_lock: async with self._contexts_lock:
sig = self._page_to_sig.pop(page, None) sig = self._page_to_sig.pop(page, None)
if sig is not None and sig in self._context_refcounts: if sig is not None and sig in self._context_refcounts:
self._context_refcounts[sig] = max( self._context_refcounts[sig] = max(
0, self._context_refcounts[sig] - 1 0, self._context_refcounts[sig] - 1
) )
refcount = self._context_refcounts[sig]
# Check if browser recycle is needed # Check if this signature belongs to an old browser waiting to be cleaned up
if self._should_recycle(): if sig is not None and refcount == 0:
await self._maybe_recycle_browser() await self._maybe_cleanup_old_browser(sig)
def _should_recycle(self) -> bool: def _should_recycle(self) -> bool:
"""Check if page threshold reached for browser recycling.""" """Check if page threshold reached for browser recycling."""
@@ -1663,77 +1673,113 @@ class BrowserManager:
return False return False
return self._pages_served >= limit return self._pages_served >= limit
async def _maybe_recycle_browser(self): async def _maybe_bump_browser_version(self):
"""Recycle browser if no active crawls are in-flight. """Bump browser version if threshold reached, moving old browser to pending cleanup.
Uses asyncio.Event to block new get_page() callers during recycle, New requests automatically get a new browser (via new signature).
and sets _recycling inside _contexts_lock to prevent race conditions. Old browser drains naturally and gets cleaned up when refcount hits 0.
""" """
if self._recycling: if not self._should_recycle():
return return
async with self._recycle_lock: # Safety cap: wait if too many old browsers are draining
if self._recycling: while True:
return async with self._pending_cleanup_lock:
# Re-check threshold under lock (another request may have bumped already)
if not self._should_recycle():
return
# Set _recycling and check refcounts under the SAME lock # Check safety cap
# to prevent a new crawl slipping in between check and flag set if len(self._pending_cleanup) >= self._max_pending_browsers:
if self.logger:
self.logger.debug(
message="Waiting for old browser to drain (pending: {count})",
tag="BROWSER",
params={"count": len(self._pending_cleanup)},
)
self._cleanup_slot_available.clear()
# Release lock and wait
else:
# We have a slot — do the bump inside this lock hold
old_version = self._browser_version
old_sigs = []
async with self._contexts_lock:
for sig in list(self._context_refcounts.keys()):
old_sigs.append(sig)
if self.logger:
self.logger.info(
message="Bumping browser version {old} -> {new} after {count} pages",
tag="BROWSER",
params={
"old": old_version,
"new": old_version + 1,
"count": self._pages_served,
},
)
# Mark old signatures for cleanup when their refcount hits 0
done_event = asyncio.Event()
for sig in old_sigs:
self._pending_cleanup[sig] = {
"version": old_version,
"done": done_event,
}
# Bump version — new get_page() calls will create new contexts
self._browser_version += 1
self._pages_served = 0
return # Done!
# If we get here, we need to wait for a cleanup slot
await self._cleanup_slot_available.wait()
async def _maybe_cleanup_old_browser(self, sig: str):
"""Clean up an old browser's context if its refcount hit 0 and it's pending cleanup."""
async with self._pending_cleanup_lock:
if sig not in self._pending_cleanup:
return # Not an old browser signature
cleanup_info = self._pending_cleanup.pop(sig)
old_version = cleanup_info["version"]
if self.logger:
self.logger.debug(
message="Cleaning up context from browser version {version} (sig: {sig})",
tag="BROWSER",
params={"version": old_version, "sig": sig[:12]},
)
# Remove context from tracking
async with self._contexts_lock: async with self._contexts_lock:
total_active = sum(self._context_refcounts.values()) context = self.contexts_by_config.pop(sig, None)
if total_active > 0: self._context_refcounts.pop(sig, None)
return # active crawls running, next release will re-check self._context_last_used.pop(sig, None)
self._recycling = True
self._recycle_done.clear() # block new get_page() callers
try: # Close context outside locks
if context is not None:
try:
await context.close()
except Exception:
pass
# Check if any signatures from this old version remain
remaining_old = [
s for s, info in self._pending_cleanup.items()
if info["version"] == old_version
]
if not remaining_old:
if self.logger: if self.logger:
self.logger.info( self.logger.info(
message="Recycling browser after {count} pages to reclaim memory", message="All contexts from browser version {version} cleaned up",
tag="BROWSER", tag="BROWSER",
params={"count": self._pages_served}, params={"version": old_version},
) )
# Force full cleanup to kill the browser process and reclaim # Open a cleanup slot if we're below the cap
# memory. For external CDP (cdp_url without cache), temporarily if len(self._pending_cleanup) < self._max_pending_browsers:
# enable cdp_cleanup_on_close so close() actually disconnects. self._cleanup_slot_available.set()
# For cached CDP, close() already handles release correctly.
saved_cdp_cleanup = self.config.cdp_cleanup_on_close
if self.config.cdp_url and not self._using_cached_cdp:
self.config.cdp_cleanup_on_close = True
try:
await self.close()
finally:
self.config.cdp_cleanup_on_close = saved_cdp_cleanup
# close() already clears most tracking dicts, but ensure
# everything is reset for the fresh browser
self.contexts_by_config.clear()
self._context_refcounts.clear()
self._context_last_used.clear()
self._page_to_sig.clear()
self.sessions.clear()
# Clear global page tracking for this endpoint — old pages are dead
if self._browser_endpoint_key and self._browser_endpoint_key in BrowserManager._global_pages_in_use:
BrowserManager._global_pages_in_use[self._browser_endpoint_key].clear()
# Re-create ManagedBrowser if needed — close() sets it to None,
# but start() expects it to exist for the managed browser path.
if self.config.use_managed_browser and self.managed_browser is None:
self.managed_browser = ManagedBrowser(
browser_type=self.config.browser_type,
user_data_dir=self.config.user_data_dir,
headless=self.config.headless,
logger=self.logger,
debugging_port=self.config.debugging_port,
cdp_url=self.config.cdp_url,
browser_config=self.config,
)
await self.start()
self._pages_served = 0
finally:
self._recycling = False
self._recycle_done.set() # wake ALL waiting get_page() callers
def _cleanup_expired_sessions(self): def _cleanup_expired_sessions(self):
"""Clean up expired sessions based on TTL.""" """Clean up expired sessions based on TTL."""

View File

@@ -0,0 +1,386 @@
"""
Tests for version-based browser recycling.
The new recycle approach:
1. When pages_served hits threshold, bump _browser_version
2. Old signatures go to _pending_cleanup
3. New requests get new contexts (different version = different signature)
4. When old context's refcount hits 0, it gets cleaned up
5. No blocking — old and new browsers coexist during transition
These tests use small thresholds (3-5 pages) to verify the mechanics.
"""
import asyncio
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# ---------------------------------------------------------------------------
# Local test server
# ---------------------------------------------------------------------------
# Pre-render 100 static HTML pages keyed by URL path ("/page0" .. "/page99").
PAGES = {
    f"/page{i}": (
        f"<!DOCTYPE html><html><head><title>Page {i}</title></head>"
        f"<body><h1>Page {i}</h1><p>Content for page {i}.</p></body></html>"
    ).encode()
    for i in range(100)
}
class Handler(SimpleHTTPRequestHandler):
    """Serves the pre-rendered PAGES; unknown paths fall back to /page0."""

    def log_message(self, *a):
        # Silence per-request logging so test output stays readable.
        pass

    def do_GET(self):
        payload = PAGES.get(self.path, PAGES["/page0"])
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(payload)
class _Server(HTTPServer):
    # Allow immediate rebinding of the port between test runs — avoids
    # "Address already in use" while a prior socket lingers in TIME_WAIT.
    allow_reuse_address = True
@pytest.fixture(scope="module")
def srv():
    """Run a local HTTP server on a random free port for the whole module.

    Yields the base URL (e.g. "http://127.0.0.1:54321") and shuts the
    server down after the last test in the module finishes.
    """
    server = _Server(("127.0.0.1", 0), Handler)
    port = server.server_address[1]
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    yield f"http://127.0.0.1:{port}"
    server.shutdown()
def _u(base, i):
return f"{base}/page{i}"
def _bm(c):
return c.crawler_strategy.browser_manager
# ===================================================================
# SECTION A — Version bump mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_version_bump_on_threshold(srv):
    """The version bump must occur exactly when _pages_served reaches the threshold."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        assert manager._browser_version == 1
        # Two pages stay below the threshold of 3 — no bump expected yet.
        for page_idx in range(2):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        assert manager._browser_version == 1, "Version should still be 1 after 2 pages"
        assert manager._pages_served == 2
        # The third page reaches the threshold; the bump happens after it is served.
        result = await crawler.arun(url=_u(srv, 2), config=run_cfg)
        assert result.success
        assert manager._browser_version == 2, "Version should bump after 3rd page"
        assert manager._pages_served == 0, "Counter resets after bump"
        # The next page is the first one counted against version 2.
        result = await crawler.arun(url=_u(srv, 3), config=run_cfg)
        assert result.success
        assert manager._pages_served == 1
@pytest.mark.asyncio
async def test_signature_changes_after_version_bump(srv):
    """The same CrawlerRunConfig must hash to a new signature once the version bumps."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Capture the signature while still on the initial version.
        sig_before = manager._make_config_signature(run_cfg)
        # Serve enough pages to hit the threshold of 2...
        for page_idx in range(2):
            await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
        # ...then one more request on the bumped version.
        await crawler.arun(url=_u(srv, 2), config=run_cfg)
        # The version component of the signature should have changed it.
        sig_after = manager._make_config_signature(run_cfg)
        assert sig_before != sig_after, "Signature should change after version bump"
@pytest.mark.asyncio
async def test_no_version_bump_when_disabled(srv):
    """With max_pages_before_recycle=0 recycling is off: version never moves."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=0,  # recycling disabled
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(20):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Neither the version nor the counter should have been reset.
        assert manager._browser_version == 1, "Version should not bump when disabled"
        assert manager._pages_served == 20
# ===================================================================
# SECTION B — Pending cleanup mechanics
# ===================================================================
@pytest.mark.asyncio
async def test_old_signature_goes_to_pending_cleanup(srv):
    """Sequential crawls: the bump happens and old contexts are reclaimed eagerly."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Two pages on version 1 — the threshold of 2 triggers the bump.
        for page_idx in range(2):
            await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
        assert manager._browser_version == 2
        # Sequential crawls release each page before the next request starts,
        # so the old context's refcount is already 0 and cleanup runs right
        # away — nothing should linger in _pending_cleanup. Eager cleanup
        # when possible is the intended behavior.
        assert len(manager._pending_cleanup) == 0
@pytest.mark.asyncio
async def test_cleanup_happens_when_refcount_hits_zero(srv):
    """Old contexts must be closed as soon as their refcount drops to 0."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Sequential crawls mean every page is released before the next one
        # starts, so each old context is cleaned up the moment it drains.
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Multiple bumps happened along the way, but because refcounts hit 0
        # immediately, no stale entries should remain pending.
        assert len(manager._pending_cleanup) == 0, "All old contexts should be cleaned up"
# ===================================================================
# SECTION C — Concurrent crawls with recycling
# ===================================================================
@pytest.mark.asyncio
async def test_concurrent_crawls_dont_block_on_recycle(srv):
    """Recycling must not stall live traffic — old browser drains while the new one serves."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=5,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # Fire 20 crawls at once so version bumps overlap active requests.
        tasks = [crawler.arun(url=_u(srv, n), config=run_cfg) for n in range(20)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Every crawl must complete without raising or failing.
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 20, f"Only {len(successes)} succeeded"
        # 20 pages with a threshold of 5 guarantees at least one bump.
        assert manager._browser_version >= 2, "Should have recycled at least once"
@pytest.mark.asyncio
async def test_high_concurrency_with_small_threshold(srv):
    """Stress: 50 concurrent crawls against a threshold of 3 (many bumps)."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        # 50 simultaneous crawls; the tiny threshold forces frequent recycling.
        tasks = [crawler.arun(url=_u(srv, n % 100), config=run_cfg) for n in range(50)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        excs = [r for r in results if isinstance(r, Exception)]
        assert len(excs) == 0, f"Exceptions: {excs[:3]}"
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 50
# ===================================================================
# SECTION D — Safety cap (max pending browsers)
# ===================================================================
@pytest.mark.asyncio
async def test_safety_cap_limits_pending_browsers(srv):
    """The number of old browsers draining must never exceed _max_pending_browsers."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        manager._max_pending_browsers = 2  # tighten the cap for the test
        # Enough sequential crawls to trigger several bumps under the low cap.
        for page_idx in range(15):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # We can't observe the cap mid-flight, but completing without a
        # deadlock or timeout shows the slot-waiting logic works; the final
        # pending count must still respect the cap.
        assert len(manager._pending_cleanup) <= manager._max_pending_browsers
# ===================================================================
# SECTION E — Managed browser mode
# ===================================================================
@pytest.mark.asyncio
async def test_managed_browser_recycle(srv):
    """Version-based recycling must also work in managed browser mode."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success, f"Page {page_idx} failed"
        # 10 pages over a threshold of 3 guarantees at least one bump.
        assert manager._browser_version >= 2
@pytest.mark.asyncio
async def test_managed_browser_isolated_context_recycle(srv):
    """Recycling must work with managed browser mode plus isolated contexts."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        use_managed_browser=True,
        create_isolated_context=True,
        max_pages_before_recycle=3,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(10):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success, f"Page {page_idx} failed"
        # At least one bump must have happened across 10 pages.
        assert manager._browser_version >= 2
# ===================================================================
# SECTION F — Edge cases
# ===================================================================
@pytest.mark.asyncio
async def test_threshold_of_one(srv):
    """Edge case: threshold=1 bumps the version after every single page."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=1,
    )
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        for page_idx in range(5):
            result = await crawler.arun(url=_u(srv, page_idx), config=run_cfg)
            assert result.success
        # Each served page takes the counter to 1 >= 1, so the version is
        # bumped and the counter reset immediately after every crawl:
        # 5 pages -> 5 bumps on top of the starting version of 1.
        assert manager._browser_version == 6  # Started at 1, bumped 5 times
@pytest.mark.asyncio
async def test_different_configs_get_separate_cleanup_tracking(srv):
    """Two distinct CrawlerRunConfigs must coexist across version bumps."""
    browser_cfg = BrowserConfig(
        headless=True, verbose=False,
        max_pages_before_recycle=2,
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)
        run_a = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        run_b = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, verbose=False,
            override_navigator=True,  # Different config
        )
        # Alternate configs so both signatures live through the bumps.
        configs = (run_a, run_b)
        for page_idx in range(6):
            result = await crawler.arun(url=_u(srv, page_idx), config=configs[page_idx % 2])
            assert result.success
        # Both configs crawled successfully and at least one bump happened.
        assert manager._browser_version >= 2