diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f0bb0be0..309f6b0f 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -717,13 +717,38 @@ class BrowserManager: self.default_context = self.browser async def _verify_cdp_ready(self, cdp_url: str) -> bool: - """Verify CDP endpoint is ready with exponential backoff""" + """Verify CDP endpoint is ready with exponential backoff. + + Supports multiple URL formats: + - HTTP URLs: http://localhost:9222 + - HTTP URLs with query params: http://localhost:9222?browser_id=XXX + - WebSocket URLs: ws://localhost:9222/devtools/browser/XXX + """ import aiohttp - self.logger.debug(f"Starting CDP verification for {cdp_url}", tag="BROWSER") + from urllib.parse import urlparse, urlunparse + + # If WebSocket URL, Playwright handles connection directly - skip HTTP verification + if cdp_url.startswith(('ws://', 'wss://')): + self.logger.debug(f"WebSocket CDP URL provided, skipping HTTP verification", tag="BROWSER") + return True + + # Parse HTTP URL and properly construct /json/version endpoint + parsed = urlparse(cdp_url) + # Build URL with /json/version path, preserving query params + verify_url = urlunparse(( + parsed.scheme, + parsed.netloc, + '/json/version', # Always use this path for verification + '', # params + parsed.query, # preserve query string + '' # fragment + )) + + self.logger.debug(f"Starting CDP verification for {verify_url}", tag="BROWSER") for attempt in range(5): try: async with aiohttp.ClientSession() as session: - async with session.get(f"{cdp_url}/json/version", timeout=aiohttp.ClientTimeout(total=2)) as response: + async with session.get(verify_url, timeout=aiohttp.ClientTimeout(total=2)) as response: if response.status == 200: self.logger.debug(f"CDP endpoint ready after {attempt + 1} attempts", tag="BROWSER") return True @@ -1257,12 +1282,42 @@ class BrowserManager: async def close(self): """Close all browser resources and clean up.""" if self.config.cdp_url: - # When using external CDP, we don't own the browser so skip most cleanup. - # But if cdp_cleanup_on_close is True, still clean up local Playwright client - # to prevent memory leaks in server/cloud scenarios. - if self.config.cdp_cleanup_on_close and self.playwright: - await self.playwright.stop() - self.playwright = None + # When using external CDP, we don't own the browser process. + # If cdp_cleanup_on_close is True, properly disconnect from the browser + # and clean up Playwright resources. This frees the browser for other clients. + if self.config.cdp_cleanup_on_close: + # First close all sessions (pages) + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + # Close all contexts we created + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception: + pass + self.contexts_by_config.clear() + + # Disconnect from browser (doesn't terminate it, just releases connection) + if self.browser: + try: + await self.browser.close() + except Exception as e: + if self.logger: + self.logger.debug( + message="Error disconnecting from CDP browser: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + self.browser = None + # Allow time for CDP connection to fully release before another client connects + await asyncio.sleep(1.0) + + # Stop Playwright instance to prevent memory leaks + if self.playwright: + await self.playwright.stop() + self.playwright = None return if self.config.sleep_on_close: diff --git a/tests/browser/test_cdp_cleanup_reuse.py b/tests/browser/test_cdp_cleanup_reuse.py new file mode 100644 index 00000000..dc5a93f0 --- /dev/null +++ b/tests/browser/test_cdp_cleanup_reuse.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Tests for CDP connection cleanup and browser reuse. + +These tests verify that: +1. WebSocket URLs are properly handled (skip HTTP verification) +2. cdp_cleanup_on_close properly disconnects without terminating the browser +3. The same browser can be reused by multiple sequential connections + +Requirements: +- A CDP-compatible browser pool service running (e.g., chromepoold) +- Service should be accessible at CDP_SERVICE_URL (default: http://localhost:11235) + +Usage: + pytest tests/browser/test_cdp_cleanup_reuse.py -v + +Or run directly: + python tests/browser/test_cdp_cleanup_reuse.py +""" + +import asyncio +import os +import pytest +import requests +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +# Configuration +CDP_SERVICE_URL = os.getenv("CDP_SERVICE_URL", "http://localhost:11235") + + +def is_cdp_service_available(): + """Check if CDP service is running.""" + try: + resp = requests.get(f"{CDP_SERVICE_URL}/health", timeout=2) + return resp.status_code == 200 + except: + return False + + +def create_browser(): + """Create a browser via CDP service API.""" + resp = requests.post( + f"{CDP_SERVICE_URL}/v1/browsers", + json={"headless": True}, + timeout=10 + ) + resp.raise_for_status() + return resp.json() + + +def get_browser_info(browser_id): + """Get browser info from CDP service.""" + resp = requests.get(f"{CDP_SERVICE_URL}/v1/browsers", timeout=5) + for browser in resp.json(): + if browser["id"] == browser_id: + return browser + return None + + +def delete_browser(browser_id): + """Delete a browser via CDP service API.""" + try: + requests.delete(f"{CDP_SERVICE_URL}/v1/browsers/{browser_id}", timeout=5) + except: + pass + + +# Skip all tests if CDP service is not available +pytestmark = pytest.mark.skipif( + not is_cdp_service_available(), + reason=f"CDP service not available at {CDP_SERVICE_URL}" +) + + +class TestCDPWebSocketURL: + """Tests for WebSocket URL handling.""" + + @pytest.mark.asyncio + async def test_websocket_url_skips_http_verification(self): + """WebSocket URLs should skip HTTP /json/version verification.""" + browser = create_browser() + try: + ws_url = browser["ws_url"] + assert ws_url.startswith("ws://") or ws_url.startswith("wss://") + + async with AsyncWebCrawler( + config=BrowserConfig( + browser_mode="cdp", + cdp_url=ws_url, + headless=True, + cdp_cleanup_on_close=True, + ) + ) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(verbose=False), + ) + assert result.success + assert "Example Domain" in result.metadata.get("title", "") + finally: + delete_browser(browser["browser_id"]) + + +class TestCDPCleanupOnClose: + """Tests for cdp_cleanup_on_close behavior.""" + + @pytest.mark.asyncio + async def test_browser_survives_after_cleanup_close(self): + """Browser should remain alive after close with cdp_cleanup_on_close=True.""" + browser = create_browser() + browser_id = browser["browser_id"] + ws_url = browser["ws_url"] + + try: + # Verify browser exists + info_before = get_browser_info(browser_id) + assert info_before is not None + pid_before = info_before["pid"] + + # Connect, crawl, and close with cleanup + async with AsyncWebCrawler( + config=BrowserConfig( + browser_mode="cdp", + cdp_url=ws_url, + headless=True, + cdp_cleanup_on_close=True, + ) + ) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(verbose=False), + ) + assert result.success + + # Browser should still exist with same PID + info_after = get_browser_info(browser_id) + assert info_after is not None, "Browser was terminated but should only disconnect" + assert info_after["pid"] == pid_before, "Browser PID changed unexpectedly" + finally: + delete_browser(browser_id) + + +class TestCDPBrowserReuse: + """Tests for reusing the same browser with multiple connections.""" + + @pytest.mark.asyncio + async def test_sequential_connections_same_browser(self): + """Multiple sequential connections to the same browser should work.""" + browser = create_browser() + browser_id = browser["browser_id"] + ws_url = browser["ws_url"] + + try: + urls = [ + "https://example.com", + "https://httpbin.org/ip", + "https://httpbin.org/headers", + ] + + for i, url in enumerate(urls, 1): + # Each connection uses cdp_cleanup_on_close=True + async with AsyncWebCrawler( + config=BrowserConfig( + browser_mode="cdp", + cdp_url=ws_url, + headless=True, + cdp_cleanup_on_close=True, + ) + ) as crawler: + result = await crawler.arun( + url=url, + config=CrawlerRunConfig(verbose=False), + ) + assert result.success, f"Connection {i} failed for {url}" + + # Verify browser is still healthy + info = get_browser_info(browser_id) + assert info is not None, f"Browser died after connection {i}" + + finally: + delete_browser(browser_id) + + @pytest.mark.asyncio + async def test_no_user_wait_needed_between_connections(self): + """With cdp_cleanup_on_close=True, no user wait should be needed.""" + browser = create_browser() + browser_id = browser["browser_id"] + ws_url = browser["ws_url"] + + try: + # Rapid-fire connections with NO sleep between them + for i in range(3): + async with AsyncWebCrawler( + config=BrowserConfig( + browser_mode="cdp", + cdp_url=ws_url, + headless=True, + cdp_cleanup_on_close=True, + ) + ) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(verbose=False), + ) + assert result.success, f"Rapid connection {i+1} failed" + # NO asyncio.sleep() here - internal delay should be sufficient + finally: + delete_browser(browser_id) + + +class TestCDPBackwardCompatibility: + """Tests for backward compatibility with existing CDP usage.""" + + @pytest.mark.asyncio + async def test_http_url_with_browser_id_works(self): + """HTTP URL with browser_id query param should work (backward compatibility).""" + browser = create_browser() + browser_id = browser["browser_id"] + try: + # Use HTTP URL with browser_id query parameter + http_url = f"{CDP_SERVICE_URL}?browser_id={browser_id}" + + async with AsyncWebCrawler( + config=BrowserConfig( + browser_mode="cdp", + cdp_url=http_url, + headless=True, + cdp_cleanup_on_close=True, + ) + ) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(verbose=False), + ) + assert result.success + finally: + delete_browser(browser_id) + + +# Allow running directly +if __name__ == "__main__": + if not is_cdp_service_available(): + print(f"CDP service not available at {CDP_SERVICE_URL}") + print("Please start a CDP-compatible browser pool service first.") + exit(1) + + async def run_tests(): + print("=" * 60) + print("CDP Cleanup and Browser Reuse Tests") + print("=" * 60) + + tests = [ + ("WebSocket URL handling", TestCDPWebSocketURL().test_websocket_url_skips_http_verification), + ("Browser survives after cleanup", TestCDPCleanupOnClose().test_browser_survives_after_cleanup_close), + ("Sequential connections", TestCDPBrowserReuse().test_sequential_connections_same_browser), + ("No user wait needed", TestCDPBrowserReuse().test_no_user_wait_needed_between_connections), + ("HTTP URL with browser_id", TestCDPBackwardCompatibility().test_http_url_with_browser_id_works), + ] + + results = [] + for name, test_func in tests: + print(f"\n--- {name} ---") + try: + await test_func() + print(f"PASS") + results.append((name, True)) + except Exception as e: + print(f"FAIL: {e}") + results.append((name, False)) + + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + for name, passed in results: + print(f" {name}: {'PASS' if passed else 'FAIL'}") + + all_passed = all(r[1] for r in results) + print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}") + return 0 if all_passed else 1 + + exit(asyncio.run(run_tests()))