Fix CDP connection handling: support WS URLs and proper cleanup
Changes to browser_manager.py:
1. _verify_cdp_ready(): Support multiple URL formats
- WebSocket URLs (ws://, wss://): Skip HTTP verification, Playwright handles directly
- HTTP URLs with query params: Properly parse with urlparse to preserve query string
- Fixes issue where naive f"{cdp_url}/json/version" broke WS URLs and query params
2. close(): Proper cleanup when cdp_cleanup_on_close=True
- Close all sessions (pages)
- Close all contexts
- Call browser.close() to disconnect (doesn't terminate browser, just releases connection)
- Wait 1 second for CDP connection to fully release
- Stop Playwright instance to prevent memory leaks
This enables:
- Connecting to specific browsers via WS URL
- Reusing the same browser with multiple sequential connections
- No user wait needed between connections (internal 1s delay handles it)
Added tests/browser/test_cdp_cleanup_reuse.py with comprehensive tests.
This commit adds one new file: tests/browser/test_cdp_cleanup_reuse.py (281 lines).
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for CDP connection cleanup and browser reuse.
|
||||
|
||||
These tests verify that:
|
||||
1. WebSocket URLs are properly handled (skip HTTP verification)
|
||||
2. cdp_cleanup_on_close properly disconnects without terminating the browser
|
||||
3. The same browser can be reused by multiple sequential connections
|
||||
|
||||
Requirements:
|
||||
- A CDP-compatible browser pool service running (e.g., chromepoold)
|
||||
- Service should be accessible at CDP_SERVICE_URL (default: http://localhost:11235)
|
||||
|
||||
Usage:
|
||||
pytest tests/browser/test_cdp_cleanup_reuse.py -v
|
||||
|
||||
Or run directly:
|
||||
python tests/browser/test_cdp_cleanup_reuse.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import pytest
|
||||
import requests
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
# Configuration
|
||||
CDP_SERVICE_URL = os.getenv("CDP_SERVICE_URL", "http://localhost:11235")
|
||||
|
||||
|
||||
def is_cdp_service_available() -> bool:
    """Return True if the CDP browser-pool service answers its health check.

    Sends a GET to ``{CDP_SERVICE_URL}/health`` with a short timeout so test
    collection is not delayed when the service is down.
    """
    try:
        resp = requests.get(f"{CDP_SERVICE_URL}/health", timeout=2)
    except requests.RequestException:
        # Connection refused, DNS failure, or timeout all mean "not available".
        # Catch only requests errors so genuine bugs (e.g. NameError) surface.
        return False
    return resp.status_code == 200
|
||||
|
||||
|
||||
def create_browser():
    """Provision a new headless browser through the CDP service API.

    Returns the service's JSON payload describing the browser (it includes
    ``browser_id`` and ``ws_url``). Raises ``requests.HTTPError`` on a
    non-2xx response.
    """
    endpoint = f"{CDP_SERVICE_URL}/v1/browsers"
    response = requests.post(endpoint, json={"headless": True}, timeout=10)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def get_browser_info(browser_id):
    """Fetch the info record for *browser_id* from the CDP service.

    Returns the matching entry from the service's browser list, or ``None``
    when no browser with that id exists.
    """
    response = requests.get(f"{CDP_SERVICE_URL}/v1/browsers", timeout=5)
    matches = (entry for entry in response.json() if entry["id"] == browser_id)
    return next(matches, None)
|
||||
|
||||
|
||||
def delete_browser(browser_id):
    """Best-effort deletion of a browser via the CDP service API.

    Used in test teardown, so network-level failures (service already gone,
    browser already deleted, timeout) are deliberately ignored. Only
    ``requests`` errors are swallowed — not arbitrary exceptions that would
    hide real bugs in the teardown path.
    """
    try:
        requests.delete(f"{CDP_SERVICE_URL}/v1/browsers/{browser_id}", timeout=5)
    except requests.RequestException:
        pass
|
||||
|
||||
|
||||
# Skip all tests if CDP service is not available.
# Assigning to the module-level name `pytestmark` applies this marker to
# every test in the module at collection time; the health check runs once
# at import.
pytestmark = pytest.mark.skipif(
    not is_cdp_service_available(),
    reason=f"CDP service not available at {CDP_SERVICE_URL}"
)
|
||||
|
||||
|
||||
class TestCDPWebSocketURL:
    """Tests for WebSocket URL handling."""

    @pytest.mark.asyncio
    async def test_websocket_url_skips_http_verification(self):
        """WebSocket URLs should skip HTTP /json/version verification."""
        browser = create_browser()
        try:
            ws_url = browser["ws_url"]
            assert ws_url.startswith(("ws://", "wss://"))

            browser_config = BrowserConfig(
                browser_mode="cdp",
                cdp_url=ws_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success
                assert "Example Domain" in result.metadata.get("title", "")
        finally:
            delete_browser(browser["browser_id"])
|
||||
|
||||
|
||||
class TestCDPCleanupOnClose:
    """Tests for cdp_cleanup_on_close behavior."""

    @pytest.mark.asyncio
    async def test_browser_survives_after_cleanup_close(self):
        """Browser should remain alive after close with cdp_cleanup_on_close=True."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            # Verify browser exists and capture its PID for later comparison.
            info_before = get_browser_info(browser_id)
            assert info_before is not None
            pid_before = info_before["pid"]

            # Connect, crawl, and close with cleanup enabled.
            browser_config = BrowserConfig(
                browser_mode="cdp",
                cdp_url=ws_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success

            # The close above must only disconnect; the pooled browser
            # process itself should survive with the same PID.
            info_after = get_browser_info(browser_id)
            assert info_after is not None, "Browser was terminated but should only disconnect"
            assert info_after["pid"] == pid_before, "Browser PID changed unexpectedly"
        finally:
            delete_browser(browser_id)
|
||||
|
||||
|
||||
class TestCDPBrowserReuse:
    """Tests for reusing the same browser with multiple connections."""

    @pytest.mark.asyncio
    async def test_sequential_connections_same_browser(self):
        """Multiple sequential connections to the same browser should work."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            urls = [
                "https://example.com",
                "https://httpbin.org/ip",
                "https://httpbin.org/headers",
            ]

            for i, url in enumerate(urls, 1):
                # Each connection uses cdp_cleanup_on_close=True so the
                # browser is released (not killed) between iterations.
                browser_config = BrowserConfig(
                    browser_mode="cdp",
                    cdp_url=ws_url,
                    headless=True,
                    cdp_cleanup_on_close=True,
                )
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(
                        url=url,
                        config=CrawlerRunConfig(verbose=False),
                    )
                    assert result.success, f"Connection {i} failed for {url}"

                # Verify browser is still healthy after disconnecting.
                info = get_browser_info(browser_id)
                assert info is not None, f"Browser died after connection {i}"

        finally:
            delete_browser(browser_id)

    @pytest.mark.asyncio
    async def test_no_user_wait_needed_between_connections(self):
        """With cdp_cleanup_on_close=True, no user wait should be needed."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            # Rapid-fire connections with NO sleep between them.
            for i in range(3):
                browser_config = BrowserConfig(
                    browser_mode="cdp",
                    cdp_url=ws_url,
                    headless=True,
                    cdp_cleanup_on_close=True,
                )
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(
                        url="https://example.com",
                        config=CrawlerRunConfig(verbose=False),
                    )
                    assert result.success, f"Rapid connection {i+1} failed"
                # NO asyncio.sleep() here - internal delay should be sufficient
        finally:
            delete_browser(browser_id)
|
||||
|
||||
|
||||
class TestCDPBackwardCompatibility:
    """Tests for backward compatibility with existing CDP usage."""

    @pytest.mark.asyncio
    async def test_http_url_with_browser_id_works(self):
        """HTTP URL with browser_id query param should work (backward compatibility)."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        try:
            # Use an HTTP URL carrying the browser_id as a query parameter,
            # the pre-WS addressing scheme that must keep working.
            http_url = f"{CDP_SERVICE_URL}?browser_id={browser_id}"

            browser_config = BrowserConfig(
                browser_mode="cdp",
                cdp_url=http_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success
        finally:
            delete_browser(browser_id)
|
||||
|
||||
|
||||
# Allow running directly (without pytest): executes each test coroutine in
# sequence, prints a PASS/FAIL summary, and exits non-zero on any failure.
if __name__ == "__main__":
    if not is_cdp_service_available():
        print(f"CDP service not available at {CDP_SERVICE_URL}")
        print("Please start a CDP-compatible browser pool service first.")
        # raise SystemExit instead of the site-module `exit()` builtin,
        # which is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit(1)

    async def run_tests():
        """Run every test coroutine and return a process exit code (0/1)."""
        print("=" * 60)
        print("CDP Cleanup and Browser Reuse Tests")
        print("=" * 60)

        tests = [
            ("WebSocket URL handling", TestCDPWebSocketURL().test_websocket_url_skips_http_verification),
            ("Browser survives after cleanup", TestCDPCleanupOnClose().test_browser_survives_after_cleanup_close),
            ("Sequential connections", TestCDPBrowserReuse().test_sequential_connections_same_browser),
            ("No user wait needed", TestCDPBrowserReuse().test_no_user_wait_needed_between_connections),
            ("HTTP URL with browser_id", TestCDPBackwardCompatibility().test_http_url_with_browser_id_works),
        ]

        results = []
        for name, test_func in tests:
            print(f"\n--- {name} ---")
            try:
                await test_func()
                print("PASS")
                results.append((name, True))
            except Exception as e:
                # Broad catch is intentional here: this is a top-level test
                # harness boundary; each failure is reported, not re-raised.
                print(f"FAIL: {e}")
                results.append((name, False))

        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        for name, passed in results:
            print(f"  {name}: {'PASS' if passed else 'FAIL'}")

        all_passed = all(passed for _, passed in results)
        print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
        return 0 if all_passed else 1

    raise SystemExit(asyncio.run(run_tests()))
|
||||
Reference in New Issue
Block a user