Changes to browser_manager.py:
1. _verify_cdp_ready(): Support multiple URL formats
- WebSocket URLs (ws://, wss://): Skip HTTP verification, Playwright handles directly
- HTTP URLs with query params: Properly parse with urlparse to preserve query string
- Fixes issue where naive f"{cdp_url}/json/version" broke WS URLs and query params
2. close(): Proper cleanup when cdp_cleanup_on_close=True
- Close all sessions (pages)
- Close all contexts
- Call browser.close() to disconnect (doesn't terminate browser, just releases connection)
- Wait 1 second for CDP connection to fully release
- Stop Playwright instance to prevent memory leaks
This enables:
- Connecting to specific browsers via WS URL
- Reusing the same browser with multiple sequential connections
- No user wait needed between connections (internal 1s delay handles it)
Added tests/browser/test_cdp_cleanup_reuse.py with comprehensive tests.
282 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tests for CDP connection cleanup and browser reuse.
|
|
|
|
These tests verify that:
|
|
1. WebSocket URLs are properly handled (skip HTTP verification)
|
|
2. cdp_cleanup_on_close properly disconnects without terminating the browser
|
|
3. The same browser can be reused by multiple sequential connections
|
|
|
|
Requirements:
|
|
- A CDP-compatible browser pool service running (e.g., chromepoold)
|
|
- Service should be accessible at CDP_SERVICE_URL (default: http://localhost:11235)
|
|
|
|
Usage:
|
|
pytest tests/browser/test_cdp_cleanup_reuse.py -v
|
|
|
|
Or run directly:
|
|
python tests/browser/test_cdp_cleanup_reuse.py
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import pytest
|
|
import requests
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
|
|
# Configuration
|
|
CDP_SERVICE_URL = os.getenv("CDP_SERVICE_URL", "http://localhost:11235")
|
|
|
|
|
|
def is_cdp_service_available():
    """Return True if the CDP pool service answers its /health endpoint.

    Any network failure (connection refused, timeout, DNS error) is treated
    as "not available" rather than raised, so the module-level skip logic
    stays quiet when no service is running.
    """
    try:
        resp = requests.get(f"{CDP_SERVICE_URL}/health", timeout=2)
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed during pytest collection.
        return False
    return resp.status_code == 200
|
|
|
|
|
def create_browser():
    """Ask the pool service to spawn a fresh headless browser.

    Returns the service's JSON payload (contains ``browser_id`` and
    ``ws_url``). Raises ``requests.HTTPError`` on a non-2xx response.
    """
    response = requests.post(
        f"{CDP_SERVICE_URL}/v1/browsers",
        json={"headless": True},
        timeout=10,
    )
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def get_browser_info(browser_id):
    """Look up one browser's record in the pool service's listing.

    Returns the matching dict, or None when the id is unknown.
    """
    resp = requests.get(f"{CDP_SERVICE_URL}/v1/browsers", timeout=5)
    matches = (entry for entry in resp.json() if entry["id"] == browser_id)
    return next(matches, None)
|
|
|
|
|
|
def delete_browser(browser_id):
    """Best-effort teardown of a pooled browser; never raises.

    Used in test ``finally`` blocks, so a dead service must not mask the
    test's real failure.
    """
    try:
        requests.delete(f"{CDP_SERVICE_URL}/v1/browsers/{browser_id}", timeout=5)
    except requests.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C and SystemExit still
        # propagate; network errors during cleanup are deliberately ignored.
        pass
|
|
|
|
|
|
# Evaluated once at import time: if the pool service is unreachable, every
# test in this module is skipped rather than failing on connection errors.
pytestmark = pytest.mark.skipif(
    not is_cdp_service_available(),
    reason=f"CDP service not available at {CDP_SERVICE_URL}"
)
|
|
|
|
|
|
class TestCDPWebSocketURL:
    """Tests for WebSocket URL handling."""

    @pytest.mark.asyncio
    async def test_websocket_url_skips_http_verification(self):
        """WebSocket URLs should skip HTTP /json/version verification."""
        browser = create_browser()
        try:
            ws_url = browser["ws_url"]
            # The pool hands out a raw DevTools WebSocket endpoint.
            assert ws_url.startswith(("ws://", "wss://"))

            cfg = BrowserConfig(
                browser_mode="cdp",
                cdp_url=ws_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=cfg) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success
                assert "Example Domain" in result.metadata.get("title", "")
        finally:
            delete_browser(browser["browser_id"])
|
|
|
|
|
|
class TestCDPCleanupOnClose:
    """Tests for cdp_cleanup_on_close behavior."""

    @pytest.mark.asyncio
    async def test_browser_survives_after_cleanup_close(self):
        """Browser should remain alive after close with cdp_cleanup_on_close=True."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            # Record the process id up front so we can prove the browser
            # process itself was never restarted, only disconnected from.
            info_before = get_browser_info(browser_id)
            assert info_before is not None
            pid_before = info_before["pid"]

            cfg = BrowserConfig(
                browser_mode="cdp",
                cdp_url=ws_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=cfg) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success

            # After close(), the pooled browser must still be registered
            # and must still be the same OS process.
            info_after = get_browser_info(browser_id)
            assert info_after is not None, "Browser was terminated but should only disconnect"
            assert info_after["pid"] == pid_before, "Browser PID changed unexpectedly"
        finally:
            delete_browser(browser_id)
|
|
|
|
|
|
class TestCDPBrowserReuse:
    """Tests for reusing the same browser with multiple connections."""

    @staticmethod
    def _cdp_config(ws_url):
        # Shared config builder: every connection opts into cleanup-on-close.
        return BrowserConfig(
            browser_mode="cdp",
            cdp_url=ws_url,
            headless=True,
            cdp_cleanup_on_close=True,
        )

    @pytest.mark.asyncio
    async def test_sequential_connections_same_browser(self):
        """Multiple sequential connections to the same browser should work."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            targets = [
                "https://example.com",
                "https://httpbin.org/ip",
                "https://httpbin.org/headers",
            ]

            for attempt, target in enumerate(targets, 1):
                async with AsyncWebCrawler(config=self._cdp_config(ws_url)) as crawler:
                    result = await crawler.arun(
                        url=target,
                        config=CrawlerRunConfig(verbose=False),
                    )
                    assert result.success, f"Connection {attempt} failed for {target}"

                # The pooled browser must outlive each disconnect.
                info = get_browser_info(browser_id)
                assert info is not None, f"Browser died after connection {attempt}"

        finally:
            delete_browser(browser_id)

    @pytest.mark.asyncio
    async def test_no_user_wait_needed_between_connections(self):
        """With cdp_cleanup_on_close=True, no user wait should be needed."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        ws_url = browser["ws_url"]

        try:
            # Back-to-back connections with no asyncio.sleep() between
            # them; the manager's internal post-close delay must suffice.
            for i in range(3):
                async with AsyncWebCrawler(config=self._cdp_config(ws_url)) as crawler:
                    result = await crawler.arun(
                        url="https://example.com",
                        config=CrawlerRunConfig(verbose=False),
                    )
                    assert result.success, f"Rapid connection {i+1} failed"
        finally:
            delete_browser(browser_id)
|
|
|
|
|
|
class TestCDPBackwardCompatibility:
    """Tests for backward compatibility with existing CDP usage."""

    @pytest.mark.asyncio
    async def test_http_url_with_browser_id_works(self):
        """HTTP URL with browser_id query param should work (backward compatibility)."""
        browser = create_browser()
        browser_id = browser["browser_id"]
        try:
            # Legacy addressing style: plain HTTP base URL plus a
            # browser_id query parameter instead of a raw WS endpoint.
            http_url = f"{CDP_SERVICE_URL}?browser_id={browser_id}"

            cfg = BrowserConfig(
                browser_mode="cdp",
                cdp_url=http_url,
                headless=True,
                cdp_cleanup_on_close=True,
            )
            async with AsyncWebCrawler(config=cfg) as crawler:
                result = await crawler.arun(
                    url="https://example.com",
                    config=CrawlerRunConfig(verbose=False),
                )
                assert result.success
        finally:
            delete_browser(browser_id)
|
|
|
|
|
|
# Allow running directly (outside pytest) against a live pool service.
if __name__ == "__main__":
    if not is_cdp_service_available():
        print(f"CDP service not available at {CDP_SERVICE_URL}")
        print("Please start a CDP-compatible browser pool service first.")
        exit(1)

    async def run_tests():
        """Run every test coroutine sequentially and print a summary.

        Returns 0 when all tests pass, 1 otherwise (used as exit code).
        """
        banner = "=" * 60
        print(banner)
        print("CDP Cleanup and Browser Reuse Tests")
        print(banner)

        tests = [
            ("WebSocket URL handling", TestCDPWebSocketURL().test_websocket_url_skips_http_verification),
            ("Browser survives after cleanup", TestCDPCleanupOnClose().test_browser_survives_after_cleanup_close),
            ("Sequential connections", TestCDPBrowserReuse().test_sequential_connections_same_browser),
            ("No user wait needed", TestCDPBrowserReuse().test_no_user_wait_needed_between_connections),
            ("HTTP URL with browser_id", TestCDPBackwardCompatibility().test_http_url_with_browser_id_works),
        ]

        results = []
        for name, test_func in tests:
            print(f"\n--- {name} ---")
            try:
                await test_func()
            except Exception as e:
                # A test failure must not abort the remaining tests.
                print(f"FAIL: {e}")
                results.append((name, False))
            else:
                print("PASS")
                results.append((name, True))

        print("\n" + banner)
        print("SUMMARY")
        print(banner)
        for name, passed in results:
            print(f" {name}: {'PASS' if passed else 'FAIL'}")

        all_passed = all(ok for _, ok in results)
        print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
        return 0 if all_passed else 1

    exit(asyncio.run(run_tests()))
|