- Use class-level tracking keyed by the normalized CDP URL.
- All BrowserManager instances connecting to the same browser share the tracking state.
- For CDP connections, always create new pages (cross-connection page sharing isn't reliable in Playwright).
- For managed browsers, page reuse works within the same process.
- Normalize CDP URLs so that different formats (http, ws, query params) map to the same key (see the sketch below).
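
A minimal sketch of the URL normalization, assuming a hypothetical helper name (the real BrowserManager internals may differ):

    from urllib.parse import urlparse

    def normalize_cdp_url(cdp_url: str) -> str:
        """Collapse equivalent CDP URLs into one tracking key."""
        parsed = urlparse(cdp_url)
        host = parsed.hostname or "localhost"
        port = parsed.port or 9222  # assumed default CDP port
        # Scheme (http vs ws), path, and query params are deliberately dropped,
        # so "http://localhost:9222" and
        # "ws://localhost:9222/devtools/browser/abc?token=1" share one key.
        return f"{host}:{port}"

With a key like this, two BrowserManager instances handed differently formatted URLs for the same browser land in the same tracking bucket.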
"""
|
|
Real integration tests for page reuse race condition fix.
|
|
|
|
Tests that when create_isolated_context=False:
|
|
1. Single crawls still work correctly
|
|
2. Concurrent crawls don't cause race conditions
|
|
3. Pages are properly tracked and released
|
|
4. Page reuse works when pages become available
|
|
|
|
These are REAL tests - no mocking, actual browser operations.
|
|
"""

import asyncio
import os
import sys
import time

# Add the project root to Python path if running directly
if __name__ == "__main__":
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


async def test_single_crawl_still_works():
    """
    Test 1: Basic single crawl functionality still works with create_isolated_context=False.
    This ensures we haven't broken existing functionality.
    """
    print("\n" + "="*70)
    print("TEST 1: Single crawl with create_isolated_context=False")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun("https://example.com")

            assert result.success, f"Crawl failed: {result.error_message}"
            assert result.html, "No HTML content returned"
            assert "Example Domain" in result.html, "Expected content not found"

            print(f" Status: {result.status_code}")
            print(f" HTML length: {len(result.html)} chars")
            print(" PASSED: Single crawl works correctly")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        return False


async def test_sequential_crawls_work():
    """
    Test 2: Sequential crawls reuse the same page (when released).
    This tests that page tracking and release work correctly.
    """
    print("\n" + "="*70)
    print("TEST 2: Sequential crawls with page reuse")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://example.org",
    ]

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            results = []
            for url in urls:
                result = await crawler.arun(url)
                results.append(result)
                print(f" Crawled {url}: success={result.success}, status={result.status_code}")

            # All should succeed
            for i, result in enumerate(results):
                assert result.success, f"Crawl {i+1} failed: {result.error_message}"

            print(" PASSED: Sequential crawls work correctly")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


async def test_concurrent_crawls_no_race_condition():
    """
    Test 3: Multiple concurrent crawls don't cause race conditions.
    This is the main bug we're fixing - concurrent crawls should each get their own page.
    """
    print("\n" + "="*70)
    print("TEST 3: Concurrent crawls with create_isolated_context=False")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    # Use different URLs to ensure they can't accidentally succeed by being on the same page
    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://example.org",
        "https://httpbin.org/get",
        "https://www.iana.org/domains/reserved",
    ]

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            print(f" Launching {len(urls)} concurrent crawls...")
            start_time = time.time()

            # Launch all crawls concurrently
            tasks = [crawler.arun(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            elapsed = time.time() - start_time
            print(f" Completed in {elapsed:.2f}s")

            # Check results
            success_count = 0
            for i, (url, result) in enumerate(zip(urls, results)):
                if isinstance(result, Exception):
                    print(f" [{i+1}] {url}: EXCEPTION - {result}")
                elif result.success:
                    success_count += 1
                    print(f" [{i+1}] {url}: OK (status={result.status_code})")
                else:
                    print(f" [{i+1}] {url}: FAILED - {result.error_message}")

            # All should succeed
            assert success_count == len(urls), f"Only {success_count}/{len(urls)} succeeded"

            print(f" PASSED: All {len(urls)} concurrent crawls succeeded without race conditions")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


async def test_high_concurrency_stress():
    """
    Test 4: High concurrency stress test - many concurrent crawls.
    This stresses the page tracking system to ensure it handles many concurrent operations.
    """
    print("\n" + "="*70)
    print("TEST 4: High concurrency stress test (10 concurrent crawls)")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    # Generate multiple unique URLs
    base_urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://example.org",
        "https://httpbin.org/get",
        "https://www.iana.org/domains/reserved",
    ]

    # Create 10 URLs by adding query params
    urls = []
    for i in range(10):
        url = f"{base_urls[i % len(base_urls)]}?test={i}&t={int(time.time())}"
        urls.append(url)

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            print(f" Launching {len(urls)} concurrent crawls...")
            start_time = time.time()

            # Launch all crawls concurrently
            tasks = [crawler.arun(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            elapsed = time.time() - start_time
            print(f" Completed in {elapsed:.2f}s")

            # Count results
            success_count = 0
            error_count = 0
            exception_count = 0

            for url, result in zip(urls, results):
                if isinstance(result, Exception):
                    exception_count += 1
                elif result.success:
                    success_count += 1
                else:
                    error_count += 1

            print(f" Results: {success_count} success, {error_count} errors, {exception_count} exceptions")

            # At least 80% should succeed (allowing for some network issues)
            min_success = int(len(urls) * 0.8)
            assert success_count >= min_success, f"Only {success_count}/{len(urls)} succeeded (min: {min_success})"

            print(f" PASSED: High concurrency test ({success_count}/{len(urls)} succeeded)")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


async def test_page_tracking_internal_state():
    """
    Test 5: Verify internal page tracking state is correct.
    This directly tests the global page tracking mechanism.
    """
    print("\n" + "="*70)
    print("TEST 5: Internal page tracking state verification")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            browser_manager = crawler.crawler_strategy.browser_manager

            # Check endpoint key is set
            endpoint_key = browser_manager._browser_endpoint_key
            print(f" Browser endpoint key: {endpoint_key}")
            assert endpoint_key, "Endpoint key should be set"

            # Initially, no pages should be in use
            initial_in_use = len(browser_manager._get_pages_in_use())
            print(f" Initial pages in use: {initial_in_use}")

            # Do a crawl
            result = await crawler.arun("https://example.com")
            assert result.success, f"Crawl failed: {result.error_message}"

            # After crawl completes, page should be released
            after_crawl_in_use = len(browser_manager._get_pages_in_use())
            print(f" Pages in use after crawl: {after_crawl_in_use}")

            # The page should have been released (or kept as the last page).
            # Either way, tracking should be consistent.

            # Do another crawl - should work fine
            result2 = await crawler.arun("https://example.org")
            assert result2.success, f"Second crawl failed: {result2.error_message}"

            final_in_use = len(browser_manager._get_pages_in_use())
            print(f" Pages in use after second crawl: {final_in_use}")

            print(" PASSED: Page tracking state is consistent")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


async def test_mixed_sequential_and_concurrent():
    """
    Test 6: Mixed sequential and concurrent crawls.
    Tests a realistic usage pattern where some crawls are sequential and some are concurrent.
    """
    print("\n" + "="*70)
    print("TEST 6: Mixed sequential and concurrent crawls")
    print("="*70)

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Sequential crawl 1
            print(" Phase 1: Sequential crawl")
            result1 = await crawler.arun("https://example.com")
            assert result1.success, "Sequential crawl 1 failed"
            print(" Crawl 1: OK")

            # Concurrent crawls
            print(" Phase 2: Concurrent crawls (3 URLs)")
            concurrent_urls = [
                "https://httpbin.org/html",
                "https://example.org",
                "https://httpbin.org/get",
            ]
            tasks = [crawler.arun(url) for url in concurrent_urls]
            concurrent_results = await asyncio.gather(*tasks, return_exceptions=True)

            for i, result in enumerate(concurrent_results):
                if isinstance(result, Exception):
                    print(f" Concurrent {i+1}: EXCEPTION - {result}")
                else:
                    assert result.success, f"Concurrent crawl {i+1} failed"
                    print(f" Concurrent {i+1}: OK")

            # Sequential crawl 2
            print(" Phase 3: Sequential crawl")
            result2 = await crawler.arun("https://www.iana.org/domains/reserved")
            assert result2.success, "Sequential crawl 2 failed"
            print(" Crawl 2: OK")

            # Another batch of concurrent crawls
            print(" Phase 4: More concurrent crawls (2 URLs)")
            tasks2 = [
                crawler.arun("https://example.com?test=1"),
                crawler.arun("https://example.org?test=2"),
            ]
            results2 = await asyncio.gather(*tasks2, return_exceptions=True)
            for i, result in enumerate(results2):
                if isinstance(result, Exception):
                    print(f" Concurrent {i+1}: EXCEPTION - {result}")
                else:
                    assert result.success, f"Batch 2 crawl {i+1} failed"
                    print(f" Concurrent {i+1}: OK")

            print(" PASSED: Mixed sequential and concurrent crawls work correctly")
            return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


async def test_compare_isolated_vs_shared_context():
    """
    Test 7: Compare behavior between isolated and shared context modes.
    Both should work for concurrent crawls now.
    """
    print("\n" + "="*70)
    print("TEST 7: Compare isolated vs shared context modes")
    print("="*70)

    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://example.org",
    ]

    # Test with create_isolated_context=True
    print(" Testing with create_isolated_context=True:")
    browser_config_isolated = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=True,
    )

    try:
        async with AsyncWebCrawler(config=browser_config_isolated) as crawler:
            tasks = [crawler.arun(url) for url in urls]
            results_isolated = await asyncio.gather(*tasks, return_exceptions=True)

            isolated_success = sum(1 for r in results_isolated if not isinstance(r, Exception) and r.success)
            print(f" Isolated context: {isolated_success}/{len(urls)} succeeded")
    except Exception as e:
        print(f" Isolated context: FAILED - {e}")
        isolated_success = 0

    # Test with create_isolated_context=False
    print(" Testing with create_isolated_context=False:")
    browser_config_shared = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        create_isolated_context=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config_shared) as crawler:
            tasks = [crawler.arun(url) for url in urls]
            results_shared = await asyncio.gather(*tasks, return_exceptions=True)

            shared_success = sum(1 for r in results_shared if not isinstance(r, Exception) and r.success)
            print(f" Shared context: {shared_success}/{len(urls)} succeeded")
    except Exception as e:
        print(f" Shared context: FAILED - {e}")
        shared_success = 0

    # Both modes should work
    assert isolated_success == len(urls), f"Isolated context: only {isolated_success}/{len(urls)} succeeded"
    assert shared_success == len(urls), f"Shared context: only {shared_success}/{len(urls)} succeeded"

    print(" PASSED: Both context modes work correctly for concurrent crawls")
    return True


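# Both configs in the test below use the same http:// CDP URL; per the
# normalization in the design notes, equivalent forms (http vs ws, extra
# query params) should map to the same endpoint key - the "Keys match"
# line inside the test makes that visible.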
async def test_multiple_crawlers_same_cdp():
    """
    Test 8: Multiple AsyncWebCrawler instances connecting to the same CDP endpoint.

    This tests the realistic scenario where:
    1. A browser is started externally (or by a managed browser)
    2. Multiple crawler instances connect to it via its CDP URL
    3. All use create_isolated_context=False to share cookies/session
    4. Each should get its own page to avoid race conditions
    """
    print("\n" + "="*70)
    print("TEST 8: Multiple crawlers connecting to same CDP endpoint")
    print("="*70)

    import shutil
    import subprocess
    import tempfile

    # Start a browser manually using subprocess
    port = 9444
    temp_dir = tempfile.mkdtemp(prefix="browser-test-")

    browser_process = None
    try:
        # Start chromium with remote debugging - prefer Playwright's bundled chromium
        chrome_path = os.path.expanduser("~/.cache/ms-playwright/chromium-1200/chrome-linux64/chrome")
        if not os.path.exists(chrome_path):
            # Fallback - try common system locations
            for path in [
                "/usr/bin/chromium",
                "/usr/bin/chromium-browser",
                "/usr/bin/google-chrome",
            ]:
                if os.path.exists(path):
                    chrome_path = path
                    break

        cmd = [
            chrome_path,
            f"--remote-debugging-port={port}",
            f"--user-data-dir={temp_dir}",
            "--headless=new",
            "--no-sandbox",
            "--disable-gpu",
            "--disable-dev-shm-usage",
        ]

        browser_process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        await asyncio.sleep(2)  # Wait for the browser to start

        cdp_url = f"http://localhost:{port}"
        print(f" Started browser at {cdp_url}")

        # Both crawlers connect via the same CDP URL
        browser_config1 = BrowserConfig(
            headless=True,
            cdp_url=cdp_url,
            create_isolated_context=False,
        )
        browser_config2 = BrowserConfig(
            headless=True,
            cdp_url=cdp_url,
            create_isolated_context=False,
        )

        urls_crawler1 = [
            "https://example.com?crawler=1",
            "https://example.org?crawler=1",
        ]
        urls_crawler2 = [
            "https://httpbin.org/html?crawler=2",
            "https://httpbin.org/get?crawler=2",
        ]

        async with AsyncWebCrawler(config=browser_config1) as crawler1:
            async with AsyncWebCrawler(config=browser_config2) as crawler2:
                bm1 = crawler1.crawler_strategy.browser_manager
                bm2 = crawler2.crawler_strategy.browser_manager

                print(f" Crawler 1 endpoint key: {bm1._browser_endpoint_key}")
                print(f" Crawler 2 endpoint key: {bm2._browser_endpoint_key}")
                print(f" Keys match: {bm1._browser_endpoint_key == bm2._browser_endpoint_key}")

                # Launch concurrent crawls from BOTH crawlers simultaneously
                print(f" Launching {len(urls_crawler1) + len(urls_crawler2)} concurrent crawls...")

                tasks1 = [crawler1.arun(url) for url in urls_crawler1]
                tasks2 = [crawler2.arun(url) for url in urls_crawler2]

                all_results = await asyncio.gather(
                    *tasks1, *tasks2,
                    return_exceptions=True
                )

                # Check results
                success_count = 0
                for i, result in enumerate(all_results):
                    crawler_id = 1 if i < len(urls_crawler1) else 2
                    url_idx = i if i < len(urls_crawler1) else i - len(urls_crawler1)

                    if isinstance(result, Exception):
                        print(f" Crawler {crawler_id}, URL {url_idx+1}: EXCEPTION - {result}")
                    elif result.success:
                        success_count += 1
                        print(f" Crawler {crawler_id}, URL {url_idx+1}: OK")
                    else:
                        print(f" Crawler {crawler_id}, URL {url_idx+1}: FAILED - {result.error_message}")

                total = len(urls_crawler1) + len(urls_crawler2)
                assert success_count == total, f"Only {success_count}/{total} succeeded"

                print(f" PASSED: All {total} concurrent crawls from 2 crawlers succeeded")
                return True

    except Exception as e:
        print(f" FAILED: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        # Clean up the browser process
        if browser_process:
            browser_process.terminate()
            try:
                browser_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                browser_process.kill()
        # Clean up the temp dir
        shutil.rmtree(temp_dir, ignore_errors=True)


async def run_all_tests():
    """Run all tests and report results."""
    print("\n" + "#"*70)
    print("# PAGE REUSE RACE CONDITION FIX - INTEGRATION TESTS")
    print("#"*70)

    tests = [
        ("Single crawl works", test_single_crawl_still_works),
        ("Sequential crawls work", test_sequential_crawls_work),
        ("Concurrent crawls no race", test_concurrent_crawls_no_race_condition),
        ("High concurrency stress", test_high_concurrency_stress),
        ("Page tracking state", test_page_tracking_internal_state),
        ("Mixed sequential/concurrent", test_mixed_sequential_and_concurrent),
        ("Isolated vs shared context", test_compare_isolated_vs_shared_context),
        ("Multiple crawlers same CDP", test_multiple_crawlers_same_cdp),  # starts its own browser
    ]

    results = []
    for name, test_func in tests:
        try:
            passed = await test_func()
            results.append((name, passed))
        except Exception as e:
            print(f" EXCEPTION in {name}: {e}")
            results.append((name, False))

    # Summary
    print("\n" + "="*70)
    print("TEST SUMMARY")
    print("="*70)

    passed = sum(1 for _, p in results if p)
    total = len(results)

    for name, p in results:
        status = "PASS" if p else "FAIL"
        print(f" [{status}] {name}")

    print("-"*70)
    print(f" Total: {passed}/{total} tests passed")

    if passed == total:
        print("\n ALL TESTS PASSED!")
        return 0
    else:
        print(f"\n {total - passed} TESTS FAILED!")
        return 1


if __name__ == "__main__":
    exit_code = asyncio.run(run_all_tests())
    sys.exit(exit_code)