- Created standalone smoke test script for quick validation - Added detailed CHANGES_CDP_CONCURRENCY.md documentation - Documented all fixes, testing approach, and migration guide - Smoke test can run without pytest for easy verification Co-authored-by: Ahmed-Tawfik94 <106467151+Ahmed-Tawfik94@users.noreply.github.com>
166 lines
5.3 KiB
Python
Executable File
166 lines
5.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Simple smoke test for CDP concurrency fixes.
|
|
This can be run without pytest to quickly validate the changes.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
|
|
# Add the project root to Python path
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
|
|
|
|
async def test_basic_cdp():
    """Crawl a single page through a managed (CDP) browser.

    Opens an ``AsyncWebCrawler`` configured with ``use_managed_browser=True``
    in headless mode, fetches https://example.com with the cache bypassed,
    and checks that the crawl reports success and returned non-empty HTML.

    Returns:
        bool: True when both checks pass. False when a check fails or any
        exception is raised while crawling; the reason is printed.
    """
    print("Test 1: Basic CDP browser test...")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )

            # Raise explicitly rather than using ``assert``: assert statements
            # are removed when Python runs with -O, which would let this
            # smoke test report success without checking anything.
            if not result.success:
                raise AssertionError(f"Failed: {result.error_message}")
            # Truthiness also covers a missing (None) body, not just "".
            if not result.html:
                raise AssertionError("Empty HTML")

            print(" ✓ Basic CDP test passed")
            return True
    except Exception as e:
        # Check failures above land here too, so there is one failure path.
        print(f" ✗ Basic CDP test failed: {e}")
        return False
|
|
|
|
|
|
async def test_arun_many_cdp():
    """Crawl several URLs with one ``arun_many`` call on a managed (CDP) browser.

    The original author describes this as the key concurrency fix being
    validated. The test passes when ``arun_many`` returns one result per
    requested URL and at least 80% of those results report success.

    Returns:
        bool: True when the success threshold is met. False when too many
        crawls fail, the result count is wrong, or an exception is raised;
        in the exception case the traceback is printed as well.
    """
    print("\nTest 2: arun_many with CDP browser...")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False,
    )

    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://www.example.org",
    ]

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            results = await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )

            # Raise explicitly rather than using ``assert``: assert statements
            # are removed when Python runs with -O, which would silently skip
            # this count check.
            if len(results) != len(urls):
                raise AssertionError(
                    f"Expected {len(urls)} results, got {len(results)}"
                )

            success_count = sum(1 for r in results if r.success)
            print(f" ✓ Crawled {success_count}/{len(urls)} URLs successfully")

            # Allow 20% failure for network issues.
            # NOTE: with the three URLs above the threshold is 3 * 0.8 = 2.4,
            # so in practice all three must succeed; the allowance only has
            # an effect for longer URL lists.
            if success_count >= len(urls) * 0.8:
                print(" ✓ arun_many CDP test passed")
                return True

            print(f" ✗ Too many failures: {len(urls) - success_count}/{len(urls)}")
            return False
    except Exception as e:
        print(f" ✗ arun_many CDP test failed: {e}")
        # Imported here because the module does not import traceback at
        # file level; it is only needed on the failure path.
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
|
|
async def test_concurrent_arun_many():
    """Run two ``arun_many`` calls concurrently on one crawler.

    The original author describes this as a stress test for the page lock.
    Both calls share a single ``AsyncWebCrawler`` backed by a managed (CDP)
    browser and are awaited together with ``asyncio.gather``. The test
    passes when neither call raises and at least 70% of all requests across
    both calls report success (with the four URLs used here, at least three).

    Returns:
        bool: True when the success threshold is met. False when either call
        raised, too many requests failed, or an unexpected exception
        occurred; in the last case the traceback is printed as well.
    """
    print("\nTest 3: Concurrent arun_many calls...")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        verbose=False,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Create both awaitables first, then run them concurrently.
            task1 = crawler.arun_many(
                urls=["https://example.com", "https://httpbin.org/html"],
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            task2 = crawler.arun_many(
                urls=["https://www.example.org", "https://example.com"],
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )

            # return_exceptions=True hands failures back as values, so one
            # failing call does not hide the outcome of the other.
            outcomes = await asyncio.gather(task1, task2, return_exceptions=True)

            # Check against BaseException, not Exception: a cancelled call is
            # returned as asyncio.CancelledError, which has derived from
            # BaseException since Python 3.8 and would otherwise slip past
            # this check and fail later with a confusing "not iterable" error.
            for task_number, outcome in enumerate(outcomes, start=1):
                if isinstance(outcome, BaseException):
                    print(f" ✗ Task {task_number} raised exception: {outcome}")
                    return False

            total_success = sum(
                1 for batch in outcomes for r in batch if r.success
            )
            total_requests = sum(len(batch) for batch in outcomes)

            print(f" ✓ {total_success}/{total_requests} concurrent requests succeeded")

            # Allow 30% failure for concurrent stress.
            if total_success >= total_requests * 0.7:
                print(" ✓ Concurrent arun_many test passed")
                return True

            print(" ✗ Too many concurrent failures")
            return False
    except Exception as e:
        print(f" ✗ Concurrent test failed: {e}")
        # Imported here because the module does not import traceback at
        # file level; it is only needed on the failure path.
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
|
|
async def main():
    """Execute every smoke test in order and summarise the outcome.

    Returns:
        int: 0 when all tests returned True, otherwise 1. The value is
        intended to be used as the process exit code.
    """
    banner = "=" * 60

    print(banner)
    print("CDP Concurrency Smoke Tests")
    print(banner)

    # The awaits inside the list literal are evaluated left to right, so the
    # tests still run strictly one after another.
    outcomes = [
        await test_basic_cdp(),
        await test_arun_many_cdp(),
        await test_concurrent_arun_many(),
    ]

    print("\n" + banner)

    total = len(outcomes)
    # Each outcome is a bool, so summing counts the True values.
    passed = sum(outcomes)

    if passed == total:
        print(f"✓ All {total} smoke tests passed!")
        exit_code = 0
    else:
        print(f"✗ {total - passed}/{total} smoke tests failed")
        exit_code = 1

    print(banner)
    return exit_code
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the async entry point and use its return value (0 on success,
    # 1 on failure) as the process exit status.
    sys.exit(asyncio.run(main()))
|