feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality: - Add capture_network_requests and capture_console_messages config parameters - Add network_requests and console_messages fields to models - Implement Playwright event listeners to capture requests, responses, and console output - Create detailed documentation and examples - Add comprehensive tests This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
This commit is contained in:
185
tests/general/test_network_console_capture.py
Normal file
185
tests/general/test_network_console_capture.py
Normal file
@@ -0,0 +1,185 @@
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from aiohttp import web
|
||||
import tempfile
|
||||
import shutil
|
||||
import os, sys, time, json
|
||||
|
||||
|
||||
async def start_test_server():
|
||||
app = web.Application()
|
||||
|
||||
async def basic_page(request):
|
||||
return web.Response(text="""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Network Request Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Page for Network Capture</h1>
|
||||
<p>This page performs network requests and console logging.</p>
|
||||
<img src="/image.png" alt="Test Image">
|
||||
<script>
|
||||
console.log("Basic console log");
|
||||
console.error("Error message");
|
||||
console.warn("Warning message");
|
||||
|
||||
// Make some XHR requests
|
||||
const xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', '/api/data', true);
|
||||
xhr.send();
|
||||
|
||||
// Make a fetch request
|
||||
fetch('/api/json')
|
||||
.then(response => response.json())
|
||||
.catch(error => console.error('Fetch error:', error));
|
||||
|
||||
// Trigger an error
|
||||
setTimeout(() => {
|
||||
try {
|
||||
nonExistentFunction();
|
||||
} catch (e) {
|
||||
console.error("Caught error:", e);
|
||||
}
|
||||
}, 100);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
""", content_type="text/html")
|
||||
|
||||
async def image(request):
|
||||
# Return a small 1x1 transparent PNG
|
||||
return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png")
|
||||
|
||||
async def api_data(request):
|
||||
return web.Response(text="sample data")
|
||||
|
||||
async def api_json(request):
|
||||
return web.json_response({"status": "success", "message": "JSON data"})
|
||||
|
||||
# Register routes
|
||||
app.router.add_get('/', basic_page)
|
||||
app.router.add_get('/image.png', image)
|
||||
app.router.add_get('/api/data', api_data)
|
||||
app.router.add_get('/api/json', api_json)
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
site = web.TCPSite(runner, 'localhost', 8080)
|
||||
await site.start()
|
||||
|
||||
return runner
|
||||
|
||||
|
||||
async def test_network_console_capture():
|
||||
print("\n=== Testing Network and Console Capture ===\n")
|
||||
|
||||
# Start test server
|
||||
runner = await start_test_server()
|
||||
try:
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Test with capture disabled (default)
|
||||
print("\n1. Testing with capture disabled (default)...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is None, "Network requests should be None when capture is disabled"
|
||||
assert result.console_messages is None, "Console messages should be None when capture is disabled"
|
||||
print("✓ Default config correctly returns None for network_requests and console_messages")
|
||||
|
||||
# Test with network capture enabled
|
||||
print("\n2. Testing with network capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_network_requests=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is not None, "Network requests should be captured"
|
||||
print(f"✓ Captured {len(result.network_requests)} network requests")
|
||||
|
||||
# Check if we have both requests and responses
|
||||
request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
|
||||
response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
|
||||
print(f" - {request_count} requests, {response_count} responses")
|
||||
|
||||
# Check if we captured specific resources
|
||||
urls = [r.get("url") for r in result.network_requests]
|
||||
has_image = any("/image.png" in url for url in urls)
|
||||
has_api_data = any("/api/data" in url for url in urls)
|
||||
has_api_json = any("/api/json" in url for url in urls)
|
||||
|
||||
assert has_image, "Should have captured image request"
|
||||
assert has_api_data, "Should have captured API data request"
|
||||
assert has_api_json, "Should have captured API JSON request"
|
||||
print("✓ Captured expected network requests (image, API endpoints)")
|
||||
|
||||
# Test with console capture enabled
|
||||
print("\n3. Testing with console capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_console_messages=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.console_messages is not None, "Console messages should be captured"
|
||||
print(f"✓ Captured {len(result.console_messages)} console messages")
|
||||
|
||||
# Check if we have different types of console messages
|
||||
message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
|
||||
print(f" - Message types: {', '.join(message_types)}")
|
||||
|
||||
# Print all captured messages for debugging
|
||||
print(" - Captured messages:")
|
||||
for msg in result.console_messages:
|
||||
print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")
|
||||
|
||||
# Look for specific messages
|
||||
messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
|
||||
has_basic_log = any("Basic console log" in msg for msg in messages)
|
||||
has_error_msg = any("Error message" in msg for msg in messages)
|
||||
has_warning_msg = any("Warning message" in msg for msg in messages)
|
||||
|
||||
assert has_basic_log, "Should have captured basic console.log message"
|
||||
assert has_error_msg, "Should have captured console.error message"
|
||||
assert has_warning_msg, "Should have captured console.warn message"
|
||||
print("✓ Captured expected console messages (log, error, warning)")
|
||||
|
||||
# Test with both captures enabled
|
||||
print("\n4. Testing with both network and console capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_network_requests=True,
|
||||
capture_console_messages=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is not None, "Network requests should be captured"
|
||||
assert result.console_messages is not None, "Console messages should be captured"
|
||||
print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
|
||||
|
||||
finally:
|
||||
await runner.cleanup()
|
||||
print("\nTest server shutdown")
|
||||
|
||||
|
||||
async def main():
|
||||
try:
|
||||
await test_network_console_capture()
|
||||
print("\n✅ All tests passed successfully!")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user