feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:

- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
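A minimal usage sketch of the new flags, based only on the parameter and field names listed above (capture_network_requests, capture_console_messages, network_requests, console_messages); the exact shape of each captured entry is an assumption:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    # Enable the two capture flags added by this commit.
    run_config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        # network_requests / console_messages are the new result fields;
        # per the commit message they are populated by Playwright
        # page.on("request"), page.on("response") and page.on("console") listeners.
        print(f"{len(result.network_requests or [])} network events captured")
        print(f"{len(result.console_messages or [])} console messages captured")

if __name__ == "__main__":
    asyncio.run(main())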
tests/general/test_async_webcrawler.py | 149 lines (new file)
@@ -0,0 +1,149 @@
import asyncio
import pytest
from typing import List

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
    RateLimiter,
    CacheMode
)

@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
    (800, 600),
    (1024, 768),
    (1920, 1080)
])
async def test_viewport_config(viewport):
    """Test different viewport configurations"""
    width, height = viewport
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=width,
        viewport_height=height
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                # cache_mode=CacheMode.BYPASS,
                page_timeout=30000  # 30 seconds
            )
        )
        assert result.success

@pytest.mark.asyncio
async def test_memory_management():
    """Test memory-adaptive dispatching"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1024,
        viewport_height=768
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=5
    )

    urls = ["https://example.com"] * 3  # Test with multiple identical URLs

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher
        )
        assert len(results) == len(urls)

@pytest.mark.asyncio
async def test_rate_limiting():
    """Test rate limiting functionality"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True
    )

    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=RateLimiter(
            base_delay=(1.0, 2.0),
            max_delay=5.0,
            max_retries=2
        ),
        memory_threshold_percent=70.0
    )

    urls = [
        "https://example.com",
        "https://example.org",
        "https://example.net"
    ]

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher
        )
        assert len(results) == len(urls)

@pytest.mark.asyncio
async def test_javascript_execution():
    """Test JavaScript execution capabilities"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        java_script_enabled=True
    )

    js_code = """
    document.body.style.backgroundColor = 'red';
    return document.body.style.backgroundColor;
    """

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                js_code=js_code,
                page_timeout=30000
            )
        )
        assert result.success

@pytest.mark.asyncio
@pytest.mark.parametrize("error_url", [
    "https://invalid.domain.test",
    "https://httpbin.org/status/404",
    "https://httpbin.org/status/503",
    "https://httpbin.org/status/403"
])
async def test_error_handling(error_url):
    """Test error handling for various failure scenarios"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=error_url,
            config=CrawlerRunConfig(
                page_timeout=10000,  # Short timeout for error cases
                cache_mode=CacheMode.BYPASS
            )
        )
        assert not result.success
        assert result.error_message is not None

if __name__ == "__main__":
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    asyncio.run(test_javascript_execution())