feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality: - Add capture_network_requests and capture_console_messages config parameters - Add network_requests and console_messages fields to models - Implement Playwright event listeners to capture requests, responses, and console output - Create detailed documentation and examples - Add comprehensive tests This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
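For reference, a minimal usage sketch of the new options. The config parameter names (capture_network_requests, capture_console_messages) and result field names (network_requests, console_messages) come from the commit message; the surrounding AsyncWebCrawler call pattern is assumed context, not part of this diff:

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # New config parameters introduced by this commit
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        # New result fields introduced by this commit
        print(f"Network events: {len(result.network_requests or [])}")
        print(f"Console messages: {len(result.console_messages or [])}")

asyncio.run(main())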
tests/general/test_http_crawler_strategy.py (new file, 116 lines)
@@ -0,0 +1,116 @@
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy, ConnectionTimeoutError
from crawl4ai.async_logger import AsyncLogger
from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
from tempfile import NamedTemporaryFile
import asyncio
import os

async def main():
    """Test the AsyncHTTPCrawlerStrategy with various scenarios"""
    logger = AsyncLogger(verbose=True)

    # Initialize the strategy with default HTTPCrawlerConfig
    crawler = AsyncHTTPCrawlerStrategy(
        browser_config=HTTPCrawlerConfig(),
        logger=logger
    )

    # Test 1: Basic HTTP GET
    print("\n=== Test 1: Basic HTTP GET ===")
    result = await crawler.crawl("https://example.com")
    print(f"Status: {result.status_code}")
    print(f"Content length: {len(result.html)}")
    print(f"Headers: {dict(result.response_headers)}")

    # Test 2: POST request with JSON
    print("\n=== Test 2: POST with JSON ===")
    crawler.browser_config = crawler.browser_config.clone(
        method="POST",
        json={"test": "data"},
        headers={"Content-Type": "application/json"}
    )
    try:
        result = await crawler.crawl("https://httpbin.org/post")
        print(f"Status: {result.status_code}")
        print(f"Response: {result.html[:200]}...")
    except Exception as e:
        print(f"Error: {e}")

    # Test 3: File handling
    crawler.browser_config = HTTPCrawlerConfig()
    print("\n=== Test 3: Local file handling ===")
    # Create a temp file with test content; the with-block closes it
    # before the crawler reads it back
    with NamedTemporaryFile(delete=False) as f:
        f.write(b"<html><body>Test content</body></html>")
    result = await crawler.crawl(f"file://{f.name}")
    print(f"File content: {result.html}")
    os.remove(f.name)  # clean up the temp file

    # Test 4: Raw content
    print("\n=== Test 4: Raw content handling ===")
    raw_html = "raw://<html><body>Raw test content</body></html>"
    result = await crawler.crawl(raw_html)
    print(f"Raw content: {result.html}")

    # Test 5: Custom hooks
    print("\n=== Test 5: Custom hooks ===")
    async def before_request(url, kwargs):
        print(f"Before request to {url}")
        # Guard against a missing 'headers' key before mutating it
        kwargs.setdefault('headers', {})['X-Custom'] = 'test'

    async def after_request(response):
        print(f"After request, status: {response.status_code}")

    crawler.set_hook('before_request', before_request)
    crawler.set_hook('after_request', after_request)
    result = await crawler.crawl("https://example.com")

    # Test 6: Error handling
    print("\n=== Test 6: Error handling ===")
    try:
        await crawler.crawl("https://nonexistent.domain.test")
    except Exception as e:
        print(f"Expected error: {e}")

    # Test 7: Redirects
    print("\n=== Test 7: Redirect handling ===")
    crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
    result = await crawler.crawl("http://httpbin.org/redirect/1")
    print(f"Final URL: {result.redirected_url}")

    # Test 8: Custom timeout
    print("\n=== Test 8: Custom timeout ===")
    try:
        await crawler.crawl(
            "https://httpbin.org/delay/5",
            config=CrawlerRunConfig(page_timeout=2)
        )
    except ConnectionTimeoutError as e:
        print(f"Expected timeout: {e}")

    # Test 9: SSL verification
    print("\n=== Test 9: SSL verification ===")
    crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
    try:
        await crawler.crawl("https://expired.badssl.com/")
        print("Connected to invalid SSL site with verification disabled")
    except Exception as e:
        print(f"SSL error: {e}")

    # Test 10: Large file streaming
    print("\n=== Test 10: Large file streaming ===")
    with NamedTemporaryFile(delete=False) as f:
        # ~10 MB payload to exercise large-file handling
        f.write(b"<html><body>" + b"X" * 1024 * 1024 * 10 + b"</body></html>")
    result = await crawler.crawl("file://" + f.name)
    print(f"Large file content length: {len(result.html)}")
    os.remove(f.name)

    await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())
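The commit message also describes Playwright event listeners that capture requests, responses, and console output. As a rough, self-contained sketch of that listener pattern using Playwright's public page.on API (the handler wiring and recorded fields here are illustrative, not the project's actual implementation):

import asyncio
from playwright.async_api import async_playwright

async def capture(url: str):
    network_requests, console_messages = [], []
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # Record outgoing requests and their responses
        page.on("request", lambda req: network_requests.append(
            {"event": "request", "url": req.url, "method": req.method}))
        page.on("response", lambda res: network_requests.append(
            {"event": "response", "url": res.url, "status": res.status}))
        # Record console output (log, warning, error, ...)
        page.on("console", lambda msg: console_messages.append(
            {"type": msg.type, "text": msg.text}))
        await page.goto(url)
        await browser.close()
    return network_requests, console_messages

if __name__ == "__main__":
    reqs, logs = asyncio.run(capture("https://example.com"))
    print(f"{len(reqs)} network events, {len(logs)} console messages")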