feat(crawler): add network request and console message capturing

Implement comprehensive network request and console message capturing functionality:

- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00
parent a2061bf31e
commit 66ac07b4f3
31 changed files with 1686 additions and 10 deletions
--- a/tests/general/tets_robot.py
+++ b/tests/general/tets_robot.py
@@ -0,0 +1,62 @@
+import asyncio
+from crawl4ai import *
+
+async def test_real_websites():
+    """Crawl a fixed set of live URLs with robots.txt checking on and print each outcome.
+
+    Informational only: expected and actual verdicts are printed, never asserted.
+    """
+    print("\n=== Testing Real Website Robots.txt Compliance ===\n")
+
+    # Each entry is (url, expected); expected=True means the URL should be allowed.
+    test_cases = [
+        # Public sites that should be allowed
+        ("https://example.com", True),  # Simple public site
+        ("https://httpbin.org/get", True),  # API endpoint
+        # Sites with known strict robots.txt
+        ("https://www.facebook.com/robots.txt", False),  # Social media
+        ("https://www.google.com/search", False),  # Search pages
+        # Edge cases
+        ("https://api.github.com", True),  # API service
+        ("https://raw.githubusercontent.com", True),  # Content delivery
+        # Non-existent/error cases
+        ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
+        ("https://localhost:12345", True),  # Invalid port
+    ]
+
+    # A single crawler instance (headless, verbose) handles every URL above.
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=True)) as crawler:
+        for url, expected in test_cases:
+            print(f"\nTesting: {url}")
+            try:
+                # Fresh run config per URL: cache bypassed, robots.txt checking enabled.
+                run_config = CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS, check_robots_txt=True, verbose=True
+                )
+                result = await crawler.arun(url=url, config=run_config)
+
+                # "Allowed" means the crawl succeeded and carries no error message.
+                blocked = not result.success or result.error_message
+                print(f"Expected: {'allowed' if expected else 'denied'}")
+                print(f"Actual: {'denied' if blocked else 'allowed'}")
+                print(f"Status Code: {result.status_code}")
+                if result.error_message:
+                    print(f"Error: {result.error_message}")
+
+                # Optional: print the robots.txt content when the metadata includes it.
+                metadata = result.metadata
+                if metadata and 'robots_txt' in metadata:
+                    print(f"Robots.txt rules:\n{metadata['robots_txt']}")
+            except Exception as exc:
+                # Best-effort run: report the failure and continue with the next URL.
+                print(f"Test failed with error: {exc!s}")
+
+async def main():  # entry point: run the live-site checks; report and re-raise any failure
+    try:
+        await test_real_websites()
+    except Exception as exc:  # announce on stdout, then re-raise so nothing is swallowed
+        print(f"Test suite failed: {exc!s}")
+        raise
+
+if __name__ == "__main__":  # only when this file is executed directly as a script
+    asyncio.run(main())