feat: add stealth mode and enhance undetected browser support

- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig - Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern - Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching - Update install.py to install both playwright and patchright browsers automatically - Add comprehensive documentation for anti-bot features (stealth mode + undetected browser) - Create examples demonstrating stealth mode usage and comparison tests - Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies - Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium) - Add dependency checker tool in tests/check_dependencies.py Breaking changes: None - all existing functionality preserved 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-17 16:59:10 +08:00
parent 5c33cbcca2
commit 6a728cbe5b
27 changed files with 2833 additions and 460 deletions
--- a/docs/examples/undetectability/undetected_bot_test.py
+++ b/docs/examples/undetectability/undetected_bot_test.py
@@ -0,0 +1,155 @@
+"""
+Bot Detection Test - Compare Regular vs Undetected
+Tests browser fingerprinting differences at bot.sannysoft.com
+"""
+
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler, 
+    BrowserConfig, 
+    CrawlerRunConfig,
+    UndetectedAdapter,
+    CrawlResult
+)
+from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+
+# Bot detection test site
+TEST_URL = "https://bot.sannysoft.com"
+
+def analyze_bot_detection(result: CrawlResult) -> dict:
+    """Scan the crawled page's HTML for bot-detection keywords.
+
+    Matching is a page-wide substring search on the lowercased HTML, so a
+    hit means the keywords co-occur somewhere on the page, not in one row.
+    Returns boolean flags plus "failed_tests" (flags raised) and
+    "total_tests" (rough count of "test"/"check" occurrences); everything
+    stays False/0 when the crawl failed or returned no HTML. "user_agent"
+    is part of the report but is never raised by this function.
+    """
+    report = {
+        "webdriver": False,
+        "headless": False,
+        "automation": False,
+        "user_agent": False,
+        "total_tests": 0,
+        "failed_tests": 0,
+    }
+    if not (result.success and result.html):
+        return report
+
+    page = result.html.lower()
+    fail_or_true = "fail" in page or "true" in page
+    flagged = {
+        "webdriver": "webdriver" in page and fail_or_true,
+        "headless": "headless" in page and fail_or_true,
+        "automation": "automation" in page and "detected" in page,
+    }
+    for name, hit in flagged.items():
+        if hit:
+            report[name] = True
+            report["failed_tests"] += 1
+    report["total_tests"] = page.count("test") + page.count("check")
+    return report
+
+async def test_browser_mode(adapter_name: str, adapter=None):
+    """Crawl TEST_URL with one browser mode and print a detection report.
+
+    Args:
+        adapter_name: Label printed in the banner for this run.
+        adapter: Optional browser adapter; when truthy, the crawler runs on
+            an AsyncPlaywrightCrawlerStrategy built around it.
+
+    Returns:
+        A (result, detections) tuple; detections is the dict produced by
+        analyze_bot_detection, or {} when the crawl was not successful.
+    """
+    banner = "=" * 60
+    print(f"\n{banner}")
+    print(f"Testing: {adapter_name}")
+    print(banner)
+
+    browser_config = BrowserConfig(
+        headless=False,  # Run in headed mode for better results
+        verbose=True,
+        viewport_width=1920,
+        viewport_height=1080,
+    )
+    crawler_kwargs = {"config": browser_config}
+    if adapter:
+        # Undetected mode: route the crawl through the supplied adapter.
+        crawler_kwargs["crawler_strategy"] = AsyncPlaywrightCrawlerStrategy(
+            browser_config=browser_config,
+            browser_adapter=adapter,
+        )
+    crawler = AsyncWebCrawler(**crawler_kwargs)
+
+    async with crawler:
+        run_config = CrawlerRunConfig(
+            delay_before_return_html=3.0,  # Let detection scripts run
+            wait_for_images=True,
+            screenshot=True,
+            simulate_user=False,  # Don't simulate for accurate detection
+        )
+        result = await crawler.arun(url=TEST_URL, config=run_config)
+        print(f"\n✓ Success: {result.success}")
+        print(f"✓ Status Code: {result.status_code}")
+        if not result.success:
+            return result, {}
+
+        detections = analyze_bot_detection(result)
+        print("\n🔍 Bot Detection Analysis:")
+        for label in ("WebDriver", "Headless", "Automation"):
+            verdict = '❌ Yes' if detections[label.lower()] else '✅ No'
+            print(f"  - {label} Detected: {verdict}")
+        print(f"  - Failed Tests: {detections['failed_tests']}")
+
+        preview_source = result.markdown.raw_markdown
+        if preview_source:
+            print("\nContent preview:")
+            for row in preview_source.split('\n')[:20]:  # first 20 lines only
+                if any(word in row.lower() for word in ('test', 'pass', 'fail', 'yes', 'no')):
+                    print(f"  {row.strip()}")
+        return result, detections
+
+async def main():
+    """Run both browser modes against TEST_URL and print a comparison.
+
+    A crawl that fails yields an empty detections dict, so the final verdict
+    is only drawn when both runs produced data; otherwise a missing result
+    would be counted as "zero failed tests" and reported as a win.
+    """
+    print("🤖 Crawl4AI - Bot Detection Test")
+    print(f"Testing at: {TEST_URL}")
+    print("This site runs various browser fingerprinting tests\n")
+
+    # Regular browser first, then a short pause before the undetected run.
+    _, regular_detections = await test_browser_mode("Regular Browser")
+    await asyncio.sleep(2)
+    _, undetected_detections = await test_browser_mode("Undetected Browser", UndetectedAdapter())
+    have_both = bool(regular_detections and undetected_detections)
+
+    print(f"\n{'='*60}")
+    print("COMPARISON SUMMARY")
+    print(f"{'='*60}")
+    print(f"\n{'Test':<25} {'Regular':<15} {'Undetected':<15}")
+    print(f"{'-'*55}")
+    if have_both:
+        for label in ("WebDriver", "Headless", "Automation"):
+            key = label.lower()
+            title = f"{label} Detection"
+            regular_mark = '❌ Detected' if regular_detections[key] else '✅ Passed'
+            undetected_mark = '❌ Detected' if undetected_detections[key] else '✅ Passed'
+            print(f"{title:<25} {regular_mark:<15} {undetected_mark:<15}")
+        print(f"{'Failed Tests':<25} {regular_detections['failed_tests']:<15} {undetected_detections['failed_tests']:<15}")
+    print(f"\n{'='*60}")
+
+    # An empty dict means that crawl failed; never score it as a clean run.
+    if not have_both:
+        print("⚠️  At least one crawl failed; detection results cannot be compared")
+    elif undetected_detections['failed_tests'] < regular_detections['failed_tests']:
+        print("✅ Undetected browser performed better at evading detection!")
+    else:
+        print("ℹ️  Both browsers had similar detection results")
+
+if __name__ == "__main__":
+    asyncio.run(main())