- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig - Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern - Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching - Update install.py to install both playwright and patchright browsers automatically - Add comprehensive documentation for anti-bot features (stealth mode + undetected browser) - Create examples demonstrating stealth mode usage and comparison tests - Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies - Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium) - Add dependency checker tool in tests/check_dependencies.py Breaking changes: None - all existing functionality preserved 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
522 lines
19 KiB
Python
522 lines
19 KiB
Python
"""
|
|
Stealth Mode Example with Crawl4AI
|
|
|
|
This example demonstrates how to use the stealth mode feature to bypass basic bot detection.
|
|
The stealth mode uses playwright-stealth to modify browser fingerprints and behaviors
|
|
that are commonly used to detect automated browsers.
|
|
|
|
Key features demonstrated:
|
|
1. Comparing crawling with and without stealth mode
|
|
2. Testing against bot detection sites
|
|
3. Accessing sites that block automated browsers
|
|
4. Best practices for stealth crawling
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from typing import Dict, Any
|
|
from colorama import Fore, Style, init
|
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.async_logger import AsyncLogger
|
|
|
|
# Initialize colorama for colored output
|
|
init()
|
|
|
|
# Create a logger for better output
|
|
logger = AsyncLogger(verbose=True)
|
|
|
|
|
|
async def test_bot_detection(use_stealth: bool = False) -> Dict[str, Any]:
|
|
"""Test against a bot detection service"""
|
|
|
|
logger.info(
|
|
f"Testing bot detection with stealth={'ON' if use_stealth else 'OFF'}",
|
|
tag="STEALTH"
|
|
)
|
|
|
|
# Configure browser with or without stealth
|
|
browser_config = BrowserConfig(
|
|
headless=False, # Use False to see the browser in action
|
|
enable_stealth=use_stealth,
|
|
viewport_width=1280,
|
|
viewport_height=800
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
# JavaScript to extract bot detection results
|
|
detection_script = """
|
|
// Comprehensive bot detection checks
|
|
(() => {
|
|
const detectionResults = {
|
|
// Basic WebDriver detection
|
|
webdriver: navigator.webdriver,
|
|
|
|
// Chrome specific
|
|
chrome: !!window.chrome,
|
|
chromeRuntime: !!window.chrome?.runtime,
|
|
|
|
// Automation indicators
|
|
automationControlled: navigator.webdriver,
|
|
|
|
// Permissions API
|
|
permissionsPresent: !!navigator.permissions?.query,
|
|
|
|
// Plugins
|
|
pluginsLength: navigator.plugins.length,
|
|
pluginsArray: Array.from(navigator.plugins).map(p => p.name),
|
|
|
|
// Languages
|
|
languages: navigator.languages,
|
|
language: navigator.language,
|
|
|
|
// User agent
|
|
userAgent: navigator.userAgent,
|
|
|
|
// Screen and window properties
|
|
screen: {
|
|
width: screen.width,
|
|
height: screen.height,
|
|
availWidth: screen.availWidth,
|
|
availHeight: screen.availHeight,
|
|
colorDepth: screen.colorDepth,
|
|
pixelDepth: screen.pixelDepth
|
|
},
|
|
|
|
// WebGL vendor
|
|
webglVendor: (() => {
|
|
try {
|
|
const canvas = document.createElement('canvas');
|
|
const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
|
|
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
|
return gl.getParameter(ext.UNMASKED_VENDOR_WEBGL);
|
|
} catch (e) {
|
|
return 'Error';
|
|
}
|
|
})(),
|
|
|
|
// Platform
|
|
platform: navigator.platform,
|
|
|
|
// Hardware concurrency
|
|
hardwareConcurrency: navigator.hardwareConcurrency,
|
|
|
|
// Device memory
|
|
deviceMemory: navigator.deviceMemory,
|
|
|
|
// Connection
|
|
connection: navigator.connection?.effectiveType
|
|
};
|
|
|
|
// Log results for console capture
|
|
console.log('DETECTION_RESULTS:', JSON.stringify(detectionResults, null, 2));
|
|
|
|
// Return results
|
|
return detectionResults;
|
|
})();
|
|
"""
|
|
|
|
# Crawl bot detection test page
|
|
config = CrawlerRunConfig(
|
|
js_code=detection_script,
|
|
capture_console_messages=True,
|
|
wait_until="networkidle",
|
|
delay_before_return_html=2.0 # Give time for all checks to complete
|
|
)
|
|
|
|
result = await crawler.arun(
|
|
url="https://bot.sannysoft.com",
|
|
config=config
|
|
)
|
|
|
|
if result.success:
|
|
# Extract detection results from console
|
|
detection_data = None
|
|
for msg in result.console_messages or []:
|
|
if "DETECTION_RESULTS:" in msg.get("text", ""):
|
|
try:
|
|
json_str = msg["text"].replace("DETECTION_RESULTS:", "").strip()
|
|
detection_data = json.loads(json_str)
|
|
except:
|
|
pass
|
|
|
|
# Also try to get from JavaScript execution result
|
|
if not detection_data and result.js_execution_result:
|
|
detection_data = result.js_execution_result
|
|
|
|
return {
|
|
"success": True,
|
|
"url": result.url,
|
|
"detection_data": detection_data,
|
|
"page_title": result.metadata.get("title", ""),
|
|
"stealth_enabled": use_stealth
|
|
}
|
|
else:
|
|
return {
|
|
"success": False,
|
|
"error": result.error_message,
|
|
"stealth_enabled": use_stealth
|
|
}
|
|
|
|
|
|
async def test_cloudflare_site(use_stealth: bool = False) -> Dict[str, Any]:
|
|
"""Test accessing a Cloudflare-protected site"""
|
|
|
|
logger.info(
|
|
f"Testing Cloudflare site with stealth={'ON' if use_stealth else 'OFF'}",
|
|
tag="STEALTH"
|
|
)
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=True, # Cloudflare detection works better in headless mode with stealth
|
|
enable_stealth=use_stealth,
|
|
viewport_width=1920,
|
|
viewport_height=1080
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
config = CrawlerRunConfig(
|
|
wait_until="networkidle",
|
|
page_timeout=30000, # 30 seconds
|
|
delay_before_return_html=3.0
|
|
)
|
|
|
|
# Test on a site that often shows Cloudflare challenges
|
|
result = await crawler.arun(
|
|
url="https://nowsecure.nl",
|
|
config=config
|
|
)
|
|
|
|
# Check if we hit Cloudflare challenge
|
|
cloudflare_detected = False
|
|
if result.html:
|
|
cloudflare_indicators = [
|
|
"Checking your browser",
|
|
"Just a moment",
|
|
"cf-browser-verification",
|
|
"cf-challenge",
|
|
"ray ID"
|
|
]
|
|
cloudflare_detected = any(indicator in result.html for indicator in cloudflare_indicators)
|
|
|
|
return {
|
|
"success": result.success,
|
|
"url": result.url,
|
|
"cloudflare_challenge": cloudflare_detected,
|
|
"status_code": result.status_code,
|
|
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
|
"stealth_enabled": use_stealth,
|
|
"html_snippet": result.html[:500] if result.html else ""
|
|
}
|
|
|
|
|
|
async def test_anti_bot_site(use_stealth: bool = False) -> Dict[str, Any]:
|
|
"""Test against sites with anti-bot measures"""
|
|
|
|
logger.info(
|
|
f"Testing anti-bot site with stealth={'ON' if use_stealth else 'OFF'}",
|
|
tag="STEALTH"
|
|
)
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=False,
|
|
enable_stealth=use_stealth,
|
|
# Additional browser arguments that help with stealth
|
|
extra_args=[
|
|
"--disable-blink-features=AutomationControlled",
|
|
"--disable-features=site-per-process"
|
|
] if not use_stealth else [] # These are automatically applied with stealth
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
# Some sites check for specific behaviors
|
|
behavior_script = """
|
|
(async () => {
|
|
// Simulate human-like behavior
|
|
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
|
|
|
|
// Random mouse movement
|
|
const moveX = Math.random() * 100;
|
|
const moveY = Math.random() * 100;
|
|
|
|
// Simulate reading time
|
|
await sleep(1000 + Math.random() * 2000);
|
|
|
|
// Scroll slightly
|
|
window.scrollBy(0, 100 + Math.random() * 200);
|
|
|
|
console.log('Human behavior simulation complete');
|
|
return true;
|
|
})()
|
|
"""
|
|
|
|
config = CrawlerRunConfig(
|
|
js_code=behavior_script,
|
|
wait_until="networkidle",
|
|
delay_before_return_html=5.0, # Longer delay to appear more human
|
|
capture_console_messages=True
|
|
)
|
|
|
|
# Test on a site that implements anti-bot measures
|
|
result = await crawler.arun(
|
|
url="https://www.g2.com/",
|
|
config=config
|
|
)
|
|
|
|
# Check for common anti-bot blocks
|
|
blocked_indicators = [
|
|
"Access Denied",
|
|
"403 Forbidden",
|
|
"Security Check",
|
|
"Verify you are human",
|
|
"captcha",
|
|
"challenge"
|
|
]
|
|
|
|
blocked = False
|
|
if result.html:
|
|
blocked = any(indicator.lower() in result.html.lower() for indicator in blocked_indicators)
|
|
|
|
return {
|
|
"success": result.success and not blocked,
|
|
"url": result.url,
|
|
"blocked": blocked,
|
|
"status_code": result.status_code,
|
|
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
|
"stealth_enabled": use_stealth
|
|
}
|
|
|
|
|
|
async def compare_results():
|
|
"""Run all tests with and without stealth mode and compare results"""
|
|
|
|
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}Crawl4AI Stealth Mode Comparison{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
|
|
|
# Test 1: Bot Detection
|
|
print(f"{Fore.YELLOW}1. Bot Detection Test (bot.sannysoft.com){Style.RESET_ALL}")
|
|
print("-" * 40)
|
|
|
|
# Without stealth
|
|
regular_detection = await test_bot_detection(use_stealth=False)
|
|
if regular_detection["success"] and regular_detection["detection_data"]:
|
|
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
|
data = regular_detection["detection_data"]
|
|
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
|
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
|
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
|
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
|
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
|
|
|
# With stealth
|
|
stealth_detection = await test_bot_detection(use_stealth=True)
|
|
if stealth_detection["success"] and stealth_detection["detection_data"]:
|
|
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
|
data = stealth_detection["detection_data"]
|
|
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
|
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
|
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
|
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
|
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
|
|
|
# Test 2: Cloudflare Site
|
|
print(f"\n\n{Fore.YELLOW}2. Cloudflare Protected Site Test{Style.RESET_ALL}")
|
|
print("-" * 40)
|
|
|
|
# Without stealth
|
|
regular_cf = await test_cloudflare_site(use_stealth=False)
|
|
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
|
print(f" • Success: {regular_cf['success']}")
|
|
print(f" • Cloudflare Challenge: {regular_cf['cloudflare_challenge']}")
|
|
print(f" • Status Code: {regular_cf['status_code']}")
|
|
print(f" • Page Title: {regular_cf['page_title']}")
|
|
|
|
# With stealth
|
|
stealth_cf = await test_cloudflare_site(use_stealth=True)
|
|
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
|
print(f" • Success: {stealth_cf['success']}")
|
|
print(f" • Cloudflare Challenge: {stealth_cf['cloudflare_challenge']}")
|
|
print(f" • Status Code: {stealth_cf['status_code']}")
|
|
print(f" • Page Title: {stealth_cf['page_title']}")
|
|
|
|
# Test 3: Anti-bot Site
|
|
print(f"\n\n{Fore.YELLOW}3. Anti-Bot Site Test{Style.RESET_ALL}")
|
|
print("-" * 40)
|
|
|
|
# Without stealth
|
|
regular_antibot = await test_anti_bot_site(use_stealth=False)
|
|
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
|
print(f" • Success: {regular_antibot['success']}")
|
|
print(f" • Blocked: {regular_antibot['blocked']}")
|
|
print(f" • Status Code: {regular_antibot['status_code']}")
|
|
print(f" • Page Title: {regular_antibot['page_title']}")
|
|
|
|
# With stealth
|
|
stealth_antibot = await test_anti_bot_site(use_stealth=True)
|
|
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
|
print(f" • Success: {stealth_antibot['success']}")
|
|
print(f" • Blocked: {stealth_antibot['blocked']}")
|
|
print(f" • Status Code: {stealth_antibot['status_code']}")
|
|
print(f" • Page Title: {stealth_antibot['page_title']}")
|
|
|
|
# Summary
|
|
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}Summary:{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
|
print(f"\nStealth mode helps bypass basic bot detection by:")
|
|
print(f" • Hiding webdriver property")
|
|
print(f" • Modifying browser fingerprints")
|
|
print(f" • Adjusting navigator properties")
|
|
print(f" • Emulating real browser plugin behavior")
|
|
print(f"\n{Fore.YELLOW}Note:{Style.RESET_ALL} Stealth mode is not a silver bullet.")
|
|
print(f"Advanced anti-bot systems may still detect automation.")
|
|
print(f"Always respect robots.txt and website terms of service.")
|
|
|
|
|
|
async def stealth_best_practices():
|
|
"""Demonstrate best practices for using stealth mode"""
|
|
|
|
print(f"\n\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}Stealth Mode Best Practices{Style.RESET_ALL}")
|
|
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
|
|
|
# Best Practice 1: Combine with realistic behavior
|
|
print(f"{Fore.YELLOW}1. Combine with Realistic Behavior:{Style.RESET_ALL}")
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=False,
|
|
enable_stealth=True,
|
|
viewport_width=1920,
|
|
viewport_height=1080
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
# Simulate human-like behavior
|
|
human_behavior_script = """
|
|
(async () => {
|
|
// Wait random time between actions
|
|
const randomWait = () => Math.random() * 2000 + 1000;
|
|
|
|
// Simulate reading
|
|
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
|
|
|
// Smooth scroll
|
|
const smoothScroll = async () => {
|
|
const totalHeight = document.body.scrollHeight;
|
|
const viewHeight = window.innerHeight;
|
|
let currentPosition = 0;
|
|
|
|
while (currentPosition < totalHeight - viewHeight) {
|
|
const scrollAmount = Math.random() * 300 + 100;
|
|
window.scrollBy({
|
|
top: scrollAmount,
|
|
behavior: 'smooth'
|
|
});
|
|
currentPosition += scrollAmount;
|
|
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
|
}
|
|
};
|
|
|
|
await smoothScroll();
|
|
console.log('Human-like behavior simulation completed');
|
|
return true;
|
|
})()
|
|
"""
|
|
|
|
config = CrawlerRunConfig(
|
|
js_code=human_behavior_script,
|
|
wait_until="networkidle",
|
|
delay_before_return_html=3.0,
|
|
capture_console_messages=True
|
|
)
|
|
|
|
result = await crawler.arun(
|
|
url="https://example.com",
|
|
config=config
|
|
)
|
|
|
|
print(f" ✓ Simulated human-like scrolling and reading patterns")
|
|
print(f" ✓ Added random delays between actions")
|
|
print(f" ✓ Result: {result.success}")
|
|
|
|
# Best Practice 2: Use appropriate viewport and user agent
|
|
print(f"\n{Fore.YELLOW}2. Use Realistic Viewport and User Agent:{Style.RESET_ALL}")
|
|
|
|
# Get a realistic user agent
|
|
from crawl4ai.user_agent_generator import UserAgentGenerator
|
|
ua_generator = UserAgentGenerator()
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
enable_stealth=True,
|
|
viewport_width=1920,
|
|
viewport_height=1080,
|
|
user_agent=ua_generator.generate(device_type="desktop", browser_type="chrome")
|
|
)
|
|
|
|
print(f" ✓ Using realistic viewport: 1920x1080")
|
|
print(f" ✓ Using current Chrome user agent")
|
|
print(f" ✓ Stealth mode will ensure consistency")
|
|
|
|
# Best Practice 3: Manage request rate
|
|
print(f"\n{Fore.YELLOW}3. Manage Request Rate:{Style.RESET_ALL}")
|
|
print(f" ✓ Add delays between requests")
|
|
print(f" ✓ Randomize timing patterns")
|
|
print(f" ✓ Respect robots.txt")
|
|
|
|
# Best Practice 4: Session management
|
|
print(f"\n{Fore.YELLOW}4. Use Session Management:{Style.RESET_ALL}")
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=False,
|
|
enable_stealth=True
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
# Create a session for multiple requests
|
|
session_id = "stealth_session_1"
|
|
|
|
config = CrawlerRunConfig(
|
|
session_id=session_id,
|
|
wait_until="domcontentloaded"
|
|
)
|
|
|
|
# First request
|
|
result1 = await crawler.arun(
|
|
url="https://example.com",
|
|
config=config
|
|
)
|
|
|
|
# Subsequent request reuses the same browser context
|
|
result2 = await crawler.arun(
|
|
url="https://example.com/about",
|
|
config=config
|
|
)
|
|
|
|
print(f" ✓ Reused browser session for multiple requests")
|
|
print(f" ✓ Maintains cookies and state between requests")
|
|
print(f" ✓ More efficient and realistic browsing pattern")
|
|
|
|
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
|
|
|
|
|
async def main():
|
|
"""Run all examples"""
|
|
|
|
# Run comparison tests
|
|
await compare_results()
|
|
|
|
# Show best practices
|
|
await stealth_best_practices()
|
|
|
|
print(f"\n{Fore.GREEN}Examples completed!{Style.RESET_ALL}")
|
|
print(f"\n{Fore.YELLOW}Remember:{Style.RESET_ALL}")
|
|
print(f"• Stealth mode helps with basic bot detection")
|
|
print(f"• Always respect website terms of service")
|
|
print(f"• Consider rate limiting and ethical scraping practices")
|
|
print(f"• For advanced protection, consider additional measures")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main()) |