Merge branch 'feat/undetected-browser' into develop-future
This commit is contained in:
@@ -3,8 +3,8 @@ C4A-Script API Usage Examples
|
||||
Shows how to use the new Result-based API in various scenarios
|
||||
"""
|
||||
|
||||
from c4a_compile import compile, validate, compile_file
|
||||
from c4a_result import CompilationResult, ValidationResult
|
||||
from crawl4ai.script.c4a_compile import compile, validate, compile_file
|
||||
from crawl4ai.script.c4a_result import CompilationResult, ValidationResult
|
||||
import json
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World
|
||||
A concise example showing how to use the C4A-Script compiler
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define your C4A-Script
|
||||
script = """
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World - Error Example
|
||||
Shows how error handling works
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define a script with an error (missing THEN)
|
||||
script = """
|
||||
|
||||
57
docs/examples/hello_world_undetected.py
Normal file
57
docs/examples/hello_world_undetected.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
CrawlResult,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Enable console capture to test adapter
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
59
docs/examples/simple_anti_bot_examples.py
Normal file
59
docs/examples/simple_anti_bot_examples.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Example 1: Stealth Mode
|
||||
async def stealth_mode_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 2: Undetected Browser
|
||||
async def undetected_browser_example():
|
||||
browser_config = BrowserConfig(
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 3: Both Combined
|
||||
async def combined_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Run examples
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(stealth_mode_example())
|
||||
asyncio.run(undetected_browser_example())
|
||||
asyncio.run(combined_example())
|
||||
522
docs/examples/stealth_mode_example.py
Normal file
522
docs/examples/stealth_mode_example.py
Normal file
@@ -0,0 +1,522 @@
|
||||
"""
|
||||
Stealth Mode Example with Crawl4AI
|
||||
|
||||
This example demonstrates how to use the stealth mode feature to bypass basic bot detection.
|
||||
The stealth mode uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
that are commonly used to detect automated browsers.
|
||||
|
||||
Key features demonstrated:
|
||||
1. Comparing crawling with and without stealth mode
|
||||
2. Testing against bot detection sites
|
||||
3. Accessing sites that block automated browsers
|
||||
4. Best practices for stealth crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Initialize colorama for colored output
|
||||
init()
|
||||
|
||||
# Create a logger for better output
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
|
||||
async def test_bot_detection(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against a bot detection service"""
|
||||
|
||||
logger.info(
|
||||
f"Testing bot detection with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
# Configure browser with or without stealth
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Use False to see the browser in action
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# JavaScript to extract bot detection results
|
||||
detection_script = """
|
||||
// Comprehensive bot detection checks
|
||||
(() => {
|
||||
const detectionResults = {
|
||||
// Basic WebDriver detection
|
||||
webdriver: navigator.webdriver,
|
||||
|
||||
// Chrome specific
|
||||
chrome: !!window.chrome,
|
||||
chromeRuntime: !!window.chrome?.runtime,
|
||||
|
||||
// Automation indicators
|
||||
automationControlled: navigator.webdriver,
|
||||
|
||||
// Permissions API
|
||||
permissionsPresent: !!navigator.permissions?.query,
|
||||
|
||||
// Plugins
|
||||
pluginsLength: navigator.plugins.length,
|
||||
pluginsArray: Array.from(navigator.plugins).map(p => p.name),
|
||||
|
||||
// Languages
|
||||
languages: navigator.languages,
|
||||
language: navigator.language,
|
||||
|
||||
// User agent
|
||||
userAgent: navigator.userAgent,
|
||||
|
||||
// Screen and window properties
|
||||
screen: {
|
||||
width: screen.width,
|
||||
height: screen.height,
|
||||
availWidth: screen.availWidth,
|
||||
availHeight: screen.availHeight,
|
||||
colorDepth: screen.colorDepth,
|
||||
pixelDepth: screen.pixelDepth
|
||||
},
|
||||
|
||||
// WebGL vendor
|
||||
webglVendor: (() => {
|
||||
try {
|
||||
const canvas = document.createElement('canvas');
|
||||
const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
|
||||
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
||||
return gl.getParameter(ext.UNMASKED_VENDOR_WEBGL);
|
||||
} catch (e) {
|
||||
return 'Error';
|
||||
}
|
||||
})(),
|
||||
|
||||
// Platform
|
||||
platform: navigator.platform,
|
||||
|
||||
// Hardware concurrency
|
||||
hardwareConcurrency: navigator.hardwareConcurrency,
|
||||
|
||||
// Device memory
|
||||
deviceMemory: navigator.deviceMemory,
|
||||
|
||||
// Connection
|
||||
connection: navigator.connection?.effectiveType
|
||||
};
|
||||
|
||||
// Log results for console capture
|
||||
console.log('DETECTION_RESULTS:', JSON.stringify(detectionResults, null, 2));
|
||||
|
||||
// Return results
|
||||
return detectionResults;
|
||||
})();
|
||||
"""
|
||||
|
||||
# Crawl bot detection test page
|
||||
config = CrawlerRunConfig(
|
||||
js_code=detection_script,
|
||||
capture_console_messages=True,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0 # Give time for all checks to complete
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Extract detection results from console
|
||||
detection_data = None
|
||||
for msg in result.console_messages or []:
|
||||
if "DETECTION_RESULTS:" in msg.get("text", ""):
|
||||
try:
|
||||
json_str = msg["text"].replace("DETECTION_RESULTS:", "").strip()
|
||||
detection_data = json.loads(json_str)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Also try to get from JavaScript execution result
|
||||
if not detection_data and result.js_execution_result:
|
||||
detection_data = result.js_execution_result
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"url": result.url,
|
||||
"detection_data": detection_data,
|
||||
"page_title": result.metadata.get("title", ""),
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"error": result.error_message,
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def test_cloudflare_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test accessing a Cloudflare-protected site"""
|
||||
|
||||
logger.info(
|
||||
f"Testing Cloudflare site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Cloudflare detection works better in headless mode with stealth
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
page_timeout=30000, # 30 seconds
|
||||
delay_before_return_html=3.0
|
||||
)
|
||||
|
||||
# Test on a site that often shows Cloudflare challenges
|
||||
result = await crawler.arun(
|
||||
url="https://nowsecure.nl",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check if we hit Cloudflare challenge
|
||||
cloudflare_detected = False
|
||||
if result.html:
|
||||
cloudflare_indicators = [
|
||||
"Checking your browser",
|
||||
"Just a moment",
|
||||
"cf-browser-verification",
|
||||
"cf-challenge",
|
||||
"ray ID"
|
||||
]
|
||||
cloudflare_detected = any(indicator in result.html for indicator in cloudflare_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success,
|
||||
"url": result.url,
|
||||
"cloudflare_challenge": cloudflare_detected,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth,
|
||||
"html_snippet": result.html[:500] if result.html else ""
|
||||
}
|
||||
|
||||
|
||||
async def test_anti_bot_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against sites with anti-bot measures"""
|
||||
|
||||
logger.info(
|
||||
f"Testing anti-bot site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=use_stealth,
|
||||
# Additional browser arguments that help with stealth
|
||||
extra_args=[
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-features=site-per-process"
|
||||
] if not use_stealth else [] # These are automatically applied with stealth
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Some sites check for specific behaviors
|
||||
behavior_script = """
|
||||
(async () => {
|
||||
// Simulate human-like behavior
|
||||
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
|
||||
|
||||
// Random mouse movement
|
||||
const moveX = Math.random() * 100;
|
||||
const moveY = Math.random() * 100;
|
||||
|
||||
// Simulate reading time
|
||||
await sleep(1000 + Math.random() * 2000);
|
||||
|
||||
// Scroll slightly
|
||||
window.scrollBy(0, 100 + Math.random() * 200);
|
||||
|
||||
console.log('Human behavior simulation complete');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=5.0, # Longer delay to appear more human
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
# Test on a site that implements anti-bot measures
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check for common anti-bot blocks
|
||||
blocked_indicators = [
|
||||
"Access Denied",
|
||||
"403 Forbidden",
|
||||
"Security Check",
|
||||
"Verify you are human",
|
||||
"captcha",
|
||||
"challenge"
|
||||
]
|
||||
|
||||
blocked = False
|
||||
if result.html:
|
||||
blocked = any(indicator.lower() in result.html.lower() for indicator in blocked_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success and not blocked,
|
||||
"url": result.url,
|
||||
"blocked": blocked,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def compare_results():
|
||||
"""Run all tests with and without stealth mode and compare results"""
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Crawl4AI Stealth Mode Comparison{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Test 1: Bot Detection
|
||||
print(f"{Fore.YELLOW}1. Bot Detection Test (bot.sannysoft.com){Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_detection = await test_bot_detection(use_stealth=False)
|
||||
if regular_detection["success"] and regular_detection["detection_data"]:
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
data = regular_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# With stealth
|
||||
stealth_detection = await test_bot_detection(use_stealth=True)
|
||||
if stealth_detection["success"] and stealth_detection["detection_data"]:
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
data = stealth_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# Test 2: Cloudflare Site
|
||||
print(f"\n\n{Fore.YELLOW}2. Cloudflare Protected Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_cf = await test_cloudflare_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {regular_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {regular_cf['status_code']}")
|
||||
print(f" • Page Title: {regular_cf['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_cf = await test_cloudflare_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {stealth_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {stealth_cf['status_code']}")
|
||||
print(f" • Page Title: {stealth_cf['page_title']}")
|
||||
|
||||
# Test 3: Anti-bot Site
|
||||
print(f"\n\n{Fore.YELLOW}3. Anti-Bot Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_antibot = await test_anti_bot_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_antibot['success']}")
|
||||
print(f" • Blocked: {regular_antibot['blocked']}")
|
||||
print(f" • Status Code: {regular_antibot['status_code']}")
|
||||
print(f" • Page Title: {regular_antibot['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_antibot = await test_anti_bot_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_antibot['success']}")
|
||||
print(f" • Blocked: {stealth_antibot['blocked']}")
|
||||
print(f" • Status Code: {stealth_antibot['status_code']}")
|
||||
print(f" • Page Title: {stealth_antibot['page_title']}")
|
||||
|
||||
# Summary
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Summary:{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"\nStealth mode helps bypass basic bot detection by:")
|
||||
print(f" • Hiding webdriver property")
|
||||
print(f" • Modifying browser fingerprints")
|
||||
print(f" • Adjusting navigator properties")
|
||||
print(f" • Emulating real browser plugin behavior")
|
||||
print(f"\n{Fore.YELLOW}Note:{Style.RESET_ALL} Stealth mode is not a silver bullet.")
|
||||
print(f"Advanced anti-bot systems may still detect automation.")
|
||||
print(f"Always respect robots.txt and website terms of service.")
|
||||
|
||||
|
||||
async def stealth_best_practices():
|
||||
"""Demonstrate best practices for using stealth mode"""
|
||||
|
||||
print(f"\n\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Stealth Mode Best Practices{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Best Practice 1: Combine with realistic behavior
|
||||
print(f"{Fore.YELLOW}1. Combine with Realistic Behavior:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Simulate human-like behavior
|
||||
human_behavior_script = """
|
||||
(async () => {
|
||||
// Wait random time between actions
|
||||
const randomWait = () => Math.random() * 2000 + 1000;
|
||||
|
||||
// Simulate reading
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
|
||||
// Smooth scroll
|
||||
const smoothScroll = async () => {
|
||||
const totalHeight = document.body.scrollHeight;
|
||||
const viewHeight = window.innerHeight;
|
||||
let currentPosition = 0;
|
||||
|
||||
while (currentPosition < totalHeight - viewHeight) {
|
||||
const scrollAmount = Math.random() * 300 + 100;
|
||||
window.scrollBy({
|
||||
top: scrollAmount,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
currentPosition += scrollAmount;
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
}
|
||||
};
|
||||
|
||||
await smoothScroll();
|
||||
console.log('Human-like behavior simulation completed');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=human_behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=3.0,
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Simulated human-like scrolling and reading patterns")
|
||||
print(f" ✓ Added random delays between actions")
|
||||
print(f" ✓ Result: {result.success}")
|
||||
|
||||
# Best Practice 2: Use appropriate viewport and user agent
|
||||
print(f"\n{Fore.YELLOW}2. Use Realistic Viewport and User Agent:{Style.RESET_ALL}")
|
||||
|
||||
# Get a realistic user agent
|
||||
from crawl4ai.user_agent_generator import UserAgentGenerator
|
||||
ua_generator = UserAgentGenerator()
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=ua_generator.generate(device_type="desktop", browser_type="chrome")
|
||||
)
|
||||
|
||||
print(f" ✓ Using realistic viewport: 1920x1080")
|
||||
print(f" ✓ Using current Chrome user agent")
|
||||
print(f" ✓ Stealth mode will ensure consistency")
|
||||
|
||||
# Best Practice 3: Manage request rate
|
||||
print(f"\n{Fore.YELLOW}3. Manage Request Rate:{Style.RESET_ALL}")
|
||||
print(f" ✓ Add delays between requests")
|
||||
print(f" ✓ Randomize timing patterns")
|
||||
print(f" ✓ Respect robots.txt")
|
||||
|
||||
# Best Practice 4: Session management
|
||||
print(f"\n{Fore.YELLOW}4. Use Session Management:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Create a session for multiple requests
|
||||
session_id = "stealth_session_1"
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
# First request
|
||||
result1 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Subsequent request reuses the same browser context
|
||||
result2 = await crawler.arun(
|
||||
url="https://example.com/about",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Reused browser session for multiple requests")
|
||||
print(f" ✓ Maintains cookies and state between requests")
|
||||
print(f" ✓ More efficient and realistic browsing pattern")
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
|
||||
# Run comparison tests
|
||||
await compare_results()
|
||||
|
||||
# Show best practices
|
||||
await stealth_best_practices()
|
||||
|
||||
print(f"\n{Fore.GREEN}Examples completed!{Style.RESET_ALL}")
|
||||
print(f"\n{Fore.YELLOW}Remember:{Style.RESET_ALL}")
|
||||
print(f"• Stealth mode helps with basic bot detection")
|
||||
print(f"• Always respect website terms of service")
|
||||
print(f"• Consider rate limiting and ethical scraping practices")
|
||||
print(f"• For advanced protection, consider additional measures")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
215
docs/examples/stealth_mode_quick_start.py
Normal file
215
docs/examples/stealth_mode_quick_start.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Quick Start: Using Stealth Mode in Crawl4AI
|
||||
|
||||
This example shows practical use cases for the stealth mode feature.
|
||||
Stealth mode helps bypass basic bot detection mechanisms.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def example_1_basic_stealth():
|
||||
"""Example 1: Basic stealth mode usage"""
|
||||
print("\n=== Example 1: Basic Stealth Mode ===")
|
||||
|
||||
# Enable stealth mode in browser config
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # This is the key parameter
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
print(f"✓ Crawled {result.url} successfully")
|
||||
print(f"✓ Title: {result.metadata.get('title', 'N/A')}")
|
||||
|
||||
|
||||
async def example_2_stealth_with_screenshot():
|
||||
"""Example 2: Stealth mode with screenshot to show detection results"""
|
||||
print("\n=== Example 2: Stealth Mode Visual Verification ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False # Set to False to see the browser
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully crawled bot detection site")
|
||||
print(f"✓ With stealth enabled, many detection tests should show as passed")
|
||||
|
||||
if result.screenshot:
|
||||
# Save screenshot for verification
|
||||
import base64
|
||||
with open("stealth_detection_results.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f"✓ Screenshot saved as 'stealth_detection_results.png'")
|
||||
print(f" Check the screenshot to see detection results!")
|
||||
|
||||
|
||||
async def example_3_stealth_for_protected_sites():
|
||||
"""Example 3: Using stealth for sites with bot protection"""
|
||||
print("\n=== Example 3: Stealth for Protected Sites ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Add human-like behavior
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0, # Wait 2 seconds
|
||||
js_code="""
|
||||
// Simulate human-like scrolling
|
||||
window.scrollTo({
|
||||
top: document.body.scrollHeight / 2,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
"""
|
||||
)
|
||||
|
||||
# Try accessing a site that might have bot protection
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/products/slack/reviews",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully accessed protected site")
|
||||
print(f"✓ Retrieved {len(result.html)} characters of HTML")
|
||||
else:
|
||||
print(f"✗ Failed to access site: {result.error_message}")
|
||||
|
||||
|
||||
async def example_4_stealth_with_sessions():
|
||||
"""Example 4: Stealth mode with session management"""
|
||||
print("\n=== Example 4: Stealth + Session Management ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
session_id = "my_stealth_session"
|
||||
|
||||
# First request - establish session
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
result1 = await crawler.arun(
|
||||
url="https://news.ycombinator.com",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ First request completed: {result1.url}")
|
||||
|
||||
# Second request - reuse session
|
||||
await asyncio.sleep(2) # Brief delay between requests
|
||||
|
||||
result2 = await crawler.arun(
|
||||
url="https://news.ycombinator.com/best",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ Second request completed: {result2.url}")
|
||||
print(f"✓ Session reused, maintaining cookies and state")
|
||||
|
||||
|
||||
async def example_5_stealth_comparison():
|
||||
"""Example 5: Compare results with and without stealth using screenshots"""
|
||||
print("\n=== Example 5: Stealth Mode Comparison ===")
|
||||
|
||||
test_url = "https://bot.sannysoft.com"
|
||||
|
||||
# First test WITHOUT stealth
|
||||
print("\nWithout stealth:")
|
||||
regular_config = BrowserConfig(
|
||||
enable_stealth=False,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=regular_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_without_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_without_stealth.png")
|
||||
print(f" Many tests will show as FAILED (red)")
|
||||
|
||||
# Then test WITH stealth
|
||||
print("\nWith stealth:")
|
||||
stealth_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=stealth_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_with_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_with_stealth.png")
|
||||
print(f" More tests should show as PASSED (green)")
|
||||
|
||||
print("\nCompare the two screenshots to see the difference!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
print("Crawl4AI Stealth Mode Examples")
|
||||
print("==============================")
|
||||
|
||||
# Run basic example
|
||||
await example_1_basic_stealth()
|
||||
|
||||
# Run screenshot verification example
|
||||
await example_2_stealth_with_screenshot()
|
||||
|
||||
# Run protected site example
|
||||
await example_3_stealth_for_protected_sites()
|
||||
|
||||
# Run session example
|
||||
await example_4_stealth_with_sessions()
|
||||
|
||||
# Run comparison example
|
||||
await example_5_stealth_comparison()
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("Tips for using stealth mode effectively:")
|
||||
print("- Use realistic viewport sizes (1920x1080, 1366x768)")
|
||||
print("- Add delays between requests to appear more human")
|
||||
print("- Combine with session management for better results")
|
||||
print("- Remember: stealth mode is for legitimate scraping only")
|
||||
print("="*50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
62
docs/examples/stealth_test_simple.py
Normal file
62
docs/examples/stealth_test_simple.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Simple test to verify stealth mode is working
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def test_stealth():
|
||||
"""Test stealth mode effectiveness"""
|
||||
|
||||
# Test WITHOUT stealth
|
||||
print("=== WITHOUT Stealth ===")
|
||||
config1 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config1) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("without_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: without_stealth.png")
|
||||
|
||||
# Test WITH stealth
|
||||
print("\n=== WITH Stealth ===")
|
||||
config2 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config2) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("with_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: with_stealth.png")
|
||||
|
||||
print("\nCheck the screenshots to see the difference in bot detection results!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_stealth())
|
||||
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Basic Undetected Browser Test
|
||||
Simple example to test if undetected mode works
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def test_regular_mode():
|
||||
"""Test with regular browser"""
|
||||
print("Testing Regular Browser Mode...")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Regular Mode - Success: {result.success}")
|
||||
print(f"Regular Mode - Status: {result.status_code}")
|
||||
print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def test_undetected_mode():
|
||||
"""Test with undetected browser"""
|
||||
print("\nTesting Undetected Browser Mode...")
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Undetected Mode - Success: {result.success}")
|
||||
print(f"Undetected Mode - Status: {result.status_code}")
|
||||
print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def main():
|
||||
"""Run both tests"""
|
||||
print("🤖 Crawl4AI Basic Adapter Test\n")
|
||||
|
||||
# Test regular mode
|
||||
regular_success = await test_regular_mode()
|
||||
|
||||
# Test undetected mode
|
||||
undetected_success = await test_undetected_mode()
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*50)
|
||||
print("Summary:")
|
||||
print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}")
|
||||
print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}")
|
||||
print("="*50)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Bot Detection Test - Compare Regular vs Undetected
|
||||
Tests browser fingerprinting differences at bot.sannysoft.com
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Bot detection test site
|
||||
TEST_URL = "https://bot.sannysoft.com"
|
||||
|
||||
def analyze_bot_detection(result: CrawlResult) -> dict:
|
||||
"""Analyze bot detection results from the page"""
|
||||
detections = {
|
||||
"webdriver": False,
|
||||
"headless": False,
|
||||
"automation": False,
|
||||
"user_agent": False,
|
||||
"total_tests": 0,
|
||||
"failed_tests": 0
|
||||
}
|
||||
|
||||
if not result.success or not result.html:
|
||||
return detections
|
||||
|
||||
# Look for specific test results in the HTML
|
||||
html_lower = result.html.lower()
|
||||
|
||||
# Check for common bot indicators
|
||||
if "webdriver" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["webdriver"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "headless" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["headless"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "automation" in html_lower and "detected" in html_lower:
|
||||
detections["automation"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
# Count total tests (approximate)
|
||||
detections["total_tests"] = html_lower.count("test") + html_lower.count("check")
|
||||
|
||||
return detections
|
||||
|
||||
async def test_browser_mode(adapter_name: str, adapter=None):
|
||||
"""Test a browser mode and return results"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing: {adapter_name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Run in headed mode for better results
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
if adapter:
|
||||
# Use undetected mode
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
crawler = AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
)
|
||||
else:
|
||||
# Use regular mode
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
async with crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Let detection scripts run
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=False, # Don't simulate for accurate detection
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
|
||||
if result.success:
|
||||
# Analyze detection results
|
||||
detections = analyze_bot_detection(result)
|
||||
|
||||
print(f"\n🔍 Bot Detection Analysis:")
|
||||
print(f" - WebDriver Detected: {'❌ Yes' if detections['webdriver'] else '✅ No'}")
|
||||
print(f" - Headless Detected: {'❌ Yes' if detections['headless'] else '✅ No'}")
|
||||
print(f" - Automation Detected: {'❌ Yes' if detections['automation'] else '✅ No'}")
|
||||
print(f" - Failed Tests: {detections['failed_tests']}")
|
||||
|
||||
# Show some content
|
||||
if result.markdown.raw_markdown:
|
||||
print(f"\nContent preview:")
|
||||
lines = result.markdown.raw_markdown.split('\n')
|
||||
for line in lines[:20]: # Show first 20 lines
|
||||
if any(keyword in line.lower() for keyword in ['test', 'pass', 'fail', 'yes', 'no']):
|
||||
print(f" {line.strip()}")
|
||||
|
||||
return result, detections if result.success else {}
|
||||
|
||||
async def main():
|
||||
"""Run the comparison"""
|
||||
print("🤖 Crawl4AI - Bot Detection Test")
|
||||
print(f"Testing at: {TEST_URL}")
|
||||
print("This site runs various browser fingerprinting tests\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result, regular_detections = await test_browser_mode("Regular Browser")
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_result, undetected_detections = await test_browser_mode(
|
||||
"Undetected Browser",
|
||||
undetected_adapter
|
||||
)
|
||||
|
||||
# Summary comparison
|
||||
print(f"\n{'='*60}")
|
||||
print("COMPARISON SUMMARY")
|
||||
print(f"{'='*60}")
|
||||
|
||||
print(f"\n{'Test':<25} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*55}")
|
||||
|
||||
if regular_detections and undetected_detections:
|
||||
print(f"{'WebDriver Detection':<25} {'❌ Detected' if regular_detections['webdriver'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['webdriver'] else '✅ Passed':<15}")
|
||||
print(f"{'Headless Detection':<25} {'❌ Detected' if regular_detections['headless'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['headless'] else '✅ Passed':<15}")
|
||||
print(f"{'Automation Detection':<25} {'❌ Detected' if regular_detections['automation'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['automation'] else '✅ Passed':<15}")
|
||||
print(f"{'Failed Tests':<25} {regular_detections['failed_tests']:<15} {undetected_detections['failed_tests']:<15}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
|
||||
if undetected_detections.get('failed_tests', 0) < regular_detections.get('failed_tests', 1):
|
||||
print("✅ Undetected browser performed better at evading detection!")
|
||||
else:
|
||||
print("ℹ️ Both browsers had similar detection results")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Undetected Browser Test - Cloudflare Protected Site
|
||||
Tests the difference between regular and undetected modes on a Cloudflare-protected site
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Test URL with Cloudflare protection
|
||||
TEST_URL = "https://nowsecure.nl"
|
||||
|
||||
async def test_regular_browser():
|
||||
"""Test with regular browser - likely to be blocked"""
|
||||
print("=" * 60)
|
||||
print("Testing with Regular Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
magic=True, # Try with magic mode too
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def test_undetected_browser():
|
||||
"""Test with undetected browser - should bypass Cloudflare"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with Undetected Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless is easier to detect
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Compare regular vs undetected browser"""
|
||||
print("🤖 Crawl4AI - Cloudflare Bypass Test")
|
||||
print(f"Testing URL: {TEST_URL}\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result = await test_regular_browser()
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_result = await test_undetected_browser()
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("SUMMARY")
|
||||
print("=" * 60)
|
||||
print(f"Regular Browser:")
|
||||
print(f" - Success: {regular_result.success}")
|
||||
print(f" - Content Length: {len(regular_result.markdown.raw_markdown) if regular_result.markdown else 0}")
|
||||
|
||||
print(f"\nUndetected Browser:")
|
||||
print(f" - Success: {undetected_result.success}")
|
||||
print(f" - Content Length: {len(undetected_result.markdown.raw_markdown) if undetected_result.markdown else 0}")
|
||||
|
||||
if undetected_result.success and len(undetected_result.markdown.raw_markdown) > len(regular_result.markdown.raw_markdown):
|
||||
print("\n✅ Undetected browser successfully bypassed protection!")
|
||||
print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Undetected vs Regular Browser Comparison
|
||||
This example demonstrates the difference between regular and undetected browser modes
|
||||
when accessing sites with bot detection services.
|
||||
|
||||
Based on tested anti-bot services:
|
||||
- Cloudflare
|
||||
- Kasada
|
||||
- Akamai
|
||||
- DataDome
|
||||
- Bet365
|
||||
- And others
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
PlaywrightAdapter,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
# Test URLs for various bot detection services
|
||||
TEST_SITES = {
|
||||
"Cloudflare Protected": "https://nowsecure.nl",
|
||||
# "Bot Detection Test": "https://bot.sannysoft.com",
|
||||
# "Fingerprint Test": "https://fingerprint.com/products/bot-detection",
|
||||
# "Browser Scan": "https://browserscan.net",
|
||||
# "CreepJS": "https://abrahamjuliot.github.io/creepjs",
|
||||
}
|
||||
|
||||
|
||||
async def test_with_adapter(url: str, adapter_name: str, adapter):
|
||||
"""Test a URL with a specific adapter"""
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Better for avoiding detection
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with the adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing with {adapter_name} adapter")
|
||||
print(f"URL: {url}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Give page time to load
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=True, # Add user simulation
|
||||
)
|
||||
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url=url,
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
# Check results
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ Success: {result.success}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
print(f"✓ Markdown Length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for common bot detection indicators
|
||||
detection_indicators = [
|
||||
"Access denied",
|
||||
"Please verify you are human",
|
||||
"Checking your browser",
|
||||
"Enable JavaScript",
|
||||
"captcha",
|
||||
"403 Forbidden",
|
||||
"Bot detection",
|
||||
"Security check"
|
||||
]
|
||||
|
||||
content_lower = result.markdown.raw_markdown.lower()
|
||||
detected = False
|
||||
for indicator in detection_indicators:
|
||||
if indicator.lower() in content_lower:
|
||||
print(f"⚠️ Possible detection: Found '{indicator}'")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected:
|
||||
print("✅ No obvious bot detection triggered!")
|
||||
# Show first 200 chars of content
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
|
||||
return result.success and not detected
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
async def compare_adapters(url: str, site_name: str):
|
||||
"""Compare regular and undetected adapters on the same URL"""
|
||||
print(f"\n{'#'*60}")
|
||||
print(f"# Testing: {site_name}")
|
||||
print(f"{'#'*60}")
|
||||
|
||||
# Test with regular adapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
regular_success = await test_with_adapter(url, "Regular", regular_adapter)
|
||||
|
||||
# Small delay between tests
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_success = await test_with_adapter(url, "Undetected", undetected_adapter)
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Summary for {site_name}:")
|
||||
print(f"Regular Adapter: {'✅ Passed' if regular_success else '❌ Blocked/Detected'}")
|
||||
print(f"Undetected Adapter: {'✅ Passed' if undetected_success else '❌ Blocked/Detected'}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
return regular_success, undetected_success
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run comparison tests on multiple sites"""
|
||||
print("🤖 Crawl4AI Browser Adapter Comparison")
|
||||
print("Testing regular vs undetected browser modes\n")
|
||||
|
||||
results = {}
|
||||
|
||||
# Test each site
|
||||
for site_name, url in TEST_SITES.items():
|
||||
regular, undetected = await compare_adapters(url, site_name)
|
||||
results[site_name] = {
|
||||
"regular": regular,
|
||||
"undetected": undetected
|
||||
}
|
||||
|
||||
# Delay between different sites
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# Final summary
|
||||
print(f"\n{'#'*60}")
|
||||
print("# FINAL RESULTS")
|
||||
print(f"{'#'*60}")
|
||||
print(f"{'Site':<30} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*60}")
|
||||
|
||||
for site, result in results.items():
|
||||
regular_status = "✅ Passed" if result["regular"] else "❌ Blocked"
|
||||
undetected_status = "✅ Passed" if result["undetected"] else "❌ Blocked"
|
||||
print(f"{site:<30} {regular_status:<15} {undetected_status:<15}")
|
||||
|
||||
# Calculate success rates
|
||||
regular_success = sum(1 for r in results.values() if r["regular"])
|
||||
undetected_success = sum(1 for r in results.values() if r["undetected"])
|
||||
total = len(results)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Success Rates:")
|
||||
print(f"Regular Adapter: {regular_success}/{total} ({regular_success/total*100:.1f}%)")
|
||||
print(f"Undetected Adapter: {undetected_success}/{total} ({undetected_success/total*100:.1f}%)")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Note: This example may take a while to run as it tests multiple sites
|
||||
# You can comment out sites in TEST_SITES to run faster tests
|
||||
asyncio.run(main())
|
||||
118
docs/examples/undetected_simple_demo.py
Normal file
118
docs/examples/undetected_simple_demo.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Simple Undetected Browser Demo
|
||||
Demonstrates the basic usage of undetected browser mode
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def crawl_with_regular_browser(url: str):
|
||||
"""Crawl with regular browser"""
|
||||
print("\n[Regular Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def crawl_with_undetected_browser(url: str):
|
||||
"""Crawl with undetected browser"""
|
||||
print("\n[Undetected Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create undetected adapter and strategy
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Demo comparing regular vs undetected modes"""
|
||||
print("🤖 Crawl4AI Undetected Browser Demo")
|
||||
print("="*50)
|
||||
|
||||
# Test URLs - you can change these
|
||||
test_urls = [
|
||||
"https://www.example.com", # Simple site
|
||||
"https://httpbin.org/headers", # Shows request headers
|
||||
]
|
||||
|
||||
for url in test_urls:
|
||||
print(f"\n📍 Testing URL: {url}")
|
||||
|
||||
# Test with regular browser
|
||||
regular_result = await crawl_with_regular_browser(url)
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected browser
|
||||
undetected_result = await crawl_with_undetected_browser(url)
|
||||
|
||||
# Compare results
|
||||
print(f"\n📊 Comparison for {url}:")
|
||||
print(f"Regular browser content: {len(regular_result.markdown.raw_markdown)} chars")
|
||||
print(f"Undetected browser content: {len(undetected_result.markdown.raw_markdown)} chars")
|
||||
|
||||
if url == "https://httpbin.org/headers":
|
||||
# Show headers for comparison
|
||||
print("\nHeaders seen by server:")
|
||||
print("Regular:", regular_result.markdown.raw_markdown[:500])
|
||||
print("\nUndetected:", undetected_result.markdown.raw_markdown[:500])
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -358,9 +358,77 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
## 7. Anti-Bot Features (Stealth Mode & Undetected Browser)
|
||||
|
||||
Crawl4AI provides two powerful features to bypass bot detection:
|
||||
|
||||
### 7.1 Stealth Mode
|
||||
|
||||
Stealth mode uses playwright-stealth to modify browser fingerprints and behaviors. Enable it with a simple flag:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Activates stealth mode
|
||||
headless=False
|
||||
)
|
||||
```
|
||||
|
||||
**When to use**: Sites with basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
|
||||
### 7.2 Undetected Browser
|
||||
|
||||
For advanced bot detection, use the undetected browser adapter:
|
||||
|
||||
```python
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=strategy, config=browser_config) as crawler:
|
||||
# Your crawling code
|
||||
```
|
||||
|
||||
**When to use**: Sites with sophisticated bot detection (Cloudflare, DataDome, etc.)
|
||||
|
||||
### 7.3 Combining Both
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter() # Use undetected browser
|
||||
```
|
||||
|
||||
### Choosing the Right Approach
|
||||
|
||||
| Detection Level | Recommended Approach |
|
||||
|----------------|---------------------|
|
||||
| No protection | Regular browser |
|
||||
| Basic checks | Regular + Stealth mode |
|
||||
| Advanced protection | Undetected browser |
|
||||
| Maximum evasion | Undetected + Stealth mode |
|
||||
|
||||
**Best Practice**: Start with regular browser + stealth mode. Only use undetected browser if needed, as it may be slightly slower.
|
||||
|
||||
See [Undetected Browser Mode](undetected-browser.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion & Next Steps
|
||||
|
||||
You’ve now explored several **advanced** features:
|
||||
You've now explored several **advanced** features:
|
||||
|
||||
- **Proxy Usage**
|
||||
- **PDF & Screenshot** capturing for large or critical pages
|
||||
@@ -368,7 +436,10 @@ You’ve now explored several **advanced** features:
|
||||
- **Custom Headers** for language or specialized requests
|
||||
- **Session Persistence** via storage state
|
||||
- **Robots.txt Compliance**
|
||||
- **Anti-Bot Features** (Stealth Mode & Undetected Browser)
|
||||
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, manage sessions across multiple runs, and bypass bot detection—streamlining your entire data collection pipeline.
|
||||
|
||||
**Last Updated**: 2025-01-01
|
||||
**Note**: In future versions, we may enable stealth mode and undetected browser by default. For now, users should explicitly enable these features when needed.
|
||||
|
||||
**Last Updated**: 2025-01-17
|
||||
394
docs/md_v2/advanced/undetected-browser.md
Normal file
394
docs/md_v2/advanced/undetected-browser.md
Normal file
@@ -0,0 +1,394 @@
|
||||
# Undetected Browser Mode
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection:
|
||||
|
||||
1. **Stealth Mode** - Uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
2. **Undetected Browser Mode** - Advanced browser adapter with deep-level patches for sophisticated bot detection
|
||||
|
||||
This guide covers both features and helps you choose the right approach for your needs.
|
||||
|
||||
## Anti-Bot Features Comparison
|
||||
|
||||
| Feature | Regular Browser | Stealth Mode | Undetected Browser |
|
||||
|---------|----------------|--------------|-------------------|
|
||||
| WebDriver Detection | ❌ | ✅ | ✅ |
|
||||
| Navigator Properties | ❌ | ✅ | ✅ |
|
||||
| Plugin Emulation | ❌ | ✅ | ✅ |
|
||||
| CDP Detection | ❌ | Partial | ✅ |
|
||||
| Deep Browser Patches | ❌ | ❌ | ✅ |
|
||||
| Performance Impact | None | Minimal | Moderate |
|
||||
| Setup Complexity | None | None | Minimal |
|
||||
|
||||
## When to Use Each Approach
|
||||
|
||||
### Use Regular Browser + Stealth Mode When:
|
||||
- Sites have basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
- You need good performance with basic protection
|
||||
- Sites check for common automation indicators
|
||||
|
||||
### Use Undetected Browser When:
|
||||
- Sites employ sophisticated bot detection services (Cloudflare, DataDome, etc.)
|
||||
- Stealth mode alone isn't sufficient
|
||||
- You're willing to trade some performance for better evasion
|
||||
|
||||
### Best Practice: Progressive Enhancement
|
||||
1. **Start with**: Regular browser + Stealth mode
|
||||
2. **If blocked**: Switch to Undetected browser
|
||||
3. **If still blocked**: Combine Undetected browser + Stealth mode
|
||||
|
||||
## Stealth Mode
|
||||
|
||||
Stealth mode is the simpler anti-bot solution that works with both regular and undetected browsers:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
# Enable stealth mode with regular browser
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Simple flag to enable
|
||||
headless=False # Better for avoiding detection
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
```
|
||||
|
||||
### What Stealth Mode Does:
|
||||
- Removes `navigator.webdriver` flag
|
||||
- Modifies browser fingerprints
|
||||
- Emulates realistic plugin behavior
|
||||
- Adjusts navigator properties
|
||||
- Fixes common automation leaks
|
||||
|
||||
## Undetected Browser Mode
|
||||
|
||||
For sites with sophisticated bot detection that stealth mode can't bypass, use the undetected browser adapter:
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Drop-in Replacement**: Uses the same API as regular browser mode
|
||||
- **Enhanced Stealth**: Built-in patches to evade common detection methods
|
||||
- **Browser Adapter Pattern**: Seamlessly switch between regular and undetected modes
|
||||
- **Automatic Installation**: `crawl4ai-setup` installs all necessary browser dependencies
|
||||
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def main():
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless mode can be detected easier
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Your crawling code here
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Combining Both Features
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create browser config with stealth enabled
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth mode
|
||||
headless=False
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with both features
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://protected-site.com")
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Basic Stealth Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def test_stealth_mode():
|
||||
# Simple stealth mode configuration
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(screenshot=True)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("✓ Successfully accessed bot detection test site")
|
||||
# Save screenshot to verify detection results
|
||||
if result.screenshot:
|
||||
import base64
|
||||
with open("stealth_test.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("✓ Screenshot saved - check for green (passed) tests")
|
||||
|
||||
asyncio.run(test_stealth_mode())
|
||||
```
|
||||
|
||||
### Example 2: Undetected Browser Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Test adapter console capture
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Browser Adapter Pattern
|
||||
|
||||
The undetected browser support is implemented using an adapter pattern, allowing seamless switching between different browser implementations:
|
||||
|
||||
```python
|
||||
# Regular browser adapter (default)
|
||||
from crawl4ai import PlaywrightAdapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
|
||||
# Undetected browser adapter
|
||||
from crawl4ai import UndetectedAdapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
```
|
||||
|
||||
The adapter handles:
|
||||
- JavaScript execution
|
||||
- Console message capture
|
||||
- Error handling
|
||||
- Browser-specific optimizations
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Avoid Headless Mode**: Detection is easier in headless mode
|
||||
```python
|
||||
browser_config = BrowserConfig(headless=False)
|
||||
```
|
||||
|
||||
2. **Use Reasonable Delays**: Don't rush through pages
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
wait_time=3.0, # Wait 3 seconds after page load
|
||||
delay_before_return_html=2.0 # Additional delay
|
||||
)
|
||||
```
|
||||
|
||||
3. **Rotate User Agents**: You can customize user agents
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headers={"User-Agent": "your-user-agent"}
|
||||
)
|
||||
```
|
||||
|
||||
4. **Handle Failures Gracefully**: Some sites may still detect and block
|
||||
```python
|
||||
if not result.success:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
|
||||
## Advanced Usage Tips
|
||||
|
||||
### Progressive Detection Handling
|
||||
|
||||
```python
|
||||
async def crawl_with_progressive_evasion(url):
|
||||
# Step 1: Try regular browser with stealth
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
if result.success and "Access Denied" not in result.html:
|
||||
return result
|
||||
|
||||
# Step 2: If blocked, try undetected browser
|
||||
print("Regular + stealth blocked, trying undetected browser...")
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
return result
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
The undetected browser dependencies are automatically installed when you run:
|
||||
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
This command installs all necessary browser dependencies for both regular and undetected modes.
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Performance**: Slightly slower than regular mode due to additional patches
|
||||
- **Headless Detection**: Some sites can still detect headless mode
|
||||
- **Resource Usage**: May use more resources than regular mode
|
||||
- **Not 100% Guaranteed**: Advanced anti-bot services are constantly evolving
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Browser Not Found
|
||||
|
||||
Run the setup command:
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
### Detection Still Occurring
|
||||
|
||||
Try combining with other features:
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
simulate_user=True, # Add user simulation
|
||||
magic=True, # Enable magic mode
|
||||
wait_time=5.0, # Longer waits
|
||||
)
|
||||
```
|
||||
|
||||
### Performance Issues
|
||||
|
||||
If experiencing slow performance:
|
||||
```python
|
||||
# Use selective undetected mode only for protected sites
|
||||
if is_protected_site(url):
|
||||
adapter = UndetectedAdapter()
|
||||
else:
|
||||
adapter = PlaywrightAdapter() # Default adapter
|
||||
```
|
||||
|
||||
## Future Plans
|
||||
|
||||
**Note**: In future versions of Crawl4AI, we may enable stealth mode and undetected browser by default to provide better out-of-the-box success rates. For now, users should explicitly enable these features when needed.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI provides flexible anti-bot solutions:
|
||||
|
||||
1. **Start Simple**: Use regular browser + stealth mode for most sites
|
||||
2. **Escalate if Needed**: Switch to undetected browser for sophisticated protection
|
||||
3. **Combine for Maximum Effect**: Use both features together when facing the toughest challenges
|
||||
|
||||
Remember:
|
||||
- Always respect robots.txt and website terms of service
|
||||
- Use appropriate delays to avoid overwhelming servers
|
||||
- Consider the performance trade-offs of each approach
|
||||
- Test progressively to find the minimum necessary evasion level
|
||||
|
||||
## See Also
|
||||
|
||||
- [Advanced Features](advanced-features.md) - Overview of all advanced features
|
||||
- [Proxy & Security](proxy-security.md) - Using proxies with anti-bot features
|
||||
- [Session Management](session-management.md) - Maintaining sessions across requests
|
||||
- [Identity Based Crawling](identity-based-crawling.md) - Additional anti-detection strategies
|
||||
@@ -29,6 +29,7 @@ class BrowserConfig:
|
||||
text_mode=False,
|
||||
light_mode=False,
|
||||
extra_args=None,
|
||||
enable_stealth=False,
|
||||
# ... other advanced parameters omitted here
|
||||
):
|
||||
...
|
||||
@@ -84,6 +85,11 @@ class BrowserConfig:
|
||||
- Additional flags for the underlying browser.
|
||||
- E.g. `["--disable-extensions"]`.
|
||||
|
||||
11. **`enable_stealth`**:
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
- Default is `False`. Recommended for sites with bot protection.
|
||||
|
||||
### Helper Methods
|
||||
|
||||
Both configuration classes provide a `clone()` method to create modified copies:
|
||||
|
||||
@@ -54,6 +54,16 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) |
|
||||
| SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) |
|
||||
|
||||
## Anti-Bot & Stealth Features
|
||||
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) |
|
||||
| Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) |
|
||||
| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) |
|
||||
| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) |
|
||||
| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) |
|
||||
|
||||
## Customization & Security
|
||||
|
||||
| Example | Description | Link |
|
||||
|
||||
@@ -18,7 +18,7 @@ crawl4ai-setup
|
||||
```
|
||||
|
||||
**What does it do?**
|
||||
- Installs or updates required Playwright browsers (Chromium, Firefox, etc.)
|
||||
- Installs or updates required browser dependencies for both regular and undetected modes
|
||||
- Performs OS-level checks (e.g., missing libs on Linux)
|
||||
- Confirms your environment is ready to crawl
|
||||
|
||||
|
||||
Reference in New Issue
Block a user