- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig - Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern - Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching - Update install.py to install both playwright and patchright browsers automatically - Add comprehensive documentation for anti-bot features (stealth mode + undetected browser) - Create examples demonstrating stealth mode usage and comparison tests - Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies - Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium) - Add dependency checker tool in tests/check_dependencies.py Breaking changes: None - all existing functionality preserved 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
215 lines
7.1 KiB
Python
215 lines
7.1 KiB
Python
"""
Quick Start: Using Stealth Mode in Crawl4AI

This example shows practical use cases for the stealth mode feature.
Stealth mode helps bypass basic bot detection mechanisms.
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
|
|
|
|
async def example_1_basic_stealth():
    """Example 1: Basic stealth mode usage.

    Crawls https://example.com with ``enable_stealth=True`` in a headless
    browser and prints the crawled URL and page title. If the crawl fails,
    the error message is printed instead of a success line.
    """
    print("\n=== Example 1: Basic Stealth Mode ===")

    # Enable stealth mode in browser config
    browser_config = BrowserConfig(
        enable_stealth=True,  # This is the key parameter
        headless=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com")

        # Do not claim success (or touch metadata) when the crawl failed.
        if not result.success:
            print(f"✗ Failed to crawl {result.url}: {result.error_message}")
            return

        # NOTE(review): metadata is presumably a dict that may be None when
        # nothing was extracted; fall back to an empty dict — confirm.
        metadata = result.metadata or {}
        print(f"✓ Crawled {result.url} successfully")
        print(f"✓ Title: {metadata.get('title', 'N/A')}")
|
|
|
|
|
|
async def example_2_stealth_with_screenshot():
    """Example 2: Stealth mode with screenshot to show detection results.

    Opens https://bot.sannysoft.com in a visible (non-headless) browser with
    stealth enabled, waits for ``networkidle``, and writes the returned
    screenshot to ``stealth_detection_results.png`` in the current working
    directory. A failed crawl is reported with its error message rather than
    being silently ignored.
    """
    import base64

    print("\n=== Example 2: Stealth Mode Visual Verification ===")

    browser_config = BrowserConfig(
        enable_stealth=True,
        headless=False,  # Set to False to see the browser
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        config = CrawlerRunConfig(
            screenshot=True,
            wait_until="networkidle",
        )

        result = await crawler.arun(
            url="https://bot.sannysoft.com",
            config=config,
        )

        # Report the failure, mirroring the protected-site example.
        if not result.success:
            print(f"✗ Failed to crawl bot detection site: {result.error_message}")
            return

        print("✓ Successfully crawled bot detection site")
        print("✓ With stealth enabled, many detection tests should show as passed")

        if result.screenshot:
            # Save screenshot for verification; the payload is base64 text,
            # so decode it to raw bytes before writing.
            with open("stealth_detection_results.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))
            print("✓ Screenshot saved as 'stealth_detection_results.png'")
            print(" Check the screenshot to see detection results!")
|
|
|
|
|
|
async def example_3_stealth_for_protected_sites():
    """Example 3: Using stealth for sites with bot protection.

    Crawls a G2 reviews page with a headless 1920x1080 stealth browser,
    running a small scroll script in the page and waiting 2 seconds before
    the HTML is returned. Prints the HTML length on success, or the error
    message on failure.
    """
    print("\n=== Example 3: Stealth for Protected Sites ===")

    stealth_browser = BrowserConfig(
        enable_stealth=True,
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
    )

    # Script run in the page to add human-like behavior
    scroll_script = """
                // Simulate human-like scrolling
                window.scrollTo({
                    top: document.body.scrollHeight / 2,
                    behavior: 'smooth'
                });
            """

    run_config = CrawlerRunConfig(
        wait_until="networkidle",
        delay_before_return_html=2.0,  # Wait 2 seconds
        js_code=scroll_script,
    )

    async with AsyncWebCrawler(config=stealth_browser) as crawler:
        # Try accessing a site that might have bot protection
        outcome = await crawler.arun(
            url="https://www.g2.com/products/slack/reviews",
            config=run_config,
        )

        if not outcome.success:
            print(f"✗ Failed to access site: {outcome.error_message}")
            return

        print("✓ Successfully accessed protected site")
        print(f"✓ Retrieved {len(outcome.html)} characters of HTML")
|
|
|
|
|
|
async def example_4_stealth_with_sessions():
    """Example 4: Stealth mode with session management.

    Makes two requests to Hacker News through the same crawler, passing the
    same ``session_id`` in the run config for both, with a 2-second pause in
    between. Each request's outcome is checked; the example stops and prints
    the error message as soon as one request fails, so the final
    "session reused" line is only printed when both requests succeeded.
    """
    print("\n=== Example 4: Stealth + Session Management ===")

    browser_config = BrowserConfig(
        enable_stealth=True,
        headless=False,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        session_id = "my_stealth_session"

        # One run config, carrying the session id, is shared by both requests.
        config = CrawlerRunConfig(
            session_id=session_id,
            wait_until="domcontentloaded",
        )

        # First request - establish session
        result1 = await crawler.arun(
            url="https://news.ycombinator.com",
            config=config,
        )
        if not result1.success:
            print(f"✗ First request failed: {result1.error_message}")
            return
        print(f"✓ First request completed: {result1.url}")

        # Second request - reuse session
        await asyncio.sleep(2)  # Brief delay between requests

        result2 = await crawler.arun(
            url="https://news.ycombinator.com/best",
            config=config,
        )
        if not result2.success:
            print(f"✗ Second request failed: {result2.error_message}")
            return
        print(f"✓ Second request completed: {result2.url}")
        print("✓ Session reused, maintaining cookies and state")
|
|
|
|
|
|
async def _save_detection_screenshot(browser_config, url, output_path):
    """Crawl *url* with *browser_config* and save its screenshot.

    Args:
        browser_config: The ``BrowserConfig`` used to start the crawler.
        url: Page to crawl (waits for ``networkidle``).
        output_path: File the decoded screenshot bytes are written to.

    Returns:
        True when a screenshot was written to *output_path*; False when the
        crawl failed or returned no screenshot (the reason is printed).
    """
    import base64

    async with AsyncWebCrawler(config=browser_config) as crawler:
        config = CrawlerRunConfig(
            screenshot=True,
            wait_until="networkidle",
        )
        result = await crawler.arun(url=url, config=config)

        if not result.success:
            print(f" ✗ Crawl failed: {result.error_message}")
            return False
        if not result.screenshot:
            print(" ✗ No screenshot was returned")
            return False

        # The screenshot payload is base64 text; decode to raw bytes.
        with open(output_path, "wb") as f:
            f.write(base64.b64decode(result.screenshot))
        print(f" ✓ Screenshot saved: {output_path}")
        return True


async def example_5_stealth_comparison():
    """Example 5: Compare results with and without stealth using screenshots.

    Crawls https://bot.sannysoft.com twice in headless mode, first with
    ``enable_stealth=False`` and then with ``enable_stealth=True``, saving
    ``comparison_without_stealth.png`` and ``comparison_with_stealth.png``.
    The closing "compare" hint is only printed when both screenshots were
    actually saved.
    """
    print("\n=== Example 5: Stealth Mode Comparison ===")

    test_url = "https://bot.sannysoft.com"

    # First test WITHOUT stealth
    print("\nWithout stealth:")
    regular_config = BrowserConfig(
        enable_stealth=False,
        headless=True,
    )
    saved_regular = await _save_detection_screenshot(
        regular_config, test_url, "comparison_without_stealth.png"
    )
    if saved_regular:
        print(" Many tests will show as FAILED (red)")

    # Then test WITH stealth
    print("\nWith stealth:")
    stealth_config = BrowserConfig(
        enable_stealth=True,
        headless=True,
    )
    saved_stealth = await _save_detection_screenshot(
        stealth_config, test_url, "comparison_with_stealth.png"
    )
    if saved_stealth:
        print(" More tests should show as PASSED (green)")

    # Only invite a comparison when there are two screenshots to compare.
    if saved_regular and saved_stealth:
        print("\nCompare the two screenshots to see the difference!")
|
|
|
|
|
|
async def main():
    """Run all examples in order, then print usage tips."""
    print("Crawl4AI Stealth Mode Examples")
    print("==============================")

    # Basic, screenshot verification, protected site, session, comparison.
    examples = (
        example_1_basic_stealth,
        example_2_stealth_with_screenshot,
        example_3_stealth_for_protected_sites,
        example_4_stealth_with_sessions,
        example_5_stealth_comparison,
    )
    for run_example in examples:
        await run_example()

    rule = "=" * 50
    tips = (
        "Tips for using stealth mode effectively:",
        "- Use realistic viewport sizes (1920x1080, 1366x768)",
        "- Add delays between requests to appear more human",
        "- Combine with session management for better results",
        "- Remember: stealth mode is for legitimate scraping only",
    )

    print("\n" + rule)
    for tip in tips:
        print(tip)
    print(rule)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run every example on a fresh event loop.
    asyncio.run(main())