- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig - Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern - Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching - Update install.py to install both playwright and patchright browsers automatically - Add comprehensive documentation for anti-bot features (stealth mode + undetected browser) - Create examples demonstrating stealth mode usage and comparison tests - Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies - Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium) - Add dependency checker tool in tests/check_dependencies.py Breaking changes: None - all existing functionality preserved 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
164 lines
5.4 KiB
Python
164 lines
5.4 KiB
Python
"""
|
|
Undetected Browser Test - Cloudflare Protected Site
|
|
Tests the difference between regular and undetected modes on a Cloudflare-protected site
|
|
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
UndetectedAdapter
|
|
)
|
|
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
|
|
|
# Test URL with Cloudflare protection
|
|
TEST_URL = "https://nowsecure.nl"
|
|
|
|
async def test_regular_browser():
|
|
"""Test with regular browser - likely to be blocked"""
|
|
print("=" * 60)
|
|
print("Testing with Regular Browser")
|
|
print("=" * 60)
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=False,
|
|
verbose=True,
|
|
viewport_width=1920,
|
|
viewport_height=1080,
|
|
)
|
|
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
config = CrawlerRunConfig(
|
|
delay_before_return_html=2.0,
|
|
simulate_user=True,
|
|
magic=True, # Try with magic mode too
|
|
)
|
|
|
|
result = await crawler.arun(url=TEST_URL, config=config)
|
|
|
|
print(f"\n✓ Success: {result.success}")
|
|
print(f"✓ Status Code: {result.status_code}")
|
|
print(f"✓ HTML Length: {len(result.html)}")
|
|
|
|
# Check for Cloudflare challenge
|
|
if result.html:
|
|
cf_indicators = [
|
|
"Checking your browser",
|
|
"Please stand by",
|
|
"cloudflare",
|
|
"cf-browser-verification",
|
|
"Access denied",
|
|
"Ray ID"
|
|
]
|
|
|
|
detected = False
|
|
for indicator in cf_indicators:
|
|
if indicator.lower() in result.html.lower():
|
|
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
|
detected = True
|
|
break
|
|
|
|
if not detected and len(result.markdown.raw_markdown) > 100:
|
|
print("✅ Successfully bypassed Cloudflare!")
|
|
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
|
elif not detected:
|
|
print("⚠️ Page loaded but content seems minimal")
|
|
|
|
return result
|
|
|
|
async def test_undetected_browser():
|
|
"""Test with undetected browser - should bypass Cloudflare"""
|
|
print("\n" + "=" * 60)
|
|
print("Testing with Undetected Browser")
|
|
print("=" * 60)
|
|
|
|
browser_config = BrowserConfig(
|
|
headless=False, # Headless is easier to detect
|
|
verbose=True,
|
|
viewport_width=1920,
|
|
viewport_height=1080,
|
|
)
|
|
|
|
# Create undetected adapter
|
|
undetected_adapter = UndetectedAdapter()
|
|
|
|
# Create strategy with undetected adapter
|
|
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
|
browser_config=browser_config,
|
|
browser_adapter=undetected_adapter
|
|
)
|
|
|
|
async with AsyncWebCrawler(
|
|
crawler_strategy=crawler_strategy,
|
|
config=browser_config
|
|
) as crawler:
|
|
config = CrawlerRunConfig(
|
|
delay_before_return_html=2.0,
|
|
simulate_user=True,
|
|
)
|
|
|
|
result = await crawler.arun(url=TEST_URL, config=config)
|
|
|
|
print(f"\n✓ Success: {result.success}")
|
|
print(f"✓ Status Code: {result.status_code}")
|
|
print(f"✓ HTML Length: {len(result.html)}")
|
|
|
|
# Check for Cloudflare challenge
|
|
if result.html:
|
|
cf_indicators = [
|
|
"Checking your browser",
|
|
"Please stand by",
|
|
"cloudflare",
|
|
"cf-browser-verification",
|
|
"Access denied",
|
|
"Ray ID"
|
|
]
|
|
|
|
detected = False
|
|
for indicator in cf_indicators:
|
|
if indicator.lower() in result.html.lower():
|
|
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
|
detected = True
|
|
break
|
|
|
|
if not detected and len(result.markdown.raw_markdown) > 100:
|
|
print("✅ Successfully bypassed Cloudflare!")
|
|
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
|
elif not detected:
|
|
print("⚠️ Page loaded but content seems minimal")
|
|
|
|
return result
|
|
|
|
async def main():
|
|
"""Compare regular vs undetected browser"""
|
|
print("🤖 Crawl4AI - Cloudflare Bypass Test")
|
|
print(f"Testing URL: {TEST_URL}\n")
|
|
|
|
# Test regular browser
|
|
regular_result = await test_regular_browser()
|
|
|
|
# Small delay
|
|
await asyncio.sleep(2)
|
|
|
|
# Test undetected browser
|
|
undetected_result = await test_undetected_browser()
|
|
|
|
# Summary
|
|
print("\n" + "=" * 60)
|
|
print("SUMMARY")
|
|
print("=" * 60)
|
|
print(f"Regular Browser:")
|
|
print(f" - Success: {regular_result.success}")
|
|
print(f" - Content Length: {len(regular_result.markdown.raw_markdown) if regular_result.markdown else 0}")
|
|
|
|
print(f"\nUndetected Browser:")
|
|
print(f" - Success: {undetected_result.success}")
|
|
print(f" - Content Length: {len(undetected_result.markdown.raw_markdown) if undetected_result.markdown else 0}")
|
|
|
|
if undetected_result.success and len(undetected_result.markdown.raw_markdown) > len(regular_result.markdown.raw_markdown):
|
|
print("\n✅ Undetected browser successfully bypassed protection!")
|
|
print("=" * 60)
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main()) |