- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
115 lines
4.0 KiB
Python
115 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Simple test of anti-bot strategy functionality
|
||
"""
|
||
import asyncio
|
||
import sys
|
||
import os
|
||
|
||
# Put the current working directory first on the import path. This is the
# project root only when the script is launched from there.
sys.path.insert(0, os.getcwd())
|
||
|
||
async def test_antibot_strategies():
    """Crawl a local HTML page once per anti-bot strategy and print a report.

    A small HTML page (with an inline script that appends
    ``navigator.userAgent`` to the document) is written to a temporary
    directory and crawled through ``AsyncWebCrawler`` twice: once with
    ``PlaywrightAdapter`` and once with ``StealthAdapter`` (falling back to
    ``PlaywrightAdapter`` when ``StealthAdapter`` cannot be imported).

    The function is deliberately best-effort: every failure, including a
    missing ``crawl4ai`` installation, is printed together with its traceback
    instead of being raised, so the coroutine always returns ``None``.
    """
    # Local imports keep this block self-contained; the original also
    # imported ``traceback`` lazily inside its handlers.
    import tempfile
    import traceback
    from pathlib import Path

    print("🧪 Testing Anti-Bot Strategies with AsyncWebCrawler")
    print("=" * 60)

    try:
        from crawl4ai import (
            AsyncWebCrawler,
            BrowserConfig,
            CacheMode,
            CrawlerRunConfig,
        )
        from crawl4ai.browser_adapter import PlaywrightAdapter

        # Test HTML content
        test_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Anti-Bot Strategy Test</h1>
            <p>This page tests different browser adapters.</p>
            <div id="content">
                <p>User-Agent detection test</p>
                <script>
                    document.getElementById('content').innerHTML +=
                        '<p>Browser: ' + navigator.userAgent + '</p>';
                </script>
            </div>
        </body>
        </html>
        """

        strategies = [
            ('default', 'Default Playwright'),
            ('stealth', 'Stealth Mode'),
        ]

        # A private temporary directory replaces the fixed /tmp path: it
        # works on every platform, cannot collide with a concurrent run, and
        # is removed automatically once all strategies have been exercised.
        with tempfile.TemporaryDirectory() as scratch_dir:
            page_path = Path(scratch_dir) / "antibot_test.html"
            # Explicit encoding so the emitted file does not depend on the
            # platform's default locale.
            page_path.write_text(test_html, encoding="utf-8")
            test_url = page_path.as_uri()

            for strategy, description in strategies:
                print(f"\n🔍 Testing: {description} (strategy: {strategy})")
                print("-" * 40)

                try:
                    # Pick the adapter for this strategy.
                    if strategy == 'stealth':
                        try:
                            from crawl4ai import StealthAdapter
                            adapter = StealthAdapter()
                            print("✅ Using StealthAdapter")
                        except ImportError:
                            print("⚠️ StealthAdapter not available, using PlaywrightAdapter")
                            adapter = PlaywrightAdapter()
                    else:
                        adapter = PlaywrightAdapter()
                        print("✅ Using PlaywrightAdapter")

                    # Fresh config objects per run so nothing done to them by
                    # one crawler instance can leak into the next strategy.
                    browser_config = BrowserConfig(
                        headless=True,
                        browser_type="chromium",
                    )
                    # Use the library's CacheMode enum rather than the raw
                    # string "bypass".
                    crawler_config = CrawlerRunConfig(
                        cache_mode=CacheMode.BYPASS,
                    )

                    async with AsyncWebCrawler(
                        config=browser_config,
                        browser_adapter=adapter,
                    ) as crawler:
                        result = await crawler.arun(
                            url=test_url,
                            config=crawler_config,
                        )

                    if result.success:
                        # Guard against a successful result that carries no
                        # metadata or markdown, so reporting cannot raise.
                        metadata = result.metadata or {}
                        markdown = result.markdown or ""

                        print("✅ Crawl successful")
                        print(f"   📄 Title: {metadata.get('title', 'N/A')}")
                        print(f"   📏 Content length: {len(markdown)} chars")

                        # Check if user agent info is in content
                        if 'User-Agent' in markdown or 'Browser:' in markdown:
                            print("   🔍 User-agent info detected in content")
                        else:
                            print("   ℹ️ No user-agent info in content")
                    else:
                        print(f"❌ Crawl failed: {result.error_message}")

                except Exception as e:
                    # Best-effort by design: report and move on to the next
                    # strategy instead of aborting the whole run.
                    print(f"❌ Error testing {strategy}: {e}")
                    traceback.print_exc()

        print("\n🎉 Anti-bot strategy testing completed!")

    except Exception as e:
        # Covers setup failures such as crawl4ai not being importable.
        print(f"❌ Setup error: {e}")
        traceback.print_exc()
|
||
|
||
if __name__ == "__main__":
    # Drive the coroutine to completion when the file is executed as a script.
    asyncio.run(test_antibot_strategies())