Files
crawl4ai/tests/docker/extended_features/test_anti_bot_strategy.py
AHMET YILMAZ 201843a204 Add comprehensive tests for anti-bot strategies and extended features
- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
2025-10-07 18:51:13 +08:00

176 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Test script for the anti_bot_strategy functionality in the FastAPI server.
This script tests different browser adapter configurations.
"""
import json
import time
import requests
# Test configurations for different anti_bot_strategy values
# One crawl payload per supported anti_bot_strategy value; every payload
# hits the same echo endpoint so the responses are directly comparable.
test_configs = [
    {
        "name": f"{label} Strategy",
        "payload": {
            "urls": ["https://httpbin.org/user-agent"],
            "anti_bot_strategy": strategy,
            "headless": True,
            "browser_config": {},
            "crawler_config": {},
        },
    }
    for label, strategy in [
        ("Default", "default"),
        ("Stealth", "stealth"),
        ("Undetected", "undetected"),
        ("Max Evasion", "max_evasion"),
    ]
]
def test_api_endpoint(base_url="http://localhost:11235"):
    """Exercise the /crawl endpoint once per anti_bot_strategy configuration.

    Args:
        base_url: Root URL of the running FastAPI server.

    Returns:
        bool: True only when the server is reachable AND every strategy
        test succeeded. (Previously this returned True whenever the health
        check passed, even if every individual strategy test failed.)
    """
    print("🧪 Testing Anti-Bot Strategy API Implementation")
    print("=" * 60)
    # Fail fast if the server is down — otherwise each crawl request
    # would burn its full 30 s timeout before we noticed.
    try:
        health_response = requests.get(f"{base_url}/health", timeout=5)
        if health_response.status_code != 200:
            print("❌ Server health check failed")
            return False
        print("✅ Server is running and healthy")
    except requests.exceptions.RequestException as e:
        print(f"❌ Cannot connect to server at {base_url}: {e}")
        print(
            "💡 Make sure the FastAPI server is running: python -m fastapi dev deploy/docker/server.py --port 11235"
        )
        return False
    print()
    # Aggregate outcome so the return value reflects the whole run.
    all_passed = True
    for i, test_config in enumerate(test_configs, 1):
        print(f"Test {i}: {test_config['name']}")
        print("-" * 40)
        try:
            # Make request to crawl endpoint
            response = requests.post(
                f"{base_url}/crawl",
                json=test_config["payload"],
                headers={"Content-Type": "application/json"},
                timeout=30,
            )
            if response.status_code == 200:
                result = response.json()
                # Check if crawl was successful
                if result.get("results") and len(result["results"]) > 0:
                    first_result = result["results"][0]
                    if first_result.get("success"):
                        print(f"{test_config['name']} - SUCCESS")
                        # "markdown" may be a dict (newer servers: text under
                        # "raw_markdown") or a plain string (older servers).
                        markdown_content = first_result.get("markdown", {})
                        if isinstance(markdown_content, dict):
                            markdown_text = markdown_content.get("raw_markdown", "")
                        else:
                            markdown_text = markdown_content or ""
                        if "user-agent" in markdown_text.lower():
                            print(" 🕷️ User agent info found in response")
                        print(
                            f" 📄 Markdown length: {len(markdown_text)} characters"
                        )
                    else:
                        error_msg = first_result.get("error_message", "Unknown error")
                        print(f"{test_config['name']} - FAILED: {error_msg}")
                        all_passed = False
                else:
                    print(f"{test_config['name']} - No results returned")
                    all_passed = False
            else:
                print(f"{test_config['name']} - HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}...")
                all_passed = False
        except requests.exceptions.Timeout:
            print(f"{test_config['name']} - TIMEOUT (30s)")
            all_passed = False
        except requests.exceptions.RequestException as e:
            print(f"{test_config['name']} - REQUEST ERROR: {e}")
            all_passed = False
        except Exception as e:
            print(f"{test_config['name']} - UNEXPECTED ERROR: {e}")
            all_passed = False
        print()
        # Brief pause between requests so we don't hammer the server
        time.sleep(1)
    print("🏁 Testing completed!")
    return all_passed
def test_schema_validation():
    """Demonstrate that a payload using the new schema fields is well formed."""
    print("📋 Testing Schema Validation")
    print("-" * 30)
    # The nested browser_config deliberately disagrees with the top-level
    # headless flag to illustrate which one takes precedence.
    browser_cfg = {
        "headless": True  # This should be overridden by the top-level headless
    }
    test_payload = {
        "urls": ["https://httpbin.org/headers"],
        "anti_bot_strategy": "stealth",
        "headless": False,
        "browser_config": browser_cfg,
        "crawler_config": {},
    }
    lines = [
        "✅ Schema validation: anti_bot_strategy and headless fields are properly defined",
        f"✅ Test payload: {json.dumps(test_payload, indent=2)}",
        "",
    ]
    print("\n".join(lines))
if __name__ == "__main__":
    # Run the offline schema demo first, then the live API tests.
    header = "🚀 Crawl4AI Anti-Bot Strategy Test Suite"
    print(header)
    print("=" * 50)
    print()
    test_schema_validation()
    test_api_endpoint()