Files
crawl4ai/tests/docker/extended_features/test_anti_bot_strategy.py
AHMET YILMAZ 8cca9704eb feat: add comprehensive type definitions and improve test coverage
Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
2025-10-13 18:49:01 +08:00

173 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Test script for the anti_bot_strategy functionality in the FastAPI server.
This script tests different browser adapter configurations.
"""
import json
import time
import requests
# Request payloads for each anti_bot_strategy value under test.
# Every entry targets the same URL and differs only in its display name and
# in the strategy string placed in the payload.
_STRATEGY_CASES = (
    ("Default Strategy", "default"),
    ("Stealth Strategy", "stealth"),
    ("Undetected Strategy", "undetected"),
    ("Max Evasion Strategy", "max_evasion"),
)
test_configs = [
    {
        "name": label,
        "payload": {
            "urls": ["https://httpbin.org/user-agent"],
            "anti_bot_strategy": strategy,
            "headless": True,
            # Fresh empty dicts per entry so no two payloads share an object.
            "browser_config": {},
            "crawler_config": {},
        },
    }
    for label, strategy in _STRATEGY_CASES
]
# Seconds to wait for a single /crawl request; also shown in the timeout message.
_CRAWL_TIMEOUT_SECONDS = 30


def _extract_markdown_text(crawl_result):
    """Return the markdown of one crawl result as a string (never None)."""
    markdown_content = crawl_result.get("markdown", {})
    if isinstance(markdown_content, dict):
        # Dict form: the text lives under "raw_markdown"; a None value is
        # treated as empty so the caller can safely call str methods on it.
        return markdown_content.get("raw_markdown", "") or ""
    # Otherwise the field is used as the text itself.
    return markdown_content or ""


def _crawl_with_config(base_url, test_config):
    """POST one configuration to /crawl, print the outcome, return True on success."""
    name = test_config["name"]
    try:
        response = requests.post(
            f"{base_url}/crawl",
            json=test_config["payload"],
            headers={"Content-Type": "application/json"},
            timeout=_CRAWL_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            print(f"{name} - HTTP {response.status_code}")
            print(f" Response: {response.text[:200]}...")
            return False

        results = response.json().get("results")
        if not results:
            print(f"{name} - No results returned")
            return False

        first_result = results[0]
        if not first_result.get("success"):
            error_msg = first_result.get("error_message", "Unknown error")
            print(f"{name} - FAILED: {error_msg}")
            return False

        print(f"{name} - SUCCESS")
        markdown_text = _extract_markdown_text(first_result)
        if "user-agent" in markdown_text.lower():
            print(" 🕷️ User agent info found in response")
        print(f" 📄 Markdown length: {len(markdown_text)} characters")
        return True
    except requests.exceptions.Timeout:
        # Must precede RequestException: Timeout is one of its subclasses.
        print(f"{name} - TIMEOUT ({_CRAWL_TIMEOUT_SECONDS}s)")
    except requests.exceptions.RequestException as e:
        print(f"{name} - REQUEST ERROR: {e}")
    except Exception as e:
        # Deliberate catch-all: one malformed response must not abort the
        # remaining configurations; the failure is reported and counted.
        print(f"{name} - UNEXPECTED ERROR: {e}")
    return False


def test_api_endpoint(base_url="http://localhost:11235"):
    """Test the crawl endpoint with different anti_bot_strategy values.

    Checks ``{base_url}/health`` first, then posts every entry of the
    module-level ``test_configs`` list to ``{base_url}/crawl`` and prints a
    per-configuration outcome, pausing one second after each request.

    Args:
        base_url: Root URL of the server under test.

    Returns:
        False if the health check fails or any configuration does not
        succeed; True only when every configuration succeeds.
    """
    print("🧪 Testing Anti-Bot Strategy API Implementation")
    print("=" * 60)
    # Check if server is running
    try:
        health_response = requests.get(f"{base_url}/health", timeout=5)
        if health_response.status_code != 200:
            print("❌ Server health check failed")
            return False
        print("✅ Server is running and healthy")
    except requests.exceptions.RequestException as e:
        print(f"❌ Cannot connect to server at {base_url}: {e}")
        print(
            "💡 Make sure the FastAPI server is running: python -m fastapi dev deploy/docker/server.py --port 11235"
        )
        return False
    print()
    # Test each configuration; keep going after a failure so every strategy
    # is reported, but remember that something failed.
    all_passed = True
    for i, test_config in enumerate(test_configs, 1):
        print(f"Test {i}: {test_config['name']}")
        print("-" * 40)
        if not _crawl_with_config(base_url, test_config):
            all_passed = False
        print()
        # Brief pause between requests
        time.sleep(1)
    print("🏁 Testing completed!")
    return all_passed
def test_schema_validation():
    """Print a sample payload that uses the new schema fields.

    No request is sent: the function only builds a payload containing
    ``anti_bot_strategy`` and ``headless`` and prints it as indented JSON
    after a fixed confirmation line.
    """
    print("📋 Testing Schema Validation")
    print("-" * 30)

    # Nested flag deliberately disagrees with the top-level one.
    # NOTE(review): the top-level "headless" is expected to win on the server
    # side; nothing in this function verifies that.
    browser_overrides = {"headless": True}
    sample_request = dict(
        urls=["https://httpbin.org/headers"],
        anti_bot_strategy="stealth",
        headless=False,
        browser_config=browser_overrides,
        crawler_config={},
    )
    rendered = json.dumps(sample_request, indent=2)

    print(
        "✅ Schema validation: anti_bot_strategy and headless fields are properly defined"
    )
    print(f"✅ Test payload: {rendered}")
    print()
if __name__ == "__main__":
    import sys

    print("🚀 Crawl4AI Anti-Bot Strategy Test Suite")
    print("=" * 50)
    print()
    # Test schema first
    test_schema_validation()
    # Test API functionality. The endpoint test reports False when it could
    # not complete successfully; surface that as a non-zero exit status so
    # shells and CI jobs can detect the failure instead of always seeing 0.
    if test_api_endpoint() is False:
        sys.exit(1)