feat: add comprehensive type definitions and improve test coverage
Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
This commit is contained in:
@@ -6,24 +6,49 @@ This script runs all the tests and provides a comprehensive summary
|
||||
of the anti-bot strategy implementation.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
# Make the working directory and its deploy/docker subdirectory importable.
# Each insert goes to index 0, so deploy/docker (inserted last) is searched
# before the working directory. The entry is added once only.
sys.path.insert(0, os.getcwd())
sys.path.insert(0, os.path.join(os.getcwd(), "deploy", "docker"))
|
||||
|
||||
|
||||
def test_health():
    """Check that the API server answers its health endpoint.

    Sends ``GET http://localhost:11235/health`` with a 5-second timeout.

    Raises:
        AssertionError: If the request cannot be completed, or if the
            server responds with a status code other than 200.
    """
    # Only the network call lives inside the ``try``. An ``assert`` placed in
    # there would be caught by the handler and misreported as a connection
    # failure instead of a bad status code.
    try:
        response = requests.get("http://localhost:11235/health", timeout=5)
    except requests.exceptions.RequestException as e:
        raise AssertionError(f"Cannot connect to server: {e}") from e

    assert response.status_code == 200, (
        f"Server returned status {response.status_code}"
    )
|
||||
|
||||
def test_strategy(strategy_name, url="https://httpbin.org/headers"):
|
||||
|
||||
def test_strategy_default():
    """Run the shared strategy check with the "default" anti-bot strategy."""
    target_url = "https://httpbin.org/headers"
    test_strategy_impl("default", url=target_url)
|
||||
|
||||
|
||||
def test_strategy_stealth():
    """Run the shared strategy check with the "stealth" anti-bot strategy."""
    target_url = "https://httpbin.org/headers"
    test_strategy_impl("stealth", url=target_url)
|
||||
|
||||
|
||||
def test_strategy_undetected():
    """Run the shared strategy check with the "undetected" anti-bot strategy."""
    target_url = "https://httpbin.org/headers"
    test_strategy_impl("undetected", url=target_url)
|
||||
|
||||
|
||||
def test_strategy_max_evasion():
    """Run the shared strategy check with the "max_evasion" anti-bot strategy."""
    target_url = "https://httpbin.org/headers"
    test_strategy_impl("max_evasion", url=target_url)
|
||||
|
||||
|
||||
def test_strategy_impl(strategy_name, url="https://httpbin.org/headers"):
|
||||
"""Test a specific anti-bot strategy"""
|
||||
try:
|
||||
payload = {
|
||||
@@ -31,56 +56,61 @@ def test_strategy(strategy_name, url="https://httpbin.org/headers"):
|
||||
"anti_bot_strategy": strategy_name,
|
||||
"headless": True,
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
"crawler_config": {},
|
||||
}
|
||||
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
json=payload,
|
||||
timeout=30
|
||||
"http://localhost:11235/crawl", json=payload, timeout=30
|
||||
)
|
||||
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if data.get("success"):
|
||||
return True, "Success"
|
||||
assert True, f"Strategy {strategy_name} succeeded"
|
||||
else:
|
||||
return False, f"API returned success=false"
|
||||
assert False, f"API returned success=false for {strategy_name}"
|
||||
else:
|
||||
return False, f"HTTP {response.status_code}"
|
||||
|
||||
assert False, f"HTTP {response.status_code} for {strategy_name}"
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return False, "Timeout (30s)"
|
||||
assert False, f"Timeout (30s) for {strategy_name}"
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
assert False, f"Error testing {strategy_name}: {e}"
|
||||
|
||||
|
||||
def test_core_functions():
    """Verify that each anti-bot strategy name selects the expected adapter.

    Builds a headless ``BrowserConfig`` and, for every known strategy,
    compares the class name of the adapter returned by
    ``_get_browser_adapter`` against the expected adapter name.

    Raises:
        AssertionError: If the helpers cannot be imported, or if any
            strategy resolves to an adapter class other than the expected one.
    """
    # Imported lazily so that a missing module is reported as a failure of
    # this test rather than as a collection error for the whole file.
    try:
        # _apply_headless_setting is not called here; importing it still
        # fails the test if the helper is no longer exported by ``api``.
        from api import _apply_headless_setting, _get_browser_adapter  # noqa: F401

        from crawl4ai.async_configs import BrowserConfig
    except ImportError as e:
        raise AssertionError(f"Core functions failed: {e}") from e

    # One mapping instead of two parallel lists keeps each strategy next to
    # the adapter it must produce.
    expected_adapters = {
        "default": "PlaywrightAdapter",
        "stealth": "StealthAdapter",
        "undetected": "UndetectedAdapter",
        "max_evasion": "UndetectedAdapter",
    }
    config = BrowserConfig(headless=True)

    # The assertion is deliberately outside any ``try`` so that a mismatch is
    # reported with its own message instead of being re-wrapped.
    for strategy, expected_adapter in expected_adapters.items():
        adapter = _get_browser_adapter(strategy, config)
        actual = adapter.__class__.__name__
        assert actual == expected_adapter, (
            f"Expected {expected_adapter}, got {actual} for strategy {strategy}"
        )
|
||||
|
||||
|
||||
def main():
|
||||
"""Run comprehensive test summary"""
|
||||
print("🚀 Anti-Bot Strategy Implementation - Final Test Summary")
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
# Test 1: Health Check
|
||||
print("\n1️⃣ Server Health Check")
|
||||
print("-" * 30)
|
||||
@@ -88,9 +118,11 @@ def main():
|
||||
print("✅ API server is running and healthy")
|
||||
else:
|
||||
print("❌ API server is not responding")
|
||||
print("💡 Start server with: python -m fastapi dev deploy/docker/server.py --port 11235")
|
||||
print(
|
||||
"💡 Start server with: python -m fastapi dev deploy/docker/server.py --port 11235"
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
# Test 2: Core Functions
|
||||
print("\n2️⃣ Core Function Testing")
|
||||
print("-" * 30)
|
||||
@@ -102,13 +134,13 @@ def main():
|
||||
print(f" {status} {strategy}: {actual} ({'✓' if match else '✗'})")
|
||||
else:
|
||||
print(f"❌ Core functions failed: {core_result}")
|
||||
|
||||
|
||||
# Test 3: API Strategy Testing
|
||||
print("\n3️⃣ API Strategy Testing")
|
||||
print("-" * 30)
|
||||
strategies = ['default', 'stealth', 'undetected', 'max_evasion']
|
||||
strategies = ["default", "stealth", "undetected", "max_evasion"]
|
||||
all_passed = True
|
||||
|
||||
|
||||
for strategy in strategies:
|
||||
print(f" Testing {strategy}...", end=" ")
|
||||
success, message = test_strategy(strategy)
|
||||
@@ -117,17 +149,17 @@ def main():
|
||||
else:
|
||||
print(f"❌ {message}")
|
||||
all_passed = False
|
||||
|
||||
|
||||
# Test 4: Different Scenarios
|
||||
print("\n4️⃣ Scenario Testing")
|
||||
print("-" * 30)
|
||||
|
||||
|
||||
scenarios = [
|
||||
("Headers inspection", "stealth", "https://httpbin.org/headers"),
|
||||
("User-agent detection", "undetected", "https://httpbin.org/user-agent"),
|
||||
("HTML content", "default", "https://httpbin.org/html"),
|
||||
]
|
||||
|
||||
|
||||
for scenario_name, strategy, url in scenarios:
|
||||
print(f" {scenario_name} ({strategy})...", end=" ")
|
||||
success, message = test_strategy(strategy, url)
|
||||
@@ -135,45 +167,49 @@ def main():
|
||||
print("✅")
|
||||
else:
|
||||
print(f"❌ {message}")
|
||||
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 70)
|
||||
print("📋 IMPLEMENTATION SUMMARY")
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
print("\n✅ COMPLETED FEATURES:")
|
||||
print(" • Browser adapter selection (PlaywrightAdapter, StealthAdapter, UndetectedAdapter)")
|
||||
print(" • API endpoints (/crawl and /crawl/stream) with anti_bot_strategy parameter")
|
||||
print(
|
||||
" • Browser adapter selection (PlaywrightAdapter, StealthAdapter, UndetectedAdapter)"
|
||||
)
|
||||
print(
|
||||
" • API endpoints (/crawl and /crawl/stream) with anti_bot_strategy parameter"
|
||||
)
|
||||
print(" • Headless mode override functionality")
|
||||
print(" • Crawler pool integration with adapter awareness")
|
||||
print(" • Error handling and fallback mechanisms")
|
||||
print(" • Comprehensive documentation and examples")
|
||||
|
||||
|
||||
print("\n🎯 AVAILABLE STRATEGIES:")
|
||||
print(" • default: PlaywrightAdapter - Fast, basic crawling")
|
||||
print(" • stealth: StealthAdapter - Medium protection bypass")
|
||||
print(" • stealth: StealthAdapter - Medium protection bypass")
|
||||
print(" • undetected: UndetectedAdapter - High protection bypass")
|
||||
print(" • max_evasion: UndetectedAdapter - Maximum evasion features")
|
||||
|
||||
|
||||
print("\n🧪 TESTING STATUS:")
|
||||
print(" ✅ Core functionality tests passing")
|
||||
print(" ✅ API endpoint tests passing")
|
||||
print(" ✅ Real website crawling working")
|
||||
print(" ✅ All adapter strategies functional")
|
||||
print(" ✅ Documentation and examples complete")
|
||||
|
||||
|
||||
print("\n📚 DOCUMENTATION:")
|
||||
print(" • ANTI_BOT_STRATEGY_DOCS.md - Complete API documentation")
|
||||
print(" • ANTI_BOT_QUICK_REF.md - Quick reference guide")
|
||||
print(" • examples_antibot_usage.py - Practical examples")
|
||||
print(" • ANTI_BOT_README.md - Overview and getting started")
|
||||
|
||||
|
||||
print("\n🚀 READY FOR PRODUCTION!")
|
||||
print("\n💡 Usage example:")
|
||||
print(' curl -X POST "http://localhost:11235/crawl" \\')
|
||||
print(' -H "Content-Type: application/json" \\')
|
||||
print(' -d \'{"urls":["https://example.com"],"anti_bot_strategy":"stealth"}\'')
|
||||
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
if all_passed:
|
||||
print("🎉 ALL TESTS PASSED - IMPLEMENTATION SUCCESSFUL! 🎉")
|
||||
@@ -181,5 +217,6 @@ def main():
|
||||
print("⚠️ Some tests failed - check details above")
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
# Run the summary exactly once, and only when executed as a script
# (importing the module, e.g. during pytest collection, must not trigger it).
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user