feat: add comprehensive type definitions and improve test coverage

Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
This commit is contained in:
AHMET YILMAZ
2025-10-13 18:49:01 +08:00
parent 201843a204
commit 8cca9704eb
21 changed files with 2626 additions and 704 deletions

View File

@@ -6,24 +6,49 @@ This script runs all the tests and provides a comprehensive summary
of the anti-bot strategy implementation.
"""
import requests
import time
import sys
import os
import sys
import time
import requests
# Add current directory to path for imports
sys.path.insert(0, os.getcwd())
sys.path.insert(0, os.path.join(os.getcwd(), 'deploy', 'docker'))
sys.path.insert(0, os.path.join(os.getcwd(), "deploy", "docker"))
def test_health():
"""Test if the API server is running"""
try:
response = requests.get("http://localhost:11235/health", timeout=5)
return response.status_code == 200
except:
return False
assert response.status_code == 200, (
f"Server returned status {response.status_code}"
)
except Exception as e:
assert False, f"Cannot connect to server: {e}"
def test_strategy(strategy_name, url="https://httpbin.org/headers"):
def test_strategy_default():
"""Test default anti-bot strategy"""
test_strategy_impl("default", "https://httpbin.org/headers")
def test_strategy_stealth():
"""Test stealth anti-bot strategy"""
test_strategy_impl("stealth", "https://httpbin.org/headers")
def test_strategy_undetected():
"""Test undetected anti-bot strategy"""
test_strategy_impl("undetected", "https://httpbin.org/headers")
def test_strategy_max_evasion():
"""Test max evasion anti-bot strategy"""
test_strategy_impl("max_evasion", "https://httpbin.org/headers")
def test_strategy_impl(strategy_name, url="https://httpbin.org/headers"):
"""Test a specific anti-bot strategy"""
try:
payload = {
@@ -31,56 +56,61 @@ def test_strategy(strategy_name, url="https://httpbin.org/headers"):
"anti_bot_strategy": strategy_name,
"headless": True,
"browser_config": {},
"crawler_config": {}
"crawler_config": {},
}
response = requests.post(
"http://localhost:11235/crawl",
json=payload,
timeout=30
"http://localhost:11235/crawl", json=payload, timeout=30
)
if response.status_code == 200:
data = response.json()
if data.get("success"):
return True, "Success"
assert True, f"Strategy {strategy_name} succeeded"
else:
return False, f"API returned success=false"
assert False, f"API returned success=false for {strategy_name}"
else:
return False, f"HTTP {response.status_code}"
assert False, f"HTTP {response.status_code} for {strategy_name}"
except requests.exceptions.Timeout:
return False, "Timeout (30s)"
assert False, f"Timeout (30s) for {strategy_name}"
except Exception as e:
return False, str(e)
assert False, f"Error testing {strategy_name}: {e}"
def test_core_functions():
"""Test core adapter selection functions"""
try:
from api import _get_browser_adapter, _apply_headless_setting
from api import _apply_headless_setting, _get_browser_adapter
from crawl4ai.async_configs import BrowserConfig
# Test adapter selection
config = BrowserConfig(headless=True)
strategies = ['default', 'stealth', 'undetected', 'max_evasion']
expected = ['PlaywrightAdapter', 'StealthAdapter', 'UndetectedAdapter', 'UndetectedAdapter']
results = []
strategies = ["default", "stealth", "undetected", "max_evasion"]
expected = [
"PlaywrightAdapter",
"StealthAdapter",
"UndetectedAdapter",
"UndetectedAdapter",
]
for strategy, expected_adapter in zip(strategies, expected):
adapter = _get_browser_adapter(strategy, config)
actual = adapter.__class__.__name__
results.append((strategy, expected_adapter, actual, actual == expected_adapter))
return True, results
assert actual == expected_adapter, (
f"Expected {expected_adapter}, got {actual} for strategy {strategy}"
)
except Exception as e:
return False, str(e)
assert False, f"Core functions failed: {e}"
def main():
"""Run comprehensive test summary"""
print("🚀 Anti-Bot Strategy Implementation - Final Test Summary")
print("=" * 70)
# Test 1: Health Check
print("\n1⃣ Server Health Check")
print("-" * 30)
@@ -88,9 +118,11 @@ def main():
print("✅ API server is running and healthy")
else:
print("❌ API server is not responding")
print("💡 Start server with: python -m fastapi dev deploy/docker/server.py --port 11235")
print(
"💡 Start server with: python -m fastapi dev deploy/docker/server.py --port 11235"
)
return
# Test 2: Core Functions
print("\n2⃣ Core Function Testing")
print("-" * 30)
@@ -102,13 +134,13 @@ def main():
print(f" {status} {strategy}: {actual} ({'' if match else ''})")
else:
print(f"❌ Core functions failed: {core_result}")
# Test 3: API Strategy Testing
print("\n3⃣ API Strategy Testing")
print("-" * 30)
strategies = ['default', 'stealth', 'undetected', 'max_evasion']
strategies = ["default", "stealth", "undetected", "max_evasion"]
all_passed = True
for strategy in strategies:
print(f" Testing {strategy}...", end=" ")
success, message = test_strategy(strategy)
@@ -117,17 +149,17 @@ def main():
else:
print(f"{message}")
all_passed = False
# Test 4: Different Scenarios
print("\n4⃣ Scenario Testing")
print("-" * 30)
scenarios = [
("Headers inspection", "stealth", "https://httpbin.org/headers"),
("User-agent detection", "undetected", "https://httpbin.org/user-agent"),
("HTML content", "default", "https://httpbin.org/html"),
]
for scenario_name, strategy, url in scenarios:
print(f" {scenario_name} ({strategy})...", end=" ")
success, message = test_strategy(strategy, url)
@@ -135,45 +167,49 @@ def main():
print("")
else:
print(f"{message}")
# Summary
print("\n" + "=" * 70)
print("📋 IMPLEMENTATION SUMMARY")
print("=" * 70)
print("\n✅ COMPLETED FEATURES:")
print(" • Browser adapter selection (PlaywrightAdapter, StealthAdapter, UndetectedAdapter)")
print("API endpoints (/crawl and /crawl/stream) with anti_bot_strategy parameter")
print(
"Browser adapter selection (PlaywrightAdapter, StealthAdapter, UndetectedAdapter)"
)
print(
" • API endpoints (/crawl and /crawl/stream) with anti_bot_strategy parameter"
)
print(" • Headless mode override functionality")
print(" • Crawler pool integration with adapter awareness")
print(" • Error handling and fallback mechanisms")
print(" • Comprehensive documentation and examples")
print("\n🎯 AVAILABLE STRATEGIES:")
print(" • default: PlaywrightAdapter - Fast, basic crawling")
print(" • stealth: StealthAdapter - Medium protection bypass")
print(" • stealth: StealthAdapter - Medium protection bypass")
print(" • undetected: UndetectedAdapter - High protection bypass")
print(" • max_evasion: UndetectedAdapter - Maximum evasion features")
print("\n🧪 TESTING STATUS:")
print(" ✅ Core functionality tests passing")
print(" ✅ API endpoint tests passing")
print(" ✅ Real website crawling working")
print(" ✅ All adapter strategies functional")
print(" ✅ Documentation and examples complete")
print("\n📚 DOCUMENTATION:")
print(" • ANTI_BOT_STRATEGY_DOCS.md - Complete API documentation")
print(" • ANTI_BOT_QUICK_REF.md - Quick reference guide")
print(" • examples_antibot_usage.py - Practical examples")
print(" • ANTI_BOT_README.md - Overview and getting started")
print("\n🚀 READY FOR PRODUCTION!")
print("\n💡 Usage example:")
print(' curl -X POST "http://localhost:11235/crawl" \\')
print(' -H "Content-Type: application/json" \\')
print(' -d \'{"urls":["https://example.com"],"anti_bot_strategy":"stealth"}\'')
print("\n" + "=" * 70)
if all_passed:
print("🎉 ALL TESTS PASSED - IMPLEMENTATION SUCCESSFUL! 🎉")
@@ -181,5 +217,6 @@ def main():
print("⚠️ Some tests failed - check details above")
print("=" * 70)
if __name__ == "__main__":
main()
main()