feat: add comprehensive type definitions and improve test coverage

Add a new type definitions file with extensive Union type aliases for all core components, including AsyncUrlSeeder, SeedingConfig, and the various crawler strategies. Enhance test coverage with improved bot-detection tests, Docker-based testing, and validation of extended features. These changes provide better type safety and a more robust testing infrastructure for the crawling framework.
This commit is contained in:
AHMET YILMAZ
2025-10-13 18:49:01 +08:00
parent 201843a204
commit 8cca9704eb
21 changed files with 2626 additions and 704 deletions

View File

@@ -2,22 +2,27 @@
"""
Simple test of anti-bot strategy functionality
"""
import asyncio
import sys
import os
import sys
import pytest
# Add the project root to Python path
sys.path.insert(0, os.getcwd())
@pytest.mark.asyncio
async def test_antibot_strategies():
"""Test different anti-bot strategies"""
print("🧪 Testing Anti-Bot Strategies with AsyncWebCrawler")
print("=" * 60)
try:
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.browser_adapter import PlaywrightAdapter
# Test HTML content
test_html = """
<html>
@@ -35,81 +40,81 @@ async def test_antibot_strategies():
</body>
</html>
"""
# Save test HTML
with open('/tmp/antibot_test.html', 'w') as f:
with open("/tmp/antibot_test.html", "w") as f:
f.write(test_html)
test_url = 'file:///tmp/antibot_test.html'
test_url = "file:///tmp/antibot_test.html"
strategies = [
('default', 'Default Playwright'),
('stealth', 'Stealth Mode'),
("default", "Default Playwright"),
("stealth", "Stealth Mode"),
]
for strategy, description in strategies:
print(f"\n🔍 Testing: {description} (strategy: {strategy})")
print("-" * 40)
try:
# Import adapter based on strategy
if strategy == 'stealth':
if strategy == "stealth":
try:
from crawl4ai import StealthAdapter
adapter = StealthAdapter()
print(f"✅ Using StealthAdapter")
except ImportError:
print(f"⚠️ StealthAdapter not available, using PlaywrightAdapter")
print(
f"⚠️ StealthAdapter not available, using PlaywrightAdapter"
)
adapter = PlaywrightAdapter()
else:
adapter = PlaywrightAdapter()
print(f"✅ Using PlaywrightAdapter")
# Configure browser
browser_config = BrowserConfig(
headless=True,
browser_type="chromium"
)
browser_config = BrowserConfig(headless=True, browser_type="chromium")
# Configure crawler
crawler_config = CrawlerRunConfig(
cache_mode="bypass"
)
crawler_config = CrawlerRunConfig(cache_mode="bypass")
# Run crawler
async with AsyncWebCrawler(
config=browser_config,
browser_adapter=adapter
config=browser_config, browser_adapter=adapter
) as crawler:
result = await crawler.arun(
url=test_url,
config=crawler_config
)
result = await crawler.arun(url=test_url, config=crawler_config)
if result.success:
print(f"✅ Crawl successful")
print(f" 📄 Title: {result.metadata.get('title', 'N/A')}")
print(f" 📏 Content length: {len(result.markdown)} chars")
# Check if user agent info is in content
if 'User-Agent' in result.markdown or 'Browser:' in result.markdown:
if (
"User-Agent" in result.markdown
or "Browser:" in result.markdown
):
print(f" 🔍 User-agent info detected in content")
else:
print(f" No user-agent info in content")
else:
print(f"❌ Crawl failed: {result.error_message}")
except Exception as e:
print(f"❌ Error testing {strategy}: {e}")
import traceback
traceback.print_exc()
print(f"\n🎉 Anti-bot strategy testing completed!")
except Exception as e:
print(f"❌ Setup error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_antibot_strategies())
asyncio.run(test_antibot_strategies())