Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
160 lines
5.2 KiB
Python
160 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script for the new URL discovery functionality.
|
|
This tests the handler function directly without running the full server.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Add the repo to Python path.
# The directory containing this script is treated as the repository root.
repo_root = Path(__file__).parent
# Each entry is inserted at the front, so after both calls the docker
# directory is searched first, followed by the repository root.
sys.path.insert(0, str(repo_root))
sys.path.insert(0, str(repo_root / "deploy" / "docker"))
|
|
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.syntax import Syntax
|
|
|
|
# Shared rich console; every test function in this file prints through it.
console = Console()
|
|
|
|
async def test_url_discovery_handler():
    """Test the URL discovery handler function directly.

    Awaits ``handle_url_discovery`` with a small sitemap-based seeding
    configuration for ``docs.crawl4ai.com`` and prints the type and length of
    the result plus up to two sample entries.

    Returns:
        bool: True if the handler call completed and its result could be
        inspected; False if an import failed or any other exception was
        raised along the way.
    """
    try:
        # Import the handler function and dependencies. Done inside the
        # function so a missing module is reported through the ImportError
        # branch below instead of aborting the script at import time.
        from api import handle_url_discovery
        # Not referenced below; the import itself acts as an availability
        # check for the crawl4ai package.
        from crawl4ai.async_configs import SeedingConfig  # noqa: F401

        console.print("[bold cyan]Testing URL Discovery Handler Function[/bold cyan]")

        # Test 1: Basic functionality
        console.print("\n[cyan]Test 1: Basic URL discovery[/cyan]")

        domain = "docs.crawl4ai.com"
        seeding_config = {
            "source": "sitemap",
            "max_urls": 3,
            "verbose": True,
        }

        console.print(f"[blue]Domain:[/blue] {domain}")
        console.print(f"[blue]Config:[/blue] {seeding_config}")

        # Call the handler directly
        result = await handle_url_discovery(domain, seeding_config)

        console.print("[green]✓ Handler executed successfully[/green]")
        console.print(f"[green]✓ Result type: {type(result)}[/green]")
        # len() raising here (e.g. on a None result) is reported as a
        # handler error by the generic except branch.
        console.print(f"[green]✓ Result length: {len(result)}[/green]")

        # Print first few results if any
        if result:
            console.print("\n[blue]Sample results:[/blue]")
            for position, url_obj in enumerate(result[:2], start=1):
                console.print(f" {position}. {url_obj}")

        return True

    except ImportError as e:
        console.print(f"[red]✗ Import error: {e}[/red]")
        console.print("[yellow]This suggests missing dependencies or module structure issues[/yellow]")
        return False
    except Exception as e:
        # Top-level boundary of this test: report the failure, do not crash.
        console.print(f"[red]✗ Handler error: {e}[/red]")
        return False
|
|
|
|
async def test_seeding_config_validation():
    """Test SeedingConfig validation.

    Constructs a ``SeedingConfig`` from a valid dict and prints two of its
    fields, then probes the constructor with ``source="invalid_source"`` and
    reports whether it was rejected or accepted.

    Returns:
        bool: True when the valid configuration was constructed and the probe
        was reported (acceptance of the invalid config is only printed as a
        warning and does not change the result); False if any exception
        escaped outside the probe.
    """
    try:
        from crawl4ai.async_configs import SeedingConfig

        console.print("\n[cyan]Test 2: SeedingConfig validation[/cyan]")

        # Test valid config
        valid_config = {
            "source": "sitemap",
            "max_urls": 5,
            "pattern": "*",
        }
        config = SeedingConfig(**valid_config)
        console.print(f"[green]✓ Valid config created: {config.source}, max_urls={config.max_urls}[/green]")

        # Test invalid config
        invalid_config = {
            "source": "invalid_source",
            "max_urls": 5,
        }
        try:
            SeedingConfig(**invalid_config)
        except Exception as e:
            console.print(f"[green]✓ Invalid config correctly rejected: {str(e)[:50]}...[/green]")
        else:
            # Kept in the else clause so that only a constructor failure can
            # be reported as "rejected"; an error while printing this warning
            # falls through to the outer handler instead.
            console.print("[yellow]? Invalid config unexpectedly accepted[/yellow]")

        return True

    except Exception as e:
        # Top-level boundary of this test: report the failure, do not crash.
        console.print(f"[red]✗ SeedingConfig test error: {e}[/red]")
        return False
|
|
|
|
async def test_schema_validation():
    """Test the URLDiscoveryRequest schema.

    Builds one ``URLDiscoveryRequest`` with an explicit ``seeding_config`` and
    one with only ``domain`` set, printing a confirmation after each.

    Returns:
        bool: True if both requests were constructed without raising; False
        if the import or either construction raised.
    """
    try:
        from schemas import URLDiscoveryRequest

        console.print("\n[cyan]Test 3: URLDiscoveryRequest schema validation[/cyan]")

        # Test valid request
        valid_request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 10,
            },
        }
        request = URLDiscoveryRequest(**valid_request_data)
        console.print(f"[green]✓ Valid request created: domain={request.domain}[/green]")

        # Test request with default config: only the domain is supplied, so
        # construction succeeds only if seeding_config is optional.
        minimal_request_data = {
            "domain": "example.com",
        }
        URLDiscoveryRequest(**minimal_request_data)
        console.print("[green]✓ Minimal request created with defaults[/green]")

        return True

    except Exception as e:
        # Top-level boundary of this test: report the failure, do not crash.
        console.print(f"[red]✗ Schema test error: {e}[/red]")
        return False
|
|
|
|
async def main():
    """Run all tests.

    Awaits the three test coroutines one after another, then prints a
    summary of how many returned True.

    Returns:
        bool: True if every test returned True, otherwise False.
    """
    console.print("[bold blue]🔍 URL Discovery Implementation Tests[/bold blue]")

    # Test the implementation components. The awaits are evaluated in list
    # order, so the tests run sequentially: config, schema, then handler.
    results = [
        await test_seeding_config_validation(),
        await test_schema_validation(),
        await test_url_discovery_handler(),
    ]

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    passed = sum(results)  # each True counts as 1
    total = len(results)
    all_passed = passed == total

    if all_passed:
        console.print(f"[bold green]✓ All {total} implementation tests passed![/bold green]")
        console.print("[green]The URL discovery endpoint is ready for integration testing[/green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return all_passed
|
|
|
|
if __name__ == "__main__":
    # main() returns True only when every test passed. Propagate that as the
    # process exit status (0 = success, 1 = failure) so shells and CI jobs
    # can detect a failed run instead of always seeing exit code 0.
    sys.exit(0 if asyncio.run(main()) else 1)