feat: Add comprehensive tests for URL discovery and virtual scroll functionality
This commit is contained in:
193
tests/test_url_discovery.py
Normal file
193
tests/test_url_discovery.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the new /urls/discover endpoint in Crawl4AI Docker API.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import httpx
|
||||
import json
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.syntax import Syntax
|
||||
|
||||
console = Console()
|
||||
|
||||
# Configuration
|
||||
BASE_URL = "http://localhost:11235"
|
||||
TEST_DOMAIN = "docs.crawl4ai.com"
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient) -> bool:
|
||||
"""Check if the server is healthy."""
|
||||
console.print("[bold cyan]Checking server health...[/]", end="")
|
||||
try:
|
||||
response = await client.get("/health", timeout=10.0)
|
||||
response.raise_for_status()
|
||||
console.print(" [bold green]✓ Server is healthy![/]")
|
||||
return True
|
||||
except Exception as e:
|
||||
console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
|
||||
console.print(f"Is the server running at {BASE_URL}?")
|
||||
return False
|
||||
|
||||
def print_request(endpoint: str, payload: dict, title: str = "Request"):
|
||||
"""Pretty print the request."""
|
||||
syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
|
||||
console.print(Panel.fit(
|
||||
f"[cyan]POST {endpoint}[/cyan]\n{syntax}",
|
||||
title=f"[bold blue]{title}[/]",
|
||||
border_style="blue"
|
||||
))
|
||||
|
||||
def print_response(response_data: dict, title: str = "Response"):
|
||||
"""Pretty print the response."""
|
||||
syntax = Syntax(json.dumps(response_data, indent=2), "json", theme="monokai")
|
||||
console.print(Panel.fit(
|
||||
syntax,
|
||||
title=f"[bold green]{title}[/]",
|
||||
border_style="green"
|
||||
))
|
||||
|
||||
async def test_urls_discover_basic():
|
||||
"""Test basic URL discovery functionality."""
|
||||
console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")
|
||||
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
|
||||
# Check server health first
|
||||
if not await check_server_health(client):
|
||||
return False
|
||||
|
||||
# Test 1: Basic discovery with sitemap
|
||||
console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")
|
||||
|
||||
payload = {
|
||||
"domain": TEST_DOMAIN,
|
||||
"seeding_config": {
|
||||
"source": "sitemap",
|
||||
"max_urls": 5
|
||||
}
|
||||
}
|
||||
|
||||
print_request("/urls/discover", payload, "Basic Discovery Request")
|
||||
|
||||
try:
|
||||
response = await client.post("/urls/discover", json=payload)
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
|
||||
print_response(response_data, "Basic Discovery Response")
|
||||
|
||||
# Validate response structure
|
||||
if isinstance(response_data, list):
|
||||
console.print(f"[green]✓ Discovered {len(response_data)} URLs[/green]")
|
||||
return True
|
||||
else:
|
||||
console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
|
||||
return False
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
|
||||
return False
|
||||
except Exception as e:
|
||||
console.print(f"[red]✗ Error: {e}[/red]")
|
||||
return False
|
||||
|
||||
async def test_urls_discover_invalid_config():
|
||||
"""Test URL discovery with invalid configuration."""
|
||||
console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")
|
||||
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
|
||||
payload = {
|
||||
"domain": TEST_DOMAIN,
|
||||
"seeding_config": {
|
||||
"source": "invalid_source", # Invalid source
|
||||
"max_urls": 5
|
||||
}
|
||||
}
|
||||
|
||||
print_request("/urls/discover", payload, "Invalid Config Request")
|
||||
|
||||
try:
|
||||
response = await client.post("/urls/discover", json=payload)
|
||||
|
||||
if response.status_code == 500:
|
||||
console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
|
||||
return True
|
||||
else:
|
||||
console.print(f"[yellow]? Expected 500 error, got {response.status_code}[/yellow]")
|
||||
response_data = response.json()
|
||||
print_response(response_data, "Unexpected Response")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]✗ Unexpected error: {e}[/red]")
|
||||
return False
|
||||
|
||||
async def test_urls_discover_with_filtering():
|
||||
"""Test URL discovery with advanced filtering."""
|
||||
console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")
|
||||
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
|
||||
payload = {
|
||||
"domain": TEST_DOMAIN,
|
||||
"seeding_config": {
|
||||
"source": "sitemap",
|
||||
"pattern": "*/docs/*", # Filter to docs URLs only
|
||||
"extract_head": True, # Extract metadata
|
||||
"max_urls": 3
|
||||
}
|
||||
}
|
||||
|
||||
print_request("/urls/discover", payload, "Filtered Discovery Request")
|
||||
|
||||
try:
|
||||
response = await client.post("/urls/discover", json=payload)
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
|
||||
print_response(response_data, "Filtered Discovery Response")
|
||||
|
||||
# Validate response structure with metadata
|
||||
if isinstance(response_data, list) and len(response_data) > 0:
|
||||
sample_url = response_data[0]
|
||||
if "url" in sample_url:
|
||||
console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
|
||||
return True
|
||||
else:
|
||||
console.print(f"[red]✗ URL objects missing expected fields[/red]")
|
||||
return False
|
||||
else:
|
||||
console.print(f"[yellow]? No URLs found with filter pattern[/yellow]")
|
||||
return True # This could be expected
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
|
||||
return False
|
||||
except Exception as e:
|
||||
console.print(f"[red]✗ Error: {e}[/red]")
|
||||
return False
|
||||
|
||||
async def main():
|
||||
"""Run all tests."""
|
||||
console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")
|
||||
|
||||
results = []
|
||||
|
||||
# Run tests
|
||||
results.append(await test_urls_discover_basic())
|
||||
results.append(await test_urls_discover_invalid_config())
|
||||
results.append(await test_urls_discover_with_filtering())
|
||||
|
||||
# Summary
|
||||
console.print("\n[bold cyan]Test Summary[/bold cyan]")
|
||||
passed = sum(results)
|
||||
total = len(results)
|
||||
|
||||
if passed == total:
|
||||
console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
|
||||
else:
|
||||
console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")
|
||||
|
||||
return passed == total
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user