193 lines
7.0 KiB
Python
193 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script for the new /urls/discover endpoint in Crawl4AI Docker API.
|
|
"""
|
|
|
|
import asyncio
|
|
import httpx
|
|
import json
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.syntax import Syntax
|
|
|
|
# Shared Rich console used for all output produced by this script.
console: Console = Console()

# Configuration
# Base URL of the API server the tests talk to.
BASE_URL: str = "http://localhost:11235"
# Domain submitted to the /urls/discover endpoint by every test.
TEST_DOMAIN: str = "docs.crawl4ai.com"
|
|
|
|
async def check_server_health(client: httpx.AsyncClient) -> bool:
    """Probe the server's ``/health`` route and report the outcome.

    Args:
        client: HTTP client used to issue the GET request.

    Returns:
        True when the request succeeds with a non-error status; False when
        any exception is raised (the failure is printed to the console).
    """
    # No trailing newline so the result is appended to the same line.
    console.print("[bold cyan]Checking server health...[/]", end="")
    try:
        health_reply = await client.get("/health", timeout=10.0)
        health_reply.raise_for_status()
        console.print(" [bold green]✓ Server is healthy![/]")
    except Exception as err:
        console.print(f"\n[bold red]✗ Server health check failed: {err}[/]")
        console.print(f"Is the server running at {BASE_URL}?")
        return False
    return True
|
|
|
|
def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Pretty print an outgoing POST request inside a blue panel.

    Args:
        endpoint: Path of the endpoint being called (shown after ``POST``).
        payload: JSON-serialisable request body.
        title: Panel title.
    """
    # Imported locally because the module only imports Console from
    # rich.console.
    from rich.console import Group

    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    # The Syntax renderable must be handed to Rich as an object. Formatting it
    # into an f-string would only embed its default object repr, not the
    # highlighted JSON. Group stacks the header line above the payload.
    body = Group(f"[cyan]POST {endpoint}[/cyan]", syntax)
    console.print(Panel.fit(
        body,
        title=f"[bold blue]{title}[/]",
        border_style="blue",
    ))
|
|
|
|
def print_response(response_data: dict, title: str = "Response"):
    """Render response data as highlighted JSON inside a green panel.

    Args:
        response_data: JSON-serialisable data to display.
        title: Panel title.
    """
    rendered_json = json.dumps(response_data, indent=2)
    highlighted = Syntax(rendered_json, "json", theme="monokai")
    panel = Panel.fit(
        highlighted,
        title=f"[bold green]{title}[/]",
        border_style="green",
    )
    console.print(panel)
|
|
|
|
async def test_urls_discover_basic():
    """Exercise ``/urls/discover`` with a minimal sitemap-based request.

    Returns:
        True if the server is healthy and the endpoint answers with a JSON
        list; False on a failed health check, an HTTP error status, any other
        exception, or a non-list response body.
    """
    console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        # Bail out early when the server is not reachable.
        server_ok = await check_server_health(client)
        if not server_ok:
            return False

        # Test 1: Basic discovery with sitemap
        console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")

        seeding_config = {"source": "sitemap", "max_urls": 5}
        payload = {"domain": TEST_DOMAIN, "seeding_config": seeding_config}

        print_request("/urls/discover", payload, "Basic Discovery Request")

        try:
            reply = await client.post("/urls/discover", json=payload)
            reply.raise_for_status()
            discovered = reply.json()

            print_response(discovered, "Basic Discovery Response")

            # Validate response structure
            if not isinstance(discovered, list):
                console.print(f"[red]✗ Expected list, got {type(discovered)}[/red]")
                return False

            console.print(f"[green]✓ Discovered {len(discovered)} URLs[/green]")
            return True

        except httpx.HTTPStatusError as http_err:
            console.print(f"[red]✗ HTTP Error: {http_err.response.status_code} - {http_err.response.text}[/red]")
            return False
        except Exception as err:
            console.print(f"[red]✗ Error: {err}[/red]")
            return False
|
|
|
|
async def test_urls_discover_invalid_config():
    """Send a request with an unknown seeding source and expect HTTP 500.

    Returns:
        True only when the server answers with status code 500; False for any
        other status code (the body is printed) or when an exception occurs.
    """
    console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        bad_config = {
            "source": "invalid_source",  # Invalid source
            "max_urls": 5,
        }
        payload = {"domain": TEST_DOMAIN, "seeding_config": bad_config}

        print_request("/urls/discover", payload, "Invalid Config Request")

        try:
            reply = await client.post("/urls/discover", json=payload)

            if reply.status_code != 500:
                console.print(f"[yellow]? Expected 500 error, got {reply.status_code}[/yellow]")
                unexpected_body = reply.json()
                print_response(unexpected_body, "Unexpected Response")
                return False

            console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
            return True

        except Exception as err:
            console.print(f"[red]✗ Unexpected error: {err}[/red]")
            return False
|
|
|
|
async def test_urls_discover_with_filtering():
    """Test URL discovery with a URL pattern filter and head extraction.

    Returns:
        True when the endpoint returns a list that is either empty (the
        filter may legitimately match nothing) or whose first entry is a
        dict containing a ``"url"`` key. False on an HTTP error status, any
        other exception, a non-list response body, or a first entry without
        a ``"url"`` key.
    """
    console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "sitemap",
                "pattern": "*/docs/*",  # Filter to docs URLs only
                "extract_head": True,  # Extract metadata
                "max_urls": 3,
            },
        }

        print_request("/urls/discover", payload, "Filtered Discovery Request")

        try:
            response = await client.post("/urls/discover", json=payload)
            response.raise_for_status()
            response_data = response.json()

            print_response(response_data, "Filtered Discovery Response")

            # A non-list body (e.g. an error object) is a failure, matching
            # the check in the basic discovery test. It must not be mistaken
            # for "no URLs matched".
            if not isinstance(response_data, list):
                console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
                return False

            if not response_data:
                console.print("[yellow]? No URLs found with filter pattern[/yellow]")
                return True  # This could be expected

            # Validate response structure with metadata. Only the first entry
            # is sampled; requiring a dict avoids substring/element matching
            # when the entry is a str or list.
            sample_url = response_data[0]
            if isinstance(sample_url, dict) and "url" in sample_url:
                console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
                return True

            console.print("[red]✗ URL objects missing expected fields[/red]")
            return False

        except httpx.HTTPStatusError as e:
            console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
            return False
        except Exception as e:
            console.print(f"[red]✗ Error: {e}[/red]")
            return False
|
|
|
|
async def main():
    """Run every discovery test in order and print a summary.

    Returns:
        True when all tests returned a truthy result, otherwise False.
    """
    console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")

    # Run tests sequentially, in the listed order.
    test_cases = (
        test_urls_discover_basic,
        test_urls_discover_invalid_config,
        test_urls_discover_with_filtering,
    )
    outcomes = []
    for test_case in test_cases:
        outcomes.append(await test_case())

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    total = len(outcomes)
    passed = sum(outcomes)
    all_passed = passed == total

    if all_passed:
        console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return all_passed
|
|
|
|
if __name__ == "__main__":
    # main() returns the overall pass/fail flag; turn it into the process
    # exit status so shells and CI can detect failing tests (0 = all passed,
    # 1 = at least one failure).
    raise SystemExit(0 if asyncio.run(main()) else 1)