Files
crawl4ai/tests/test_url_discovery.py

193 lines
7.0 KiB
Python

#!/usr/bin/env python3
"""
Test script for the new /urls/discover endpoint in Crawl4AI Docker API.
"""
import asyncio
import httpx
import json
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
console = Console()
# Configuration
BASE_URL = "http://localhost:11235"
TEST_DOMAIN = "docs.crawl4ai.com"
async def check_server_health(client: httpx.AsyncClient) -> bool:
"""Check if the server is healthy."""
console.print("[bold cyan]Checking server health...[/]", end="")
try:
response = await client.get("/health", timeout=10.0)
response.raise_for_status()
console.print(" [bold green]✓ Server is healthy![/]")
return True
except Exception as e:
console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
console.print(f"Is the server running at {BASE_URL}?")
return False
def print_request(endpoint: str, payload: dict, title: str = "Request"):
"""Pretty print the request."""
syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
console.print(Panel.fit(
f"[cyan]POST {endpoint}[/cyan]\n{syntax}",
title=f"[bold blue]{title}[/]",
border_style="blue"
))
def print_response(response_data: dict, title: str = "Response"):
"""Pretty print the response."""
syntax = Syntax(json.dumps(response_data, indent=2), "json", theme="monokai")
console.print(Panel.fit(
syntax,
title=f"[bold green]{title}[/]",
border_style="green"
))
async def test_urls_discover_basic():
"""Test basic URL discovery functionality."""
console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
# Check server health first
if not await check_server_health(client):
return False
# Test 1: Basic discovery with sitemap
console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "sitemap",
"max_urls": 5
}
}
print_request("/urls/discover", payload, "Basic Discovery Request")
try:
response = await client.post("/urls/discover", json=payload)
response.raise_for_status()
response_data = response.json()
print_response(response_data, "Basic Discovery Response")
# Validate response structure
if isinstance(response_data, list):
console.print(f"[green]✓ Discovered {len(response_data)} URLs[/green]")
return True
else:
console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
return False
except httpx.HTTPStatusError as e:
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
return False
except Exception as e:
console.print(f"[red]✗ Error: {e}[/red]")
return False
async def test_urls_discover_invalid_config():
"""Test URL discovery with invalid configuration."""
console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "invalid_source", # Invalid source
"max_urls": 5
}
}
print_request("/urls/discover", payload, "Invalid Config Request")
try:
response = await client.post("/urls/discover", json=payload)
if response.status_code == 500:
console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
return True
else:
console.print(f"[yellow]? Expected 500 error, got {response.status_code}[/yellow]")
response_data = response.json()
print_response(response_data, "Unexpected Response")
return False
except Exception as e:
console.print(f"[red]✗ Unexpected error: {e}[/red]")
return False
async def test_urls_discover_with_filtering():
"""Test URL discovery with advanced filtering."""
console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "sitemap",
"pattern": "*/docs/*", # Filter to docs URLs only
"extract_head": True, # Extract metadata
"max_urls": 3
}
}
print_request("/urls/discover", payload, "Filtered Discovery Request")
try:
response = await client.post("/urls/discover", json=payload)
response.raise_for_status()
response_data = response.json()
print_response(response_data, "Filtered Discovery Response")
# Validate response structure with metadata
if isinstance(response_data, list) and len(response_data) > 0:
sample_url = response_data[0]
if "url" in sample_url:
console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
return True
else:
console.print(f"[red]✗ URL objects missing expected fields[/red]")
return False
else:
console.print(f"[yellow]? No URLs found with filter pattern[/yellow]")
return True # This could be expected
except httpx.HTTPStatusError as e:
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
return False
except Exception as e:
console.print(f"[red]✗ Error: {e}[/red]")
return False
async def main():
"""Run all tests."""
console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")
results = []
# Run tests
results.append(await test_urls_discover_basic())
results.append(await test_urls_discover_invalid_config())
results.append(await test_urls_discover_with_filtering())
# Summary
console.print("\n[bold cyan]Test Summary[/bold cyan]")
passed = sum(results)
total = len(results)
if passed == total:
console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
else:
console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")
return passed == total
if __name__ == "__main__":
asyncio.run(main())