feat: Add comprehensive tests for URL discovery and virtual scroll functionality
239
tests/example_url_discovery.py
Normal file
@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.

This script demonstrates how to use the new URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""

import asyncio
import json

import httpx

# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"


async def discover_urls_basic_example():
    """Basic example of URL discovery."""
    print("🔍 Basic URL Discovery Example")
    print("=" * 50)

    # Basic discovery request
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",  # Use the sitemap for fast discovery
            "max_urls": 10        # Limit to 10 URLs
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=30.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} URLs")

            # Display the first few URLs
            for i, url_obj in enumerate(urls[:3]):
                print(f"  {i + 1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


async def discover_urls_advanced_example():
    """Advanced example with filtering and metadata extraction."""
    print("\n🎯 Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",   # Filter to news articles only
            "extract_head": True,    # Extract page metadata
            "max_urls": 5,
            "live_check": True,      # Verify URLs are accessible
            "verbose": True
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n  {i + 1}. URL: {url_obj.get('url', 'N/A')}")
                print(f"     Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    description = head_data.get('description', 'No description')
                    print(f"     Title: {title[:60]}...")
                    print(f"     Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


async def discover_urls_with_scoring_example():
    """Example using BM25 relevance scoring."""
    print("\n🏆 URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,          # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,        # Minimum relevance score
            "max_urls": 5
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} relevant URLs")

            # Display URLs sorted by relevance score
            for i, url_obj in enumerate(urls[:3]):
                score = url_obj.get('score', 0)
                print(f"\n  {i + 1}. Score: {score:.3f}")
                print(f"     URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    print(f"     Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


def demonstrate_request_schema():
    """Show the complete request schema with all options."""
    print("\n📋 Complete Request Schema")
    print("=" * 50)

    complete_schema = {
        "domain": "example.com",      # Required: domain to discover URLs from
        "seeding_config": {           # Optional: configuration object
            # Discovery sources
            "source": "sitemap+cc",   # "sitemap", "cc", or "sitemap+cc"

            # Filtering options
            "pattern": "*/blog/*",         # URL pattern filter (glob style)
            "max_urls": 50,                # Maximum URLs to return (-1 = no limit)
            "filter_nonsense_urls": True,  # Filter out nonsense URLs

            # Metadata and validation
            "extract_head": True,     # Extract <head> metadata
            "live_check": True,       # Verify URL accessibility

            # Performance and rate limiting
            "concurrency": 100,       # Concurrent requests
            "hits_per_sec": 10,       # Rate limit (requests/second)
            "force": False,           # Bypass cache

            # Relevance scoring (requires extract_head=True)
            "query": "search terms",  # Query for BM25 scoring
            "scoring_method": "bm25", # Scoring algorithm
            "score_threshold": 0.2,   # Minimum score threshold

            # Debugging
            "verbose": True           # Enable verbose logging
        }
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))


async def main():
    """Run all examples."""
    print("🚀 URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Check that the server is running
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✅ Server is running\n")
        except Exception as e:
            print(f"❌ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print("  docker compose up crawl4ai -d")
            return

    # Run the examples
    await discover_urls_basic_example()
    await discover_urls_advanced_example()
    await discover_urls_with_scoring_example()

    # Show the schema
    demonstrate_request_schema()

    print("\n🎉 Examples complete!")
    print("\nNext steps:")
    print("1. Use discovered URLs with the /crawl endpoint")
    print("2. Filter URLs based on your specific needs")
    print("3. Combine with other API endpoints for complete workflows")


if __name__ == "__main__":
    asyncio.run(main())
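The example's "Next steps" point discovered URLs at the /crawl endpoint. A minimal hand-off sketch, assuming the /crawl payload shape used by the virtual-scroll tests later in this commit; the /crawl response schema is an assumption and is not parsed here.

# Hedged sketch (not part of the committed file): pipe discovered URLs into
# /crawl. The payload shape mirrors tests/test_virtual_scroll_api.py below.
import asyncio
import httpx

BASE_URL = "http://localhost:11235"

async def crawl_discovered_urls(urls: list) -> None:
    payload = {
        "urls": [u["url"] for u in urls[:3]],  # crawl a small sample
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"},
        },
    }
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(f"{BASE_URL}/crawl", json=payload)
        response.raise_for_status()
        print(f"Crawled {len(payload['urls'])} URLs: HTTP {response.status_code}")

# Usage:
#   urls = asyncio.run(discover_urls_basic_example())
#   asyncio.run(crawl_discovered_urls(urls))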
160
tests/test_implementation.py
Normal file
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Test script for the new URL discovery functionality.
This tests the handler function directly without running the full server.
"""

import asyncio
import sys
from pathlib import Path

# Add the repo root to the Python path (this file lives in tests/)
repo_root = Path(__file__).parent.parent
sys.path.insert(0, str(repo_root))
sys.path.insert(0, str(repo_root / "deploy" / "docker"))

from rich.console import Console

console = Console()


async def test_url_discovery_handler():
    """Test the URL discovery handler function directly."""
    try:
        # Import the handler function and dependencies
        from api import handle_url_discovery
        from crawl4ai.async_configs import SeedingConfig  # noqa: F401 (import check)

        console.print("[bold cyan]Testing URL Discovery Handler Function[/bold cyan]")

        # Test 1: Basic functionality
        console.print("\n[cyan]Test 1: Basic URL discovery[/cyan]")

        domain = "docs.crawl4ai.com"
        seeding_config = {
            "source": "sitemap",
            "max_urls": 3,
            "verbose": True
        }

        console.print(f"[blue]Domain:[/blue] {domain}")
        console.print(f"[blue]Config:[/blue] {seeding_config}")

        # Call the handler directly
        result = await handle_url_discovery(domain, seeding_config)

        console.print("[green]✓ Handler executed successfully[/green]")
        console.print(f"[green]✓ Result type: {type(result)}[/green]")
        console.print(f"[green]✓ Result length: {len(result)}[/green]")

        # Print the first few results, if any
        if result:
            console.print("\n[blue]Sample results:[/blue]")
            for i, url_obj in enumerate(result[:2]):
                console.print(f"  {i + 1}. {url_obj}")

        return True

    except ImportError as e:
        console.print(f"[red]✗ Import error: {e}[/red]")
        console.print("[yellow]This suggests missing dependencies or module structure issues[/yellow]")
        return False
    except Exception as e:
        console.print(f"[red]✗ Handler error: {e}[/red]")
        return False


async def test_seeding_config_validation():
    """Test SeedingConfig validation."""
    try:
        from crawl4ai.async_configs import SeedingConfig

        console.print("\n[cyan]Test 2: SeedingConfig validation[/cyan]")

        # Test a valid config
        valid_config = {
            "source": "sitemap",
            "max_urls": 5,
            "pattern": "*"
        }

        config = SeedingConfig(**valid_config)
        console.print(f"[green]✓ Valid config created: {config.source}, max_urls={config.max_urls}[/green]")

        # Test an invalid config
        try:
            invalid_config = {
                "source": "invalid_source",
                "max_urls": 5
            }
            config = SeedingConfig(**invalid_config)
            console.print("[yellow]? Invalid config unexpectedly accepted[/yellow]")
        except Exception as e:
            console.print(f"[green]✓ Invalid config correctly rejected: {str(e)[:50]}...[/green]")

        return True

    except Exception as e:
        console.print(f"[red]✗ SeedingConfig test error: {e}[/red]")
        return False


async def test_schema_validation():
    """Test the URLDiscoveryRequest schema."""
    try:
        from schemas import URLDiscoveryRequest

        console.print("\n[cyan]Test 3: URLDiscoveryRequest schema validation[/cyan]")

        # Test a valid request
        valid_request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 10
            }
        }

        request = URLDiscoveryRequest(**valid_request_data)
        console.print(f"[green]✓ Valid request created: domain={request.domain}[/green]")

        # Test a request that relies on the default config
        minimal_request_data = {
            "domain": "example.com"
        }

        request = URLDiscoveryRequest(**minimal_request_data)
        console.print("[green]✓ Minimal request created with defaults[/green]")

        return True

    except Exception as e:
        console.print(f"[red]✗ Schema test error: {e}[/red]")
        return False


async def main():
    """Run all tests."""
    console.print("[bold blue]🔍 URL Discovery Implementation Tests[/bold blue]")

    results = []

    # Test the implementation components
    results.append(await test_seeding_config_validation())
    results.append(await test_schema_validation())
    results.append(await test_url_discovery_handler())

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    passed = sum(results)
    total = len(results)

    if passed == total:
        console.print(f"[bold green]✓ All {total} implementation tests passed![/bold green]")
        console.print("[green]The URL discovery endpoint is ready for integration testing[/green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return passed == total


if __name__ == "__main__":
    asyncio.run(main())
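For context, `handle_url_discovery` presumably wraps crawl4ai's URL seeder. A direct library-level sketch, assuming the `AsyncUrlSeeder` interface from the crawl4ai URL-seeding docs (the context-manager form and `urls()` signature are assumptions):

# Hedged sketch: exercise the seeding layer directly, without the HTTP handler.
# Assumes crawl4ai exposes AsyncUrlSeeder/SeedingConfig as in its docs.
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig

async def seed_directly() -> None:
    config = SeedingConfig(source="sitemap", max_urls=3, verbose=True)
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("docs.crawl4ai.com", config)
    for url_obj in urls:
        print(url_obj.get("url"))

if __name__ == "__main__":
    asyncio.run(seed_directly())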
193
tests/test_url_discovery.py
Normal file
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Test script for the new /urls/discover endpoint in the Crawl4AI Docker API.
"""

import asyncio
import json

import httpx
from rich.console import Console, Group
from rich.panel import Panel
from rich.syntax import Syntax

console = Console()

# Configuration
BASE_URL = "http://localhost:11235"
TEST_DOMAIN = "docs.crawl4ai.com"


async def check_server_health(client: httpx.AsyncClient) -> bool:
    """Check if the server is healthy."""
    console.print("[bold cyan]Checking server health...[/]", end="")
    try:
        response = await client.get("/health", timeout=10.0)
        response.raise_for_status()
        console.print(" [bold green]✓ Server is healthy![/]")
        return True
    except Exception as e:
        console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
        console.print(f"Is the server running at {BASE_URL}?")
        return False


def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Pretty print the request."""
    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        # Group keeps the Syntax renderable intact instead of collapsing it
        # into a plain string via f-string interpolation.
        Group(f"[cyan]POST {endpoint}[/cyan]", syntax),
        title=f"[bold blue]{title}[/]",
        border_style="blue"
    ))


def print_response(response_data: dict, title: str = "Response"):
    """Pretty print the response."""
    syntax = Syntax(json.dumps(response_data, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        syntax,
        title=f"[bold green]{title}[/]",
        border_style="green"
    ))


async def test_urls_discover_basic():
    """Test basic URL discovery functionality."""
    console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        # Check server health first
        if not await check_server_health(client):
            return False

        # Test 1: Basic discovery with sitemap
        console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")

        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        print_request("/urls/discover", payload, "Basic Discovery Request")

        try:
            response = await client.post("/urls/discover", json=payload)
            response.raise_for_status()
            response_data = response.json()

            print_response(response_data, "Basic Discovery Response")

            # Validate the response structure
            if isinstance(response_data, list):
                console.print(f"[green]✓ Discovered {len(response_data)} URLs[/green]")
                return True
            else:
                console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
                return False

        except httpx.HTTPStatusError as e:
            console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
            return False
        except Exception as e:
            console.print(f"[red]✗ Error: {e}[/red]")
            return False


async def test_urls_discover_invalid_config():
    """Test URL discovery with an invalid configuration."""
    console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": 5
            }
        }

        print_request("/urls/discover", payload, "Invalid Config Request")

        try:
            response = await client.post("/urls/discover", json=payload)

            if response.status_code == 500:
                console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
                return True
            else:
                console.print(f"[yellow]? Expected 500 error, got {response.status_code}[/yellow]")
                response_data = response.json()
                print_response(response_data, "Unexpected Response")
                return False

        except Exception as e:
            console.print(f"[red]✗ Unexpected error: {e}[/red]")
            return False


async def test_urls_discover_with_filtering():
    """Test URL discovery with advanced filtering."""
    console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "sitemap",
                "pattern": "*/docs/*",  # Filter to docs URLs only
                "extract_head": True,   # Extract metadata
                "max_urls": 3
            }
        }

        print_request("/urls/discover", payload, "Filtered Discovery Request")

        try:
            response = await client.post("/urls/discover", json=payload)
            response.raise_for_status()
            response_data = response.json()

            print_response(response_data, "Filtered Discovery Response")

            # Validate the response structure with metadata
            if isinstance(response_data, list) and len(response_data) > 0:
                sample_url = response_data[0]
                if "url" in sample_url:
                    console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
                    return True
                else:
                    console.print("[red]✗ URL objects missing expected fields[/red]")
                    return False
            else:
                console.print("[yellow]? No URLs found with filter pattern[/yellow]")
                return True  # This could be expected

        except httpx.HTTPStatusError as e:
            console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
            return False
        except Exception as e:
            console.print(f"[red]✗ Error: {e}[/red]")
            return False


async def main():
    """Run all tests."""
    console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")

    results = []

    # Run the tests
    results.append(await test_urls_discover_basic())
    results.append(await test_urls_discover_invalid_config())
    results.append(await test_urls_discover_with_filtering())

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    passed = sum(results)
    total = len(results)

    if passed == total:
        console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return passed == total


if __name__ == "__main__":
    asyncio.run(main())
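The tests above read `url`, `status`, `head_data`, and (in the scored examples) `score` from each result object. A hedged sketch collecting that assumed item shape in one place; the field names come from the tests, but which fields are optional is an assumption:

# Hedged sketch of the result-item shape the tests above rely on.
from typing import Any, Dict, TypedDict

class DiscoveredURL(TypedDict, total=False):
    url: str                   # asserted present by the tests
    status: str                # populated when live_check is enabled
    head_data: Dict[str, Any]  # title/description when extract_head=True
    score: float               # BM25 relevance when query/scoring_method are set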
286
tests/test_url_discovery_e2e.py
Normal file
@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
End-to-end tests for the URL Discovery endpoint.

This test suite verifies the complete functionality of the /urls/discover
endpoint, including happy-path scenarios and error handling.
"""

import asyncio

import httpx
import pytest

# Test configuration
BASE_URL = "http://localhost:11235"
TEST_TIMEOUT = 30.0


class TestURLDiscoveryEndpoint:
    """End-to-end test suite for the URL Discovery endpoint."""

    # NOTE: the async fixture and async tests below require pytest-asyncio
    # (e.g. asyncio_mode=auto); see the runner sketch after this file.
    @pytest.fixture
    async def client(self):
        """Create an async HTTP client for testing."""
        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
            yield client

    async def test_server_health(self, client):
        """Test that the server is healthy before running other tests."""
        response = await client.get("/health")
        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "ok"

    async def test_endpoint_exists(self, client):
        """Test that the /urls/discover endpoint exists and is documented."""
        # Check that the OpenAPI spec includes our endpoint
        response = await client.get("/openapi.json")
        assert response.status_code == 200

        openapi_spec = response.json()
        assert "/urls/discover" in openapi_spec["paths"]

        endpoint_spec = openapi_spec["paths"]["/urls/discover"]
        assert "post" in endpoint_spec
        assert endpoint_spec["post"]["summary"] == "URL Discovery and Seeding"

    async def test_basic_url_discovery_happy_path(self, client):
        """Test basic URL discovery with minimal configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)
        # Note: we don't assert length > 0 because URL discovery
        # may legitimately return empty results

    async def test_minimal_request_with_defaults(self, client):
        """Test that a minimal request works with the default seeding_config."""
        request_data = {
            "domain": "example.com"
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

    async def test_advanced_configuration(self, client):
        """Test advanced configuration options."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap+cc",
                "pattern": "*/docs/*",
                "extract_head": True,
                "max_urls": 3,
                "live_check": True,
                "concurrency": 50,
                "hits_per_sec": 5,
                "verbose": True
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

        # If URLs are returned, they should have the expected structure
        for url_obj in data:
            assert isinstance(url_obj, dict)
            # Should have at least a URL field
            assert "url" in url_obj

    async def test_bm25_scoring_configuration(self, client):
        """Test BM25 relevance scoring configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "extract_head": True,  # Required for scoring
                "query": "documentation",
                "scoring_method": "bm25",
                "score_threshold": 0.1,
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

        # If URLs are returned with scoring, check the structure
        for url_obj in data:
            assert isinstance(url_obj, dict)
            assert "url" in url_obj
            # Scoring may or may not add a score field depending on implementation

    async def test_missing_required_domain_field(self, client):
        """Test error handling when the required domain field is missing."""
        request_data = {
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 422  # Validation error

        error_data = response.json()
        assert "detail" in error_data
        assert any("domain" in str(error).lower() for error in error_data["detail"])

    async def test_invalid_request_body_structure(self, client):
        """Test error handling with a completely invalid request body."""
        invalid_request = {
            "invalid_field": "test_value",
            "another_invalid": 123
        }

        response = await client.post("/urls/discover", json=invalid_request)
        assert response.status_code == 422  # Validation error

        error_data = response.json()
        assert "detail" in error_data

    async def test_invalid_seeding_config_parameters(self, client):
        """Test handling of invalid seeding configuration parameters."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": "not_a_number"   # Invalid type
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        # The endpoint should handle this gracefully.
        # It may return 200 with empty results or 500 with error details.
        assert response.status_code in [200, 500]

        if response.status_code == 200:
            data = response.json()
            assert isinstance(data, list)
            # May be empty due to the invalid config
        else:
            # Should have error details
            error_data = response.json()
            assert "detail" in error_data

    async def test_empty_seeding_config(self, client):
        """Test with an empty seeding_config object."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {}
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

    async def test_response_structure_consistency(self, client):
        """Test that the response structure is consistent."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 1
            }
        }

        # Make multiple requests to ensure consistency
        for _ in range(3):
            response = await client.post("/urls/discover", json=request_data)
            assert response.status_code == 200

            data = response.json()
            assert isinstance(data, list)

            # If there are results, check that they have a consistent structure
            for url_obj in data:
                assert isinstance(url_obj, dict)
                assert "url" in url_obj

    async def test_content_type_validation(self, client):
        """Test that the endpoint requires a JSON content type."""
        # Test with the wrong content type
        response = await client.post(
            "/urls/discover",
            content="domain=example.com",
            headers={"Content-Type": "application/x-www-form-urlencoded"}
        )
        assert response.status_code == 422


# Standalone test runner for when pytest is not available
async def run_tests_standalone():
    """Run the tests without the pytest framework."""
    print("🧪 Running URL Discovery Endpoint Tests")
    print("=" * 50)

    # Check server health first
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        try:
            response = await client.get("/health")
            assert response.status_code == 200
            print("✅ Server health check passed")
        except Exception as e:
            print(f"❌ Server health check failed: {e}")
            return False

    test_suite = TestURLDiscoveryEndpoint()

    # Run the tests manually
    tests = [
        ("Endpoint exists", test_suite.test_endpoint_exists),
        ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
        ("Minimal request", test_suite.test_minimal_request_with_defaults),
        ("Advanced configuration", test_suite.test_advanced_configuration),
        ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
        ("Missing domain error", test_suite.test_missing_required_domain_field),
        ("Invalid request body", test_suite.test_invalid_request_body_structure),
        ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
        ("Empty config", test_suite.test_empty_seeding_config),
        ("Response consistency", test_suite.test_response_structure_consistency),
        ("Content type validation", test_suite.test_content_type_validation),
    ]

    passed = 0
    failed = 0

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        for test_name, test_func in tests:
            try:
                await test_func(client)
                print(f"✅ {test_name}")
                passed += 1
            except Exception as e:
                print(f"❌ {test_name}: {e}")
                failed += 1

    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
    return failed == 0


if __name__ == "__main__":
    # Run the tests standalone
    success = asyncio.run(run_tests_standalone())
    raise SystemExit(0 if success else 1)
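The async fixture and tests above need an asyncio-aware pytest. A minimal programmatic runner, assuming the pytest-asyncio plugin is installed:

# Hedged runner sketch: invoke pytest with asyncio auto mode so the async
# fixture and tests above are collected without per-test marks.
# Assumes the pytest-asyncio plugin is installed.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "-v",
        "-o", "asyncio_mode=auto",
        "tests/test_url_discovery_e2e.py",
    ]))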
170
tests/test_virtual_scroll_api.py
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Test script for VirtualScrollConfig with the /crawl API endpoint
"""

import http.server
import json
import os
import socketserver
import tempfile
import threading
import time

import requests


def test_virtual_scroll_api():
    """Test the /crawl endpoint with VirtualScrollConfig"""

    # Create a simple HTML page with virtual scroll for testing
    test_html = '''
<html>
<head>
    <style>
        #container {
            height: 300px;
            overflow-y: auto;
            border: 1px solid #ccc;
        }
        .item {
            height: 30px;
            padding: 5px;
            border-bottom: 1px solid #eee;
        }
    </style>
</head>
<body>
    <h1>Virtual Scroll Test</h1>
    <div id="container">
        <div class="item">Item 1</div>
        <div class="item">Item 2</div>
        <div class="item">Item 3</div>
        <div class="item">Item 4</div>
        <div class="item">Item 5</div>
    </div>
    <script>
        // Simple script to simulate virtual scroll
        const container = document.getElementById('container');
        let itemCount = 5;

        // Add more items when scrolling
        container.addEventListener('scroll', function() {
            if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
                for (let i = 0; i < 5; i++) {
                    itemCount++;
                    const newItem = document.createElement('div');
                    newItem.className = 'item';
                    newItem.textContent = `Item ${itemCount}`;
                    container.appendChild(newItem);
                }
            }
        });

        // Initial scroll to trigger loading
        setTimeout(() => {
            container.scrollTop = container.scrollHeight;
        }, 100);
    </script>
</body>
</html>
'''

    # Save the HTML to a temporary file and serve it
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        temp_file = f.name

    # Start a local server in the temp file's directory
    os.chdir(os.path.dirname(temp_file))
    port = 8080

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass  # Suppress log messages

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(target=httpd.serve_forever)
            server_thread.daemon = True
            server_thread.start()

            time.sleep(0.5)  # Give the server time to start

            # Now test the API against the locally served test page
            url = f"http://localhost:{port}/{os.path.basename(temp_file)}"

            payload = {
                "urls": [url],
                "browser_config": {
                    "type": "BrowserConfig",
                    "params": {
                        "headless": True,
                        "viewport_width": 1920,
                        "viewport_height": 1080
                    }
                },
                "crawler_config": {
                    "type": "CrawlerRunConfig",
                    "params": {
                        "virtual_scroll_config": {
                            "type": "VirtualScrollConfig",
                            "params": {
                                "container_selector": "#container",
                                "scroll_count": 3,
                                "scroll_by": "container_height",
                                "wait_after_scroll": 0.5
                            }
                        },
                        "cache_mode": "bypass",
                        "extraction_strategy": {
                            "type": "NoExtractionStrategy",
                            "params": {}
                        }
                    }
                }
            }

            print("Testing VirtualScrollConfig with /crawl endpoint...")
            print(f"Test URL: {url}")
            print("Payload:")
            print(json.dumps(payload, indent=2))

            response = requests.post(
                "http://localhost:11235/crawl",  # default Crawl4AI port, matching the other tests
                json=payload,
                headers={"Content-Type": "application/json"}
            )

            print(f"\nResponse Status: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                print("✅ Success! VirtualScrollConfig is working.")
                print(f"Content length: {len(result[0]['content']['raw_content'])} characters")

                # Check whether virtual scroll captured more content
                if "Item 10" in result[0]['content']['raw_content']:
                    print("✅ Virtual scroll successfully captured additional content!")
                else:
                    print("⚠️ Virtual scroll may not have worked as expected")

                # Print a snippet of the content
                content_preview = result[0]['content']['raw_content'][:500] + "..."
                print(f"\nContent preview:\n{content_preview}")

            else:
                print(f"❌ Error: {response.status_code}")
                print(f"Response: {response.text}")

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
    finally:
        # Cleanup
        try:
            os.unlink(temp_file)
        except OSError:
            pass


if __name__ == "__main__":
    test_virtual_scroll_api()
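The same configuration can also be driven through the Python SDK rather than the HTTP API. A sketch assuming crawl4ai's `AsyncWebCrawler`/`VirtualScrollConfig` interface, with the parameter names and values taken from the JSON payload above; the local URL in the usage line is hypothetical:

# Hedged SDK-side sketch mirroring the JSON payload above; assumes the
# crawl4ai Python API (AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig).
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, VirtualScrollConfig

async def crawl_with_virtual_scroll(url: str) -> None:
    run_config = CrawlerRunConfig(
        virtual_scroll_config=VirtualScrollConfig(
            container_selector="#container",
            scroll_count=3,
            scroll_by="container_height",
            wait_after_scroll=0.5,
        ),
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=run_config)
        print(f"Captured {len(result.html or '')} characters of HTML")

if __name__ == "__main__":
    # Point this at the locally served test page from the script above.
    asyncio.run(crawl_with_virtual_scroll("http://localhost:8080/test.html"))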
117
tests/test_virtual_scroll_api_simple.py
Normal file
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Test VirtualScrollConfig with the /crawl API using existing test assets
"""

import http.server
import json
import os
import socketserver
import threading
import time
from pathlib import Path

import requests


def test_virtual_scroll_api():
    """Test the /crawl endpoint with VirtualScrollConfig using test assets"""

    # Use the existing test assets (docs/ lives at the repo root, one level
    # above tests/)
    assets_dir = Path(__file__).parent.parent / "docs" / "examples" / "assets"
    if not assets_dir.exists():
        print(f"❌ Assets directory not found: {assets_dir}")
        return

    # Start a local server for the assets
    os.chdir(assets_dir)
    port = 8081

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass  # Suppress log messages

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(target=httpd.serve_forever)
            server_thread.daemon = True
            server_thread.start()

            time.sleep(0.5)  # Give the server time to start

            # Test with the Twitter-like virtual scroll asset, served locally
            url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"

            payload = {
                "urls": [url],
                "browser_config": {
                    "type": "BrowserConfig",
                    "params": {
                        "headless": True,
                        "viewport_width": 1280,
                        "viewport_height": 800
                    }
                },
                "crawler_config": {
                    "type": "CrawlerRunConfig",
                    "params": {
                        "virtual_scroll_config": {
                            "type": "VirtualScrollConfig",
                            "params": {
                                "container_selector": "#timeline",
                                "scroll_count": 10,
                                "scroll_by": "container_height",
                                "wait_after_scroll": 0.3
                            }
                        },
                        "cache_mode": "bypass",
                        "extraction_strategy": {
                            "type": "NoExtractionStrategy",
                            "params": {}
                        }
                    }
                }
            }

            print("Testing VirtualScrollConfig with /crawl endpoint...")
            print(f"Test URL: {url}")
            print("Payload:")
            print(json.dumps(payload, indent=2))

            response = requests.post(
                "http://localhost:11235/crawl",  # default Crawl4AI port, matching the other tests
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=60  # Longer timeout for virtual scroll
            )

            print(f"\nResponse Status: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                print("✅ Success! VirtualScrollConfig is working with the API.")
                print(f"Content length: {len(result[0]['content']['raw_content'])} characters")

                # Check whether we captured multiple posts (indicating virtual scroll worked)
                content = result[0]['content']['raw_content']
                post_count = content.count("Post #")
                print(f"Found {post_count} posts in the content")

                if post_count > 5:  # Should capture more than just the initial posts
                    print("✅ Virtual scroll successfully captured additional content!")
                else:
                    print("⚠️ Virtual scroll may not have captured much additional content")

                # Print a snippet of the content
                content_preview = content[:1000] + "..." if len(content) > 1000 else content
                print(f"\nContent preview:\n{content_preview}")

            else:
                print(f"❌ Error: {response.status_code}")
                print(f"Response: {response.text}")

    except requests.exceptions.Timeout:
        print("❌ Request timed out - virtual scroll may be taking too long")
    except Exception as e:
        print(f"❌ Test failed with error: {e}")


if __name__ == "__main__":
    test_virtual_scroll_api()