286 lines
10 KiB
Python
286 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
End-to-end tests for the URL Discovery endpoint.

This test suite verifies the complete functionality of the /urls/discover
endpoint, including happy-path scenarios and error handling.
"""
|
|
|
|
import asyncio
|
|
import httpx
|
|
import json
|
|
import pytest
|
|
from typing import Dict, Any
|
|
|
|
# Test configuration
# Root URL handed to httpx.AsyncClient as ``base_url``; the tests then request
# relative paths such as "/health" and "/urls/discover" against it.
BASE_URL = "http://localhost:11235"
# Passed as the ``timeout`` argument of every httpx.AsyncClient created in this
# file (presumably seconds, per httpx convention — confirm if changing it).
TEST_TIMEOUT = 30.0
|
|
|
|
|
|
class TestURLDiscoveryEndpoint:
    """Exercise the /urls/discover endpoint of a running server over HTTP.

    Every test receives an ``httpx.AsyncClient`` (the ``client`` fixture, or a
    client passed in directly by a caller) and issues real requests against
    BASE_URL.

    NOTE(review): the coroutine tests and the async-generator fixture
    presumably rely on an asyncio pytest plugin configured to pick them up
    automatically — confirm against the project's pytest configuration.
    """

    @pytest.fixture
    async def client(self):
        """Yield an AsyncClient bound to BASE_URL and close it afterwards."""
        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as http:
            yield http

    # ------------------------------------------------------------------
    # Shared helpers (underscore-prefixed, so pytest does not collect them)
    # ------------------------------------------------------------------

    @staticmethod
    async def _discover(client, payload):
        """POST *payload* as JSON to /urls/discover and return the response."""
        return await client.post("/urls/discover", json=payload)

    @staticmethod
    def _json_list(resp):
        """Assert a 200 response whose JSON body is a list; return that list."""
        assert resp.status_code == 200
        body = resp.json()
        assert isinstance(body, list)
        return body

    @staticmethod
    def _assert_url_entries(entries):
        """Assert that every entry is a dict carrying a "url" key."""
        for entry in entries:
            assert isinstance(entry, dict)
            assert "url" in entry

    # ------------------------------------------------------------------
    # Availability
    # ------------------------------------------------------------------

    async def test_server_health(self, client):
        """GET /health answers 200 with a JSON status of "ok"."""
        resp = await client.get("/health")
        assert resp.status_code == 200
        body = resp.json()
        assert body["status"] == "ok"

    async def test_endpoint_exists(self, client):
        """The OpenAPI document lists POST /urls/discover with its summary."""
        resp = await client.get("/openapi.json")
        assert resp.status_code == 200

        paths = resp.json()["paths"]
        assert "/urls/discover" in paths

        operations = paths["/urls/discover"]
        assert "post" in operations
        assert operations["post"]["summary"] == "URL Discovery and Seeding"

    # ------------------------------------------------------------------
    # Successful requests
    # ------------------------------------------------------------------

    async def test_basic_url_discovery_happy_path(self, client):
        """A sitemap request capped at five URLs returns a JSON list."""
        payload = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5,
            },
        }

        resp = await self._discover(client, payload)
        # The list may legitimately be empty, so its length is not checked.
        self._json_list(resp)

    async def test_minimal_request_with_defaults(self, client):
        """A request carrying only "domain" is accepted and returns a list."""
        payload = {"domain": "example.com"}

        resp = await self._discover(client, payload)
        self._json_list(resp)

    async def test_advanced_configuration(self, client):
        """A request using many seeding options returns well-formed entries."""
        seeding = {
            "source": "sitemap+cc",
            "pattern": "*/docs/*",
            "extract_head": True,
            "max_urls": 3,
            "live_check": True,
            "concurrency": 50,
            "hits_per_sec": 5,
            "verbose": True,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        resp = await self._discover(client, payload)
        entries = self._json_list(resp)

        # Only entries that actually came back are checked for shape.
        self._assert_url_entries(entries)

    async def test_bm25_scoring_configuration(self, client):
        """A BM25-scored request returns entries that each carry a "url"."""
        seeding = {
            "source": "sitemap",
            "extract_head": True,  # Required for scoring
            "query": "documentation",
            "scoring_method": "bm25",
            "score_threshold": 0.1,
            "max_urls": 5,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        resp = await self._discover(client, payload)
        entries = self._json_list(resp)

        # A score field is deliberately not required here.
        self._assert_url_entries(entries)

    # ------------------------------------------------------------------
    # Rejected or malformed requests
    # ------------------------------------------------------------------

    async def test_missing_required_domain_field(self, client):
        """Omitting "domain" yields 422 with a detail that mentions it."""
        payload = {
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5,
            },
        }

        resp = await self._discover(client, payload)
        assert resp.status_code == 422  # Validation error

        body = resp.json()
        assert "detail" in body
        assert any("domain" in str(item).lower() for item in body["detail"])

    async def test_invalid_request_body_structure(self, client):
        """A body made only of unknown fields yields 422 with a detail."""
        payload = {
            "invalid_field": "test_value",
            "another_invalid": 123,
        }

        resp = await self._discover(client, payload)
        assert resp.status_code == 422  # Validation error

        body = resp.json()
        assert "detail" in body

    async def test_invalid_seeding_config_parameters(self, client):
        """Bad seeding values produce either a 200 list or a 500 with detail."""
        payload = {
            "domain": "example.com",
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": "not_a_number",  # Invalid type
            },
        }

        resp = await self._discover(client, payload)

        # Both outcomes are accepted: an (optionally empty) result list, or a
        # server error that carries details.
        status = resp.status_code
        assert status in (200, 500)

        body = resp.json()
        if status == 500:
            assert "detail" in body
        else:
            assert isinstance(body, list)

    async def test_empty_seeding_config(self, client):
        """An empty seeding_config object is accepted and returns a list."""
        payload = {"domain": "example.com", "seeding_config": {}}

        resp = await self._discover(client, payload)
        self._json_list(resp)

    async def test_response_structure_consistency(self, client):
        """Three identical requests each return a list of url-bearing dicts."""
        payload = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 1,
            },
        }

        attempts = 3
        for _attempt in range(attempts):
            resp = await self._discover(client, payload)
            entries = self._json_list(resp)
            self._assert_url_entries(entries)

    async def test_content_type_validation(self, client):
        """A form-encoded body sent to the endpoint is answered with 422."""
        form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
        resp = await client.post(
            "/urls/discover",
            content="domain=example.com",
            headers=form_headers,
        )
        assert resp.status_code == 422
|
|
|
|
|
|
# Standalone test runner for when pytest is not available
|
|
async def run_tests_standalone() -> bool:
    """Run the endpoint tests sequentially without the pytest framework.

    A single AsyncClient is opened against BASE_URL. The server's /health
    endpoint is probed first; if it is unreachable or does not answer 200 the
    run is aborted. Otherwise each listed test method is awaited in order with
    that client and a pass/fail line is printed for it.

    Returns:
        True when the health check succeeded and no test raised, else False.
    """
    print("🧪 Running URL Discovery Endpoint Tests")
    print("=" * 50)

    test_suite = TestURLDiscoveryEndpoint()

    # Run tests manually
    tests = [
        ("Endpoint exists", test_suite.test_endpoint_exists),
        ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
        ("Minimal request", test_suite.test_minimal_request_with_defaults),
        ("Advanced configuration", test_suite.test_advanced_configuration),
        ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
        ("Missing domain error", test_suite.test_missing_required_domain_field),
        ("Invalid request body", test_suite.test_invalid_request_body_structure),
        ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
        ("Empty config", test_suite.test_empty_seeding_config),
        ("Response consistency", test_suite.test_response_structure_consistency),
        ("Content type validation", test_suite.test_content_type_validation),
    ]

    passed = 0
    failed = 0

    # One client serves both the health probe and the tests.
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        # Check server health first
        try:
            response = await client.get("/health")
        except Exception as e:
            # Runner boundary: any failure to reach the server aborts the run.
            print(f"❌ Server health check failed: {_describe_failure(e)}")
            return False

        # Explicit comparison rather than `assert`, which `python -O` strips
        # and which would let an unhealthy server pass this gate.
        if response.status_code != 200:
            print(
                "❌ Server health check failed: "
                f"unexpected status {response.status_code}"
            )
            return False
        print("✅ Server health check passed")

        for test_name, test_func in tests:
            try:
                await test_func(client)
            except Exception as e:
                # Deliberately broad: assertion failures and transport errors
                # alike count as one failed test and must not stop the run.
                print(f"❌ {test_name}: {_describe_failure(e)}")
                failed += 1
            else:
                print(f"✅ {test_name}")
                passed += 1

    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
    return failed == 0


def _describe_failure(exc: BaseException) -> str:
    """Return a non-empty one-line description of *exc* for console output."""
    # A bare `assert` raises AssertionError with an empty message; always
    # include the exception type so the printed line is never blank.
    message = str(exc)
    kind = type(exc).__name__
    return f"{kind}: {message}" if message else kind
|
|
|
|
|
|
if __name__ == "__main__":
    # Run tests standalone and report the outcome through the process exit
    # status: 0 when run_tests_standalone() returned True, 1 otherwise.
    success = asyncio.run(run_tests_standalone())
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is missing under `python -S`; raising SystemExit always works.
    raise SystemExit(0 if success else 1)