Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
286 lines
10 KiB
Python
286 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
End-to-end tests for the URL Discovery endpoint.
|
|
|
|
This test suite verifies the complete functionality of the /urls/discover endpoint
|
|
including happy path scenarios and error handling.
|
|
"""
|
|
|
|
import asyncio
import json
from typing import Dict, Any

import httpx
import pytest
|
|
|
|
# Test configuration
# BASE_URL: root URL of the server under test, used as the httpx client's base_url.
# TEST_TIMEOUT: value handed to httpx as the client's `timeout` argument.
BASE_URL = "http://localhost:11235"
TEST_TIMEOUT = 30.0
|
|
|
|
|
|
class TestURLDiscoveryEndpoint:
    """End-to-end test suite for URL Discovery endpoint.

    Each test sends real HTTP requests through an ``httpx.AsyncClient`` bound
    to ``BASE_URL`` and checks status codes and JSON shape only. Result lists
    may legitimately be empty, so their length is never asserted.

    NOTE(review): the fixture and the tests are coroutines, so an async-aware
    pytest plugin has to be active (presumably pytest-asyncio in auto mode) —
    confirm against the project's pytest configuration.
    """

    # Path of the endpoint under test, shared by the request helpers below.
    _DISCOVER_PATH = "/urls/discover"

    # ------------------------------------------------------------------
    # Fixture and private helpers (not collected: names lack the test_ prefix)
    # ------------------------------------------------------------------

    @pytest.fixture
    async def client(self):
        """Create an async HTTP client for testing.

        Yields an ``httpx.AsyncClient`` configured with ``BASE_URL`` and
        ``TEST_TIMEOUT``; the ``async with`` closes it after the test.
        """
        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
            yield client

    @staticmethod
    def _describe(response) -> str:
        """Return a short status/body summary of *response* for assertion messages."""
        # Bare asserts carry no message outside pytest's assertion rewriting,
        # so include enough of the response to diagnose a failure.
        return f"HTTP {response.status_code}: {response.text[:500]}"

    async def _post_discover(self, client, payload: Dict[str, Any]):
        """POST *payload* as JSON to the discovery endpoint and return the response."""
        return await client.post(self._DISCOVER_PATH, json=payload)

    async def _discover_ok(self, client, payload: Dict[str, Any]) -> list:
        """POST *payload*, require a 200 response whose JSON body is a list, and return it."""
        response = await self._post_discover(client, payload)
        assert response.status_code == 200, self._describe(response)

        data = response.json()
        assert isinstance(data, list), f"expected a JSON list, got {type(data).__name__}"
        return data

    @staticmethod
    def _assert_url_entries(data: list) -> None:
        """Require every entry of *data* to be a dict that has a ``"url"`` key."""
        for url_obj in data:
            assert isinstance(url_obj, dict), f"entry is not an object: {url_obj!r}"
            assert "url" in url_obj, f"entry has no 'url' field: {url_obj!r}"

    # ------------------------------------------------------------------
    # Server / endpoint availability
    # ------------------------------------------------------------------

    async def test_server_health(self, client):
        """Test that the server is healthy before running other tests."""
        response = await client.get("/health")
        assert response.status_code == 200, self._describe(response)

        data = response.json()
        assert data["status"] == "ok", f"unexpected health payload: {data!r}"

    async def test_endpoint_exists(self, client):
        """Test that the /urls/discover endpoint exists and is documented."""
        # Check OpenAPI spec includes our endpoint
        response = await client.get("/openapi.json")
        assert response.status_code == 200, self._describe(response)

        openapi_spec = response.json()
        assert self._DISCOVER_PATH in openapi_spec["paths"], "endpoint missing from OpenAPI paths"

        endpoint_spec = openapi_spec["paths"][self._DISCOVER_PATH]
        assert "post" in endpoint_spec, f"no POST operation documented: {list(endpoint_spec)!r}"
        assert endpoint_spec["post"]["summary"] == "URL Discovery and Seeding"

    # ------------------------------------------------------------------
    # Happy paths
    # ------------------------------------------------------------------

    async def test_basic_url_discovery_happy_path(self, client):
        """Test basic URL discovery with minimal configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5,
            },
        }

        # Note: We don't assert length > 0 because URL discovery
        # may legitimately return empty results
        await self._discover_ok(client, request_data)

    async def test_minimal_request_with_defaults(self, client):
        """Test that minimal request works with default seeding_config."""
        request_data = {
            "domain": "example.com",
        }

        await self._discover_ok(client, request_data)

    async def test_advanced_configuration(self, client):
        """Test advanced configuration options."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap+cc",
                "pattern": "*/docs/*",
                "extract_head": True,
                "max_urls": 3,
                "live_check": True,
                "concurrency": 50,
                "hits_per_sec": 5,
                "verbose": True,
            },
        }

        data = await self._discover_ok(client, request_data)

        # If URLs are returned, they should have the expected structure:
        # each one an object with at least a URL field.
        self._assert_url_entries(data)

    async def test_bm25_scoring_configuration(self, client):
        """Test BM25 relevance scoring configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "extract_head": True,  # Required for scoring
                "query": "documentation",
                "scoring_method": "bm25",
                "score_threshold": 0.1,
                "max_urls": 5,
            },
        }

        data = await self._discover_ok(client, request_data)

        # If URLs are returned with scoring, check structure.
        # Scoring may or may not add score field depending on implementation,
        # so only the "url" key is required.
        self._assert_url_entries(data)

    async def test_empty_seeding_config(self, client):
        """Test with empty seeding_config object."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {},
        }

        await self._discover_ok(client, request_data)

    async def test_response_structure_consistency(self, client):
        """Test that response structure is consistent."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 1,
            },
        }

        # Make multiple requests to ensure consistency
        for _ in range(3):
            data = await self._discover_ok(client, request_data)

            # If there are results, check they have consistent structure
            self._assert_url_entries(data)

    # ------------------------------------------------------------------
    # Error handling
    # ------------------------------------------------------------------

    async def test_missing_required_domain_field(self, client):
        """Test error handling when required domain field is missing."""
        request_data = {
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5,
            },
        }

        response = await self._post_discover(client, request_data)
        assert response.status_code == 422, self._describe(response)  # Validation error

        error_data = response.json()
        assert "detail" in error_data, f"no 'detail' in error body: {error_data!r}"
        # At least one reported error has to mention the missing field.
        assert any(
            "domain" in str(error).lower() for error in error_data["detail"]
        ), f"no error mentions 'domain': {error_data['detail']!r}"

    async def test_invalid_request_body_structure(self, client):
        """Test error handling with completely invalid request body."""
        invalid_request = {
            "invalid_field": "test_value",
            "another_invalid": 123,
        }

        response = await self._post_discover(client, invalid_request)
        assert response.status_code == 422, self._describe(response)  # Validation error

        error_data = response.json()
        assert "detail" in error_data, f"no 'detail' in error body: {error_data!r}"

    async def test_invalid_seeding_config_parameters(self, client):
        """Test handling of invalid seeding configuration parameters."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": "not_a_number",  # Invalid type
            },
        }

        response = await self._post_discover(client, request_data)

        # The endpoint should handle this gracefully.
        # It may return 200 with empty results or 500 with error details.
        # NOTE(review): a 422 from request validation is not accepted here —
        # confirm the server never validates seeding_config field types.
        assert response.status_code in (200, 500), self._describe(response)

        body = response.json()
        if response.status_code == 200:
            # May be empty due to invalid config
            assert isinstance(body, list), f"expected a JSON list, got {type(body).__name__}"
        else:
            # Should have error details
            assert "detail" in body, f"no 'detail' in error body: {body!r}"

    async def test_content_type_validation(self, client):
        """Test that endpoint requires JSON content type."""
        # Test with wrong content type
        response = await client.post(
            self._DISCOVER_PATH,
            content="domain=example.com",
            headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
        assert response.status_code == 422, self._describe(response)
|
|
|
|
|
|
# Standalone test runner for when pytest is not available
|
|
async def run_tests_standalone() -> bool:
    """Run tests without pytest framework.

    Checks ``/health`` first and gives up immediately if the server is not
    reachable or does not answer 200. Otherwise every test method listed
    below is awaited with one shared ``httpx.AsyncClient``; a pass/fail line
    is printed per test, followed by a summary.

    Returns:
        True when no test failed, False when the health check or any test failed.
    """
    print("🧪 Running URL Discovery Endpoint Tests")
    print("=" * 50)

    test_suite = TestURLDiscoveryEndpoint()

    # Run tests manually. test_server_health is not listed: the health
    # check below is performed before any of these run.
    tests = [
        ("Endpoint exists", test_suite.test_endpoint_exists),
        ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
        ("Minimal request", test_suite.test_minimal_request_with_defaults),
        ("Advanced configuration", test_suite.test_advanced_configuration),
        ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
        ("Missing domain error", test_suite.test_missing_required_domain_field),
        ("Invalid request body", test_suite.test_invalid_request_body_structure),
        ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
        ("Empty config", test_suite.test_empty_seeding_config),
        ("Response consistency", test_suite.test_response_structure_consistency),
        ("Content type validation", test_suite.test_content_type_validation),
    ]

    passed = 0
    failed = 0

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        # Check server health first
        try:
            response = await client.get("/health")
        except Exception as e:  # top-level boundary: report any transport failure
            print(f"❌ Server health check failed: {str(e) or repr(e)}")
            return False

        # Explicit comparison instead of `assert`: it survives `python -O`
        # and lets the message say which status came back.
        if response.status_code != 200:
            print(f"❌ Server health check failed: unexpected status {response.status_code}")
            return False
        print("✅ Server health check passed")

        for test_name, test_func in tests:
            try:
                await test_func(client)
            except Exception as e:  # includes AssertionError from the test body
                # A bare `assert` raises with an empty message; fall back to
                # repr() so the line still names the exception type.
                print(f"❌ {test_name}: {str(e) or repr(e)}")
                failed += 1
            else:
                print(f"✅ {test_name}")
                passed += 1

    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
    return failed == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run tests standalone; the process exit status is 0 when every test
    # passed and 1 otherwise.
    success = asyncio.run(run_tests_standalone())
    # Raise SystemExit directly: the `exit` name is installed by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    raise SystemExit(0 if success else 1)