Files
crawl4ai/test_url_discovery_e2e.py
AHMET YILMAZ 8cca9704eb feat: add comprehensive type definitions and improve test coverage
Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
2025-10-13 18:49:01 +08:00

286 lines
10 KiB
Python

#!/usr/bin/env python3
"""
End-to-end tests for the URL Discovery endpoint.
This test suite verifies the complete functionality of the /urls/discover endpoint
including happy path scenarios and error handling.
"""
import asyncio
import httpx
import json
import pytest
from typing import Dict, Any
# Connection settings shared by every request in this module.
# Both values are handed to httpx.AsyncClient when a client is created.
TEST_TIMEOUT = 30.0  # passed as the client's ``timeout`` argument
BASE_URL = "http://localhost:11235"  # passed as the client's ``base_url`` argument
async def _post_discover_ok(client, payload):
    """POST ``payload`` as JSON to /urls/discover and return the decoded body.

    Asserts that the response status is 200 and that the JSON body is a list.
    """
    reply = await client.post("/urls/discover", json=payload)
    assert reply.status_code == 200
    discovered = reply.json()
    assert isinstance(discovered, list)
    return discovered


def _assert_url_entries(entries):
    """Assert that every item of ``entries`` is a dict containing a "url" key."""
    for entry in entries:
        assert isinstance(entry, dict)
        assert "url" in entry


class TestURLDiscoveryEndpoint:
    """End-to-end checks for the /urls/discover endpoint.

    Every test sends real HTTP requests to the server at BASE_URL through the
    ``client`` fixture.
    """

    # NOTE(review): the fixture and the tests are coroutines, so running them
    # under pytest relies on an asyncio-aware plugin being configured -- confirm.
    @pytest.fixture
    async def client(self):
        """Yield an httpx.AsyncClient bound to BASE_URL; it is closed afterwards."""
        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as http:
            yield http

    async def test_server_health(self, client):
        """GET /health answers 200 and reports a status of "ok"."""
        health = await client.get("/health")
        assert health.status_code == 200
        assert health.json()["status"] == "ok"

    async def test_endpoint_exists(self, client):
        """The OpenAPI document lists POST /urls/discover with its summary."""
        spec_reply = await client.get("/openapi.json")
        assert spec_reply.status_code == 200

        paths = spec_reply.json()["paths"]
        assert "/urls/discover" in paths

        operations = paths["/urls/discover"]
        assert "post" in operations
        assert operations["post"]["summary"] == "URL Discovery and Seeding"

    async def test_basic_url_discovery_happy_path(self, client):
        """A sitemap-only request with a small max_urls returns a list."""
        seeding = {
            "source": "sitemap",
            "max_urls": 5,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        # The list may be empty: discovery can legitimately find nothing,
        # so only the status and the body type are checked.
        await _post_discover_ok(client, payload)

    async def test_minimal_request_with_defaults(self, client):
        """A request carrying only ``domain`` (no seeding_config) returns a list."""
        await _post_discover_ok(client, {"domain": "example.com"})

    async def test_advanced_configuration(self, client):
        """A request with many seeding options returns well-formed entries."""
        seeding = {
            "source": "sitemap+cc",
            "pattern": "*/docs/*",
            "extract_head": True,
            "max_urls": 3,
            "live_check": True,
            "concurrency": 50,
            "hits_per_sec": 5,
            "verbose": True,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        # Any returned entry must at least be a dict with a "url" key.
        _assert_url_entries(await _post_discover_ok(client, payload))

    async def test_bm25_scoring_configuration(self, client):
        """A BM25-scored request returns well-formed entries."""
        seeding = {
            "source": "sitemap",
            "extract_head": True,  # Required for scoring
            "query": "documentation",
            "scoring_method": "bm25",
            "score_threshold": 0.1,
            "max_urls": 5,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        # A score field is deliberately not asserted: whether scoring adds one
        # depends on the implementation.
        _assert_url_entries(await _post_discover_ok(client, payload))

    async def test_missing_required_domain_field(self, client):
        """Omitting ``domain`` yields 422 with a detail entry mentioning it."""
        payload = {
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5,
            },
        }
        reply = await client.post("/urls/discover", json=payload)
        assert reply.status_code == 422  # Validation error

        body = reply.json()
        assert "detail" in body
        mentions_domain = ["domain" in str(item).lower() for item in body["detail"]]
        assert any(mentions_domain)

    async def test_invalid_request_body_structure(self, client):
        """A body made only of unknown fields yields 422 with a detail key."""
        payload = {
            "invalid_field": "test_value",
            "another_invalid": 123,
        }
        reply = await client.post("/urls/discover", json=payload)
        assert reply.status_code == 422  # Validation error
        assert "detail" in reply.json()

    async def test_invalid_seeding_config_parameters(self, client):
        """Invalid seeding values give either a 200 list or a 500 with detail."""
        seeding = {
            "source": "invalid_source",  # Invalid source
            "max_urls": "not_a_number",  # Invalid type
        }
        payload = {"domain": "example.com", "seeding_config": seeding}
        reply = await client.post("/urls/discover", json=payload)

        # Both outcomes are accepted: 200 with a (possibly empty) list, or
        # 500 carrying error details.
        assert reply.status_code in [200, 500]

        body = reply.json()
        if reply.status_code == 200:
            assert isinstance(body, list)
        else:
            assert "detail" in body

    async def test_empty_seeding_config(self, client):
        """An empty seeding_config object still returns a list."""
        payload = {"domain": "example.com", "seeding_config": {}}
        await _post_discover_ok(client, payload)

    async def test_response_structure_consistency(self, client):
        """Three identical requests each return a list of well-formed entries."""
        seeding = {
            "source": "sitemap",
            "max_urls": 1,
        }
        payload = {"domain": "example.com", "seeding_config": seeding}

        # Repeat the same request to check the shape on every response.
        for _attempt in range(3):
            _assert_url_entries(await _post_discover_ok(client, payload))

    async def test_content_type_validation(self, client):
        """A form-encoded (non-JSON) body is answered with 422."""
        form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
        reply = await client.post(
            "/urls/discover",
            content="domain=example.com",
            headers=form_headers,
        )
        assert reply.status_code == 422
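# Standalone test runner for executing the suite directly with `python`, without the
# pytest runner (pytest must still be importable, since this module imports it)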
def _describe_failure(exc):
    """Return a non-empty description of ``exc`` for console output.

    A bare ``assert`` raises ``AssertionError`` with an empty message, so
    printing ``str(exc)`` alone would show nothing; in that case the exception
    class name is returned on its own.
    """
    detail = str(exc)
    name = type(exc).__name__
    return f"{name}: {detail}" if detail else name


async def run_tests_standalone():
    """Run the suite directly, outside the pytest runner.

    The server's /health route is checked first; if it cannot be reached or
    does not answer 200, no tests are run. Otherwise each test method is
    awaited in turn with a shared client and its outcome is printed.

    Returns:
        True when the health check and every test passed, False otherwise.
    """
    print("🧪 Running URL Discovery Endpoint Tests")
    print("=" * 50)

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        # Check server health first. The status is tested explicitly rather
        # than with ``assert`` so the gate still works under ``python -O`` and
        # the failure line always says what went wrong.
        try:
            response = await client.get("/health")
        except Exception as e:
            print(f"❌ Server health check failed: {_describe_failure(e)}")
            return False
        if response.status_code != 200:
            print(f"❌ Server health check failed: HTTP {response.status_code}")
            return False
        print("✅ Server health check passed")

        test_suite = TestURLDiscoveryEndpoint()

        # Run tests manually, in the same order they are defined on the class.
        tests = [
            ("Endpoint exists", test_suite.test_endpoint_exists),
            ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
            ("Minimal request", test_suite.test_minimal_request_with_defaults),
            ("Advanced configuration", test_suite.test_advanced_configuration),
            ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
            ("Missing domain error", test_suite.test_missing_required_domain_field),
            ("Invalid request body", test_suite.test_invalid_request_body_structure),
            ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
            ("Empty config", test_suite.test_empty_seeding_config),
            ("Response consistency", test_suite.test_response_structure_consistency),
            ("Content type validation", test_suite.test_content_type_validation),
        ]

        passed = 0
        failed = 0
        for test_name, test_func in tests:
            try:
                await test_func(client)
            except Exception as e:
                # Broad catch is deliberate: one failing test must not stop
                # the remaining ones from running.
                print(f"❌ {test_name}: {_describe_failure(e)}")
                failed += 1
            else:
                print(f"✅ {test_name}")
                passed += 1

    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
    return failed == 0
if __name__ == "__main__":
    # Run tests standalone and report the outcome through the process exit
    # status: 0 when everything passed, 1 otherwise.
    # SystemExit is raised directly because the bare ``exit`` name is only
    # injected by the ``site`` module and is missing under ``python -S`` or in
    # embedded interpreters.
    success = asyncio.run(run_tests_standalone())
    raise SystemExit(0 if success else 1)