feat: Add comprehensive tests for URL discovery and virtual scroll functionality

2025-10-16 10:35:48 +08:00
parent 674d0741da
commit 74eeff4c51
6 changed files with 287 additions and 0 deletions
--- a/tests/test_url_discovery_e2e.py
+++ b/tests/test_url_discovery_e2e.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+"""
+End-to-end tests for the URL Discovery endpoint.
+
+This test suite verifies the complete functionality of the /urls/discover endpoint
+including happy path scenarios and error handling.
+"""
+
+import asyncio
+import httpx
+import json
+import pytest
+from typing import Dict, Any
+
+# Test configuration
+BASE_URL = "http://localhost:11235"
+TEST_TIMEOUT = 30.0
+
+
+class TestURLDiscoveryEndpoint:
+    """End-to-end test suite for URL Discovery endpoint."""
+    
+    @pytest.fixture
+    async def client(self):
+        """Create an async HTTP client for testing."""
+        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
+            yield client
+    
+    async def test_server_health(self, client):
+        """Test that the server is healthy before running other tests."""
+        response = await client.get("/health")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["status"] == "ok"
+    
+    async def test_endpoint_exists(self, client):
+        """Test that the /urls/discover endpoint exists and is documented."""
+        # Check OpenAPI spec includes our endpoint
+        response = await client.get("/openapi.json")
+        assert response.status_code == 200
+        
+        openapi_spec = response.json()
+        assert "/urls/discover" in openapi_spec["paths"]
+        
+        endpoint_spec = openapi_spec["paths"]["/urls/discover"]
+        assert "post" in endpoint_spec
+        assert endpoint_spec["post"]["summary"] == "URL Discovery and Seeding"
+    
+    async def test_basic_url_discovery_happy_path(self, client):
+        """Test basic URL discovery with minimal configuration."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {
+                "source": "sitemap",
+                "max_urls": 5
+            }
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert isinstance(data, list)
+        # Note: We don't assert length > 0 because URL discovery 
+        # may legitimately return empty results
+    
+    async def test_minimal_request_with_defaults(self, client):
+        """Test that minimal request works with default seeding_config."""
+        request_data = {
+            "domain": "example.com"
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert isinstance(data, list)
+    
+    async def test_advanced_configuration(self, client):
+        """Test advanced configuration options."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {
+                "source": "sitemap+cc",
+                "pattern": "*/docs/*",
+                "extract_head": True,
+                "max_urls": 3,
+                "live_check": True,
+                "concurrency": 50,
+                "hits_per_sec": 5,
+                "verbose": True
+            }
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert isinstance(data, list)
+        
+        # If URLs are returned, they should have the expected structure
+        for url_obj in data:
+            assert isinstance(url_obj, dict)
+            # Should have at least a URL field
+            assert "url" in url_obj
+    
+    async def test_bm25_scoring_configuration(self, client):
+        """Test BM25 relevance scoring configuration."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {
+                "source": "sitemap",
+                "extract_head": True,  # Required for scoring
+                "query": "documentation",
+                "scoring_method": "bm25",
+                "score_threshold": 0.1,
+                "max_urls": 5
+            }
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert isinstance(data, list)
+        
+        # If URLs are returned with scoring, check structure
+        for url_obj in data:
+            assert isinstance(url_obj, dict)
+            assert "url" in url_obj
+            # Scoring may or may not add score field depending on implementation
+    
+    async def test_missing_required_domain_field(self, client):
+        """Test error handling when required domain field is missing."""
+        request_data = {
+            "seeding_config": {
+                "source": "sitemap",
+                "max_urls": 5
+            }
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 422  # Validation error
+        
+        error_data = response.json()
+        assert "detail" in error_data
+        assert any("domain" in str(error).lower() for error in error_data["detail"])
+    
+    async def test_invalid_request_body_structure(self, client):
+        """Test error handling with completely invalid request body."""
+        invalid_request = {
+            "invalid_field": "test_value",
+            "another_invalid": 123
+        }
+        
+        response = await client.post("/urls/discover", json=invalid_request)
+        assert response.status_code == 422  # Validation error
+        
+        error_data = response.json()
+        assert "detail" in error_data
+    
+    async def test_invalid_seeding_config_parameters(self, client):
+        """Test handling of invalid seeding configuration parameters."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {
+                "source": "invalid_source",  # Invalid source
+                "max_urls": "not_a_number"   # Invalid type
+            }
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        # The endpoint should handle this gracefully
+        # It may return 200 with empty results or 500 with error details
+        assert response.status_code in [200, 500]
+        
+        if response.status_code == 200:
+            data = response.json()
+            assert isinstance(data, list)
+            # May be empty due to invalid config
+        else:
+            # Should have error details
+            error_data = response.json()
+            assert "detail" in error_data
+    
+    async def test_empty_seeding_config(self, client):
+        """Test with empty seeding_config object."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {}
+        }
+        
+        response = await client.post("/urls/discover", json=request_data)
+        assert response.status_code == 200
+        
+        data = response.json()
+        assert isinstance(data, list)
+    
+    async def test_response_structure_consistency(self, client):
+        """Test that response structure is consistent."""
+        request_data = {
+            "domain": "example.com",
+            "seeding_config": {
+                "source": "sitemap",
+                "max_urls": 1
+            }
+        }
+        
+        # Make multiple requests to ensure consistency
+        for _ in range(3):
+            response = await client.post("/urls/discover", json=request_data)
+            assert response.status_code == 200
+            
+            data = response.json()
+            assert isinstance(data, list)
+            
+            # If there are results, check they have consistent structure
+            for url_obj in data:
+                assert isinstance(url_obj, dict)
+                assert "url" in url_obj
+    
+    async def test_content_type_validation(self, client):
+        """Test that endpoint requires JSON content type."""
+        # Test with wrong content type
+        response = await client.post(
+            "/urls/discover",
+            content="domain=example.com",
+            headers={"Content-Type": "application/x-www-form-urlencoded"}
+        )
+        assert response.status_code == 422
+
+
+# Standalone test runner for when pytest is not available
+async def run_tests_standalone():
+    """Run tests without pytest framework."""
+    print("🧪 Running URL Discovery Endpoint Tests")
+    print("=" * 50)
+    
+    # Check server health first
+    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
+        try:
+            response = await client.get("/health")
+            assert response.status_code == 200
+            print("✅ Server health check passed")
+        except Exception as e:
+            print(f"❌ Server health check failed: {e}")
+            return False
+    
+    test_suite = TestURLDiscoveryEndpoint()
+    
+    # Run tests manually
+    tests = [
+        ("Endpoint exists", test_suite.test_endpoint_exists),
+        ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
+        ("Minimal request", test_suite.test_minimal_request_with_defaults),
+        ("Advanced configuration", test_suite.test_advanced_configuration),
+        ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
+        ("Missing domain error", test_suite.test_missing_required_domain_field),
+        ("Invalid request body", test_suite.test_invalid_request_body_structure),
+        ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
+        ("Empty config", test_suite.test_empty_seeding_config),
+        ("Response consistency", test_suite.test_response_structure_consistency),
+        ("Content type validation", test_suite.test_content_type_validation),
+    ]
+    
+    passed = 0
+    failed = 0
+    
+    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
+        for test_name, test_func in tests:
+            try:
+                await test_func(client)
+                print(f"✅ {test_name}")
+                passed += 1
+            except Exception as e:
+                print(f"❌ {test_name}: {e}")
+                failed += 1
+    
+    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
+    return failed == 0
+
+
+if __name__ == "__main__":
+    # Run tests standalone
+    success = asyncio.run(run_tests_standalone())
+    exit(0 if success else 1)