feat: Add comprehensive tests for URL discovery and virtual scroll functionality
239
tests/example_url_discovery.py
Normal file
@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.

This script demonstrates how to use the new URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""

import asyncio
import json

import httpx

# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"


async def discover_urls_basic_example():
    """Basic example of URL discovery."""
    print("🔍 Basic URL Discovery Example")
    print("=" * 50)

    # Basic discovery request
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",  # Use the sitemap for fast discovery
            "max_urls": 10        # Limit to 10 URLs
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=30.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} URLs")

            # Display the first few URLs
            for i, url_obj in enumerate(urls[:3]):
                print(f"  {i + 1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


async def discover_urls_advanced_example():
    """Advanced example with filtering and metadata extraction."""
    print("\n🎯 Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",   # Filter to news articles only
            "extract_head": True,    # Extract page metadata
            "max_urls": 5,
            "live_check": True,      # Verify URLs are accessible
            "verbose": True
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n  {i + 1}. URL: {url_obj.get('url', 'N/A')}")
                print(f"     Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    description = head_data.get('description', 'No description')
                    print(f"     Title: {title[:60]}...")
                    print(f"     Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


async def discover_urls_with_scoring_example():
    """Example using BM25 relevance scoring."""
    print("\n🏆 URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,          # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,        # Minimum relevance score
            "max_urls": 5
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} relevant URLs")

            # Display URLs sorted by relevance score
            for i, url_obj in enumerate(urls[:3]):
                score = url_obj.get('score', 0)
                print(f"\n  {i + 1}. Score: {score:.3f}")
                print(f"     URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    print(f"     Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


def demonstrate_request_schema():
    """Show the complete request schema with all options."""
    print("\n📋 Complete Request Schema")
    print("=" * 50)

    complete_schema = {
        "domain": "example.com",      # Required: domain to discover URLs from
        "seeding_config": {           # Optional: configuration object
            # Discovery sources
            "source": "sitemap+cc",   # "sitemap", "cc", or "sitemap+cc"

            # Filtering options
            "pattern": "*/blog/*",         # URL pattern filter (glob style)
            "max_urls": 50,                # Maximum URLs to return (-1 = no limit)
            "filter_nonsense_urls": True,  # Filter out nonsense URLs

            # Metadata and validation
            "extract_head": True,     # Extract <head> metadata
            "live_check": True,       # Verify URL accessibility

            # Performance and rate limiting
            "concurrency": 100,       # Concurrent requests
            "hits_per_sec": 10,       # Rate limit (requests/second)
            "force": False,           # Bypass cache

            # Relevance scoring (requires extract_head=True)
            "query": "search terms",  # Query for BM25 scoring
            "scoring_method": "bm25", # Scoring algorithm
            "score_threshold": 0.2,   # Minimum score threshold

            # Debugging
            "verbose": True           # Enable verbose logging
        }
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))


async def main():
    """Run all examples."""
    print("🚀 URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Check that the server is running
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✅ Server is running\n")
        except Exception as e:
            print(f"❌ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print("  docker compose up crawl4ai -d")
            return

    # Run the examples
    await discover_urls_basic_example()
    await discover_urls_advanced_example()
    await discover_urls_with_scoring_example()

    # Show the schema
    demonstrate_request_schema()

    print("\n🎉 Examples complete!")
    print("\nNext steps:")
    print("1. Use discovered URLs with the /crawl endpoint")
    print("2. Filter URLs based on your specific needs")
    print("3. Combine with other API endpoints for complete workflows")


if __name__ == "__main__":
    asyncio.run(main())
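The example's "Next steps" point discovered URLs at the /crawl endpoint. A minimal hand-off sketch, assuming the /crawl payload shape used by the virtual-scroll tests later in this commit; the /crawl response schema is an assumption and is not parsed here.

# Hedged sketch (not part of the committed file): pipe discovered URLs into
# /crawl. The payload shape mirrors tests/test_virtual_scroll_api.py below.
import asyncio
import httpx

BASE_URL = "http://localhost:11235"

async def crawl_discovered_urls(urls: list) -> None:
    payload = {
        "urls": [u["url"] for u in urls[:3]],  # crawl a small sample
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"},
        },
    }
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(f"{BASE_URL}/crawl", json=payload)
        response.raise_for_status()
        print(f"Crawled {len(payload['urls'])} URLs: HTTP {response.status_code}")

# Usage:
#   urls = asyncio.run(discover_urls_basic_example())
#   asyncio.run(crawl_discovered_urls(urls))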
160
tests/test_implementation.py
Normal file
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Test script for the new URL discovery functionality.
This tests the handler function directly without running the full server.
"""

import asyncio
import sys
from pathlib import Path

# Add the repo root to the Python path (this file lives in tests/)
repo_root = Path(__file__).parent.parent
sys.path.insert(0, str(repo_root))
sys.path.insert(0, str(repo_root / "deploy" / "docker"))

from rich.console import Console

console = Console()


async def test_url_discovery_handler():
    """Test the URL discovery handler function directly."""
    try:
        # Import the handler function and dependencies
        from api import handle_url_discovery
        from crawl4ai.async_configs import SeedingConfig  # noqa: F401 (import check)

        console.print("[bold cyan]Testing URL Discovery Handler Function[/bold cyan]")

        # Test 1: Basic functionality
        console.print("\n[cyan]Test 1: Basic URL discovery[/cyan]")

        domain = "docs.crawl4ai.com"
        seeding_config = {
            "source": "sitemap",
            "max_urls": 3,
            "verbose": True
        }

        console.print(f"[blue]Domain:[/blue] {domain}")
        console.print(f"[blue]Config:[/blue] {seeding_config}")

        # Call the handler directly
        result = await handle_url_discovery(domain, seeding_config)

        console.print("[green]✓ Handler executed successfully[/green]")
        console.print(f"[green]✓ Result type: {type(result)}[/green]")
        console.print(f"[green]✓ Result length: {len(result)}[/green]")

        # Print the first few results, if any
        if result:
            console.print("\n[blue]Sample results:[/blue]")
            for i, url_obj in enumerate(result[:2]):
                console.print(f"  {i + 1}. {url_obj}")

        return True

    except ImportError as e:
        console.print(f"[red]✗ Import error: {e}[/red]")
        console.print("[yellow]This suggests missing dependencies or module structure issues[/yellow]")
        return False
    except Exception as e:
        console.print(f"[red]✗ Handler error: {e}[/red]")
        return False


async def test_seeding_config_validation():
    """Test SeedingConfig validation."""
    try:
        from crawl4ai.async_configs import SeedingConfig

        console.print("\n[cyan]Test 2: SeedingConfig validation[/cyan]")

        # Test a valid config
        valid_config = {
            "source": "sitemap",
            "max_urls": 5,
            "pattern": "*"
        }

        config = SeedingConfig(**valid_config)
        console.print(f"[green]✓ Valid config created: {config.source}, max_urls={config.max_urls}[/green]")

        # Test an invalid config
        try:
            invalid_config = {
                "source": "invalid_source",
                "max_urls": 5
            }
            config = SeedingConfig(**invalid_config)
            console.print("[yellow]? Invalid config unexpectedly accepted[/yellow]")
        except Exception as e:
            console.print(f"[green]✓ Invalid config correctly rejected: {str(e)[:50]}...[/green]")

        return True

    except Exception as e:
        console.print(f"[red]✗ SeedingConfig test error: {e}[/red]")
        return False


async def test_schema_validation():
    """Test the URLDiscoveryRequest schema."""
    try:
        from schemas import URLDiscoveryRequest

        console.print("\n[cyan]Test 3: URLDiscoveryRequest schema validation[/cyan]")

        # Test a valid request
        valid_request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 10
            }
        }

        request = URLDiscoveryRequest(**valid_request_data)
        console.print(f"[green]✓ Valid request created: domain={request.domain}[/green]")

        # Test a request that relies on the default config
        minimal_request_data = {
            "domain": "example.com"
        }

        request = URLDiscoveryRequest(**minimal_request_data)
        console.print("[green]✓ Minimal request created with defaults[/green]")

        return True

    except Exception as e:
        console.print(f"[red]✗ Schema test error: {e}[/red]")
        return False


async def main():
    """Run all tests."""
    console.print("[bold blue]🔍 URL Discovery Implementation Tests[/bold blue]")

    results = []

    # Test the implementation components
    results.append(await test_seeding_config_validation())
    results.append(await test_schema_validation())
    results.append(await test_url_discovery_handler())

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    passed = sum(results)
    total = len(results)

    if passed == total:
        console.print(f"[bold green]✓ All {total} implementation tests passed![/bold green]")
        console.print("[green]The URL discovery endpoint is ready for integration testing[/green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return passed == total


if __name__ == "__main__":
    asyncio.run(main())
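For context, `handle_url_discovery` presumably wraps crawl4ai's URL seeder. A direct library-level sketch, assuming the `AsyncUrlSeeder` interface from the crawl4ai URL-seeding docs (the context-manager form and `urls()` signature are assumptions):

# Hedged sketch: exercise the seeding layer directly, without the HTTP handler.
# Assumes crawl4ai exposes AsyncUrlSeeder/SeedingConfig as in its docs.
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig

async def seed_directly() -> None:
    config = SeedingConfig(source="sitemap", max_urls=3, verbose=True)
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("docs.crawl4ai.com", config)
    for url_obj in urls:
        print(url_obj.get("url"))

if __name__ == "__main__":
    asyncio.run(seed_directly())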
193
tests/test_url_discovery.py
Normal file
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Test script for the new /urls/discover endpoint in the Crawl4AI Docker API.
"""

import asyncio
import json

import httpx
from rich.console import Console, Group
from rich.panel import Panel
from rich.syntax import Syntax

console = Console()

# Configuration
BASE_URL = "http://localhost:11235"
TEST_DOMAIN = "docs.crawl4ai.com"


async def check_server_health(client: httpx.AsyncClient) -> bool:
    """Check if the server is healthy."""
    console.print("[bold cyan]Checking server health...[/]", end="")
    try:
        response = await client.get("/health", timeout=10.0)
        response.raise_for_status()
        console.print(" [bold green]✓ Server is healthy![/]")
        return True
    except Exception as e:
        console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
        console.print(f"Is the server running at {BASE_URL}?")
        return False


def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Pretty print the request."""
    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        # Group keeps the Syntax renderable intact instead of collapsing it
        # into a plain string via f-string interpolation.
        Group(f"[cyan]POST {endpoint}[/cyan]", syntax),
        title=f"[bold blue]{title}[/]",
        border_style="blue"
    ))


def print_response(response_data: dict, title: str = "Response"):
    """Pretty print the response."""
    syntax = Syntax(json.dumps(response_data, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        syntax,
        title=f"[bold green]{title}[/]",
        border_style="green"
    ))


async def test_urls_discover_basic():
    """Test basic URL discovery functionality."""
    console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        # Check server health first
        if not await check_server_health(client):
            return False

        # Test 1: Basic discovery with sitemap
        console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")

        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        print_request("/urls/discover", payload, "Basic Discovery Request")

        try:
            response = await client.post("/urls/discover", json=payload)
            response.raise_for_status()
            response_data = response.json()

            print_response(response_data, "Basic Discovery Response")

            # Validate the response structure
            if isinstance(response_data, list):
                console.print(f"[green]✓ Discovered {len(response_data)} URLs[/green]")
                return True
            else:
                console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
                return False

        except httpx.HTTPStatusError as e:
            console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
            return False
        except Exception as e:
            console.print(f"[red]✗ Error: {e}[/red]")
            return False


async def test_urls_discover_invalid_config():
    """Test URL discovery with an invalid configuration."""
    console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": 5
            }
        }

        print_request("/urls/discover", payload, "Invalid Config Request")

        try:
            response = await client.post("/urls/discover", json=payload)

            if response.status_code == 500:
                console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
                return True
            else:
                console.print(f"[yellow]? Expected 500 error, got {response.status_code}[/yellow]")
                response_data = response.json()
                print_response(response_data, "Unexpected Response")
                return False

        except Exception as e:
            console.print(f"[red]✗ Unexpected error: {e}[/red]")
            return False


async def test_urls_discover_with_filtering():
    """Test URL discovery with advanced filtering."""
    console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
        payload = {
            "domain": TEST_DOMAIN,
            "seeding_config": {
                "source": "sitemap",
                "pattern": "*/docs/*",  # Filter to docs URLs only
                "extract_head": True,   # Extract metadata
                "max_urls": 3
            }
        }

        print_request("/urls/discover", payload, "Filtered Discovery Request")

        try:
            response = await client.post("/urls/discover", json=payload)
            response.raise_for_status()
            response_data = response.json()

            print_response(response_data, "Filtered Discovery Response")

            # Validate the response structure with metadata
            if isinstance(response_data, list) and len(response_data) > 0:
                sample_url = response_data[0]
                if "url" in sample_url:
                    console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
                    return True
                else:
                    console.print("[red]✗ URL objects missing expected fields[/red]")
                    return False
            else:
                console.print("[yellow]? No URLs found with filter pattern[/yellow]")
                return True  # This could be expected

        except httpx.HTTPStatusError as e:
            console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
            return False
        except Exception as e:
            console.print(f"[red]✗ Error: {e}[/red]")
            return False


async def main():
    """Run all tests."""
    console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")

    results = []

    # Run the tests
    results.append(await test_urls_discover_basic())
    results.append(await test_urls_discover_invalid_config())
    results.append(await test_urls_discover_with_filtering())

    # Summary
    console.print("\n[bold cyan]Test Summary[/bold cyan]")
    passed = sum(results)
    total = len(results)

    if passed == total:
        console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
    else:
        console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")

    return passed == total


if __name__ == "__main__":
    asyncio.run(main())
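The tests above read `url`, `status`, `head_data`, and (in the scored examples) `score` from each result object. A hedged sketch collecting that assumed item shape in one place; the field names come from the tests, but which fields are optional is an assumption:

# Hedged sketch of the result-item shape the tests above rely on.
from typing import Any, Dict, TypedDict

class DiscoveredURL(TypedDict, total=False):
    url: str                   # asserted present by the tests
    status: str                # populated when live_check is enabled
    head_data: Dict[str, Any]  # title/description when extract_head=True
    score: float               # BM25 relevance when query/scoring_method are set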
286
tests/test_url_discovery_e2e.py
Normal file
@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
End-to-end tests for the URL Discovery endpoint.

This test suite verifies the complete functionality of the /urls/discover
endpoint, including happy-path scenarios and error handling.
"""

import asyncio

import httpx
import pytest

# Test configuration
BASE_URL = "http://localhost:11235"
TEST_TIMEOUT = 30.0


class TestURLDiscoveryEndpoint:
    """End-to-end test suite for the URL Discovery endpoint."""

    # NOTE: the async fixture and async tests below require pytest-asyncio
    # (e.g. asyncio_mode=auto); see the runner sketch after this file.
    @pytest.fixture
    async def client(self):
        """Create an async HTTP client for testing."""
        async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
            yield client

    async def test_server_health(self, client):
        """Test that the server is healthy before running other tests."""
        response = await client.get("/health")
        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "ok"

    async def test_endpoint_exists(self, client):
        """Test that the /urls/discover endpoint exists and is documented."""
        # Check that the OpenAPI spec includes our endpoint
        response = await client.get("/openapi.json")
        assert response.status_code == 200

        openapi_spec = response.json()
        assert "/urls/discover" in openapi_spec["paths"]

        endpoint_spec = openapi_spec["paths"]["/urls/discover"]
        assert "post" in endpoint_spec
        assert endpoint_spec["post"]["summary"] == "URL Discovery and Seeding"

    async def test_basic_url_discovery_happy_path(self, client):
        """Test basic URL discovery with minimal configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)
        # Note: we don't assert length > 0 because URL discovery
        # may legitimately return empty results

    async def test_minimal_request_with_defaults(self, client):
        """Test that a minimal request works with the default seeding_config."""
        request_data = {
            "domain": "example.com"
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

    async def test_advanced_configuration(self, client):
        """Test advanced configuration options."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap+cc",
                "pattern": "*/docs/*",
                "extract_head": True,
                "max_urls": 3,
                "live_check": True,
                "concurrency": 50,
                "hits_per_sec": 5,
                "verbose": True
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

        # If URLs are returned, they should have the expected structure
        for url_obj in data:
            assert isinstance(url_obj, dict)
            # Should have at least a URL field
            assert "url" in url_obj

    async def test_bm25_scoring_configuration(self, client):
        """Test BM25 relevance scoring configuration."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "extract_head": True,  # Required for scoring
                "query": "documentation",
                "scoring_method": "bm25",
                "score_threshold": 0.1,
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

        # If URLs are returned with scoring, check the structure
        for url_obj in data:
            assert isinstance(url_obj, dict)
            assert "url" in url_obj
            # Scoring may or may not add a score field depending on implementation

    async def test_missing_required_domain_field(self, client):
        """Test error handling when the required domain field is missing."""
        request_data = {
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 5
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 422  # Validation error

        error_data = response.json()
        assert "detail" in error_data
        assert any("domain" in str(error).lower() for error in error_data["detail"])

    async def test_invalid_request_body_structure(self, client):
        """Test error handling with a completely invalid request body."""
        invalid_request = {
            "invalid_field": "test_value",
            "another_invalid": 123
        }

        response = await client.post("/urls/discover", json=invalid_request)
        assert response.status_code == 422  # Validation error

        error_data = response.json()
        assert "detail" in error_data

    async def test_invalid_seeding_config_parameters(self, client):
        """Test handling of invalid seeding configuration parameters."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "invalid_source",  # Invalid source
                "max_urls": "not_a_number"   # Invalid type
            }
        }

        response = await client.post("/urls/discover", json=request_data)
        # The endpoint should handle this gracefully.
        # It may return 200 with empty results or 500 with error details.
        assert response.status_code in [200, 500]

        if response.status_code == 200:
            data = response.json()
            assert isinstance(data, list)
            # May be empty due to the invalid config
        else:
            # Should have error details
            error_data = response.json()
            assert "detail" in error_data

    async def test_empty_seeding_config(self, client):
        """Test with an empty seeding_config object."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {}
        }

        response = await client.post("/urls/discover", json=request_data)
        assert response.status_code == 200

        data = response.json()
        assert isinstance(data, list)

    async def test_response_structure_consistency(self, client):
        """Test that the response structure is consistent."""
        request_data = {
            "domain": "example.com",
            "seeding_config": {
                "source": "sitemap",
                "max_urls": 1
            }
        }

        # Make multiple requests to ensure consistency
        for _ in range(3):
            response = await client.post("/urls/discover", json=request_data)
            assert response.status_code == 200

            data = response.json()
            assert isinstance(data, list)

            # If there are results, check that they have a consistent structure
            for url_obj in data:
                assert isinstance(url_obj, dict)
                assert "url" in url_obj

    async def test_content_type_validation(self, client):
        """Test that the endpoint requires a JSON content type."""
        # Test with the wrong content type
        response = await client.post(
            "/urls/discover",
            content="domain=example.com",
            headers={"Content-Type": "application/x-www-form-urlencoded"}
        )
        assert response.status_code == 422


# Standalone test runner for when pytest is not available
async def run_tests_standalone():
    """Run the tests without the pytest framework."""
    print("🧪 Running URL Discovery Endpoint Tests")
    print("=" * 50)

    # Check server health first
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        try:
            response = await client.get("/health")
            assert response.status_code == 200
            print("✅ Server health check passed")
        except Exception as e:
            print(f"❌ Server health check failed: {e}")
            return False

    test_suite = TestURLDiscoveryEndpoint()

    # Run the tests manually
    tests = [
        ("Endpoint exists", test_suite.test_endpoint_exists),
        ("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
        ("Minimal request", test_suite.test_minimal_request_with_defaults),
        ("Advanced configuration", test_suite.test_advanced_configuration),
        ("BM25 scoring", test_suite.test_bm25_scoring_configuration),
        ("Missing domain error", test_suite.test_missing_required_domain_field),
        ("Invalid request body", test_suite.test_invalid_request_body_structure),
        ("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
        ("Empty config", test_suite.test_empty_seeding_config),
        ("Response consistency", test_suite.test_response_structure_consistency),
        ("Content type validation", test_suite.test_content_type_validation),
    ]

    passed = 0
    failed = 0

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
        for test_name, test_func in tests:
            try:
                await test_func(client)
                print(f"✅ {test_name}")
                passed += 1
            except Exception as e:
                print(f"❌ {test_name}: {e}")
                failed += 1

    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
    return failed == 0


if __name__ == "__main__":
    # Run the tests standalone
    success = asyncio.run(run_tests_standalone())
    raise SystemExit(0 if success else 1)
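The async fixture and tests above need an asyncio-aware pytest. A minimal programmatic runner, assuming the pytest-asyncio plugin is installed:

# Hedged runner sketch: invoke pytest with asyncio auto mode so the async
# fixture and tests above are collected without per-test marks.
# Assumes the pytest-asyncio plugin is installed.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "-v",
        "-o", "asyncio_mode=auto",
        "tests/test_url_discovery_e2e.py",
    ]))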
170
tests/test_virtual_scroll_api.py
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Test script for VirtualScrollConfig with the /crawl API endpoint
"""

import http.server
import json
import os
import socketserver
import tempfile
import threading
import time

import requests


def test_virtual_scroll_api():
    """Test the /crawl endpoint with VirtualScrollConfig"""

    # Create a simple HTML page with virtual scroll for testing
    test_html = '''
<html>
<head>
    <style>
        #container {
            height: 300px;
            overflow-y: auto;
            border: 1px solid #ccc;
        }
        .item {
            height: 30px;
            padding: 5px;
            border-bottom: 1px solid #eee;
        }
    </style>
</head>
<body>
    <h1>Virtual Scroll Test</h1>
    <div id="container">
        <div class="item">Item 1</div>
        <div class="item">Item 2</div>
        <div class="item">Item 3</div>
        <div class="item">Item 4</div>
        <div class="item">Item 5</div>
    </div>
    <script>
        // Simple script to simulate virtual scroll
        const container = document.getElementById('container');
        let itemCount = 5;

        // Add more items when scrolling
        container.addEventListener('scroll', function() {
            if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
                for (let i = 0; i < 5; i++) {
                    itemCount++;
                    const newItem = document.createElement('div');
                    newItem.className = 'item';
                    newItem.textContent = `Item ${itemCount}`;
                    container.appendChild(newItem);
                }
            }
        });

        // Initial scroll to trigger loading
        setTimeout(() => {
            container.scrollTop = container.scrollHeight;
        }, 100);
    </script>
</body>
</html>
'''

    # Save the HTML to a temporary file and serve it
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        temp_file = f.name

    # Start a local server in the temp file's directory
    os.chdir(os.path.dirname(temp_file))
    port = 8080

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass  # Suppress log messages

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(target=httpd.serve_forever)
            server_thread.daemon = True
            server_thread.start()

            time.sleep(0.5)  # Give the server time to start

            # Now test the API against the locally served test page
            url = f"http://localhost:{port}/{os.path.basename(temp_file)}"

            payload = {
                "urls": [url],
                "browser_config": {
                    "type": "BrowserConfig",
                    "params": {
                        "headless": True,
                        "viewport_width": 1920,
                        "viewport_height": 1080
                    }
                },
                "crawler_config": {
                    "type": "CrawlerRunConfig",
                    "params": {
                        "virtual_scroll_config": {
                            "type": "VirtualScrollConfig",
                            "params": {
                                "container_selector": "#container",
                                "scroll_count": 3,
                                "scroll_by": "container_height",
                                "wait_after_scroll": 0.5
                            }
                        },
                        "cache_mode": "bypass",
                        "extraction_strategy": {
                            "type": "NoExtractionStrategy",
                            "params": {}
                        }
                    }
                }
            }

            print("Testing VirtualScrollConfig with /crawl endpoint...")
            print(f"Test URL: {url}")
            print("Payload:")
            print(json.dumps(payload, indent=2))

            response = requests.post(
                "http://localhost:11235/crawl",  # default Crawl4AI port, matching the other tests
                json=payload,
                headers={"Content-Type": "application/json"}
            )

            print(f"\nResponse Status: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                print("✅ Success! VirtualScrollConfig is working.")
                print(f"Content length: {len(result[0]['content']['raw_content'])} characters")

                # Check whether virtual scroll captured more content
                if "Item 10" in result[0]['content']['raw_content']:
                    print("✅ Virtual scroll successfully captured additional content!")
                else:
                    print("⚠️ Virtual scroll may not have worked as expected")

                # Print a snippet of the content
                content_preview = result[0]['content']['raw_content'][:500] + "..."
                print(f"\nContent preview:\n{content_preview}")

            else:
                print(f"❌ Error: {response.status_code}")
                print(f"Response: {response.text}")

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
    finally:
        # Cleanup
        try:
            os.unlink(temp_file)
        except OSError:
            pass


if __name__ == "__main__":
    test_virtual_scroll_api()
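The same configuration can also be driven through the Python SDK rather than the HTTP API. A sketch assuming crawl4ai's `AsyncWebCrawler`/`VirtualScrollConfig` interface, with the parameter names and values taken from the JSON payload above; the local URL in the usage line is hypothetical:

# Hedged SDK-side sketch mirroring the JSON payload above; assumes the
# crawl4ai Python API (AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig).
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, VirtualScrollConfig

async def crawl_with_virtual_scroll(url: str) -> None:
    run_config = CrawlerRunConfig(
        virtual_scroll_config=VirtualScrollConfig(
            container_selector="#container",
            scroll_count=3,
            scroll_by="container_height",
            wait_after_scroll=0.5,
        ),
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=run_config)
        print(f"Captured {len(result.html or '')} characters of HTML")

if __name__ == "__main__":
    # Point this at the locally served test page from the script above.
    asyncio.run(crawl_with_virtual_scroll("http://localhost:8080/test.html"))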
117
tests/test_virtual_scroll_api_simple.py
Normal file
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Test VirtualScrollConfig with the /crawl API using existing test assets
"""

import http.server
import json
import os
import socketserver
import threading
import time
from pathlib import Path

import requests


def test_virtual_scroll_api():
    """Test the /crawl endpoint with VirtualScrollConfig using test assets"""

    # Use the existing test assets (docs/ lives at the repo root, one level
    # above tests/)
    assets_dir = Path(__file__).parent.parent / "docs" / "examples" / "assets"
    if not assets_dir.exists():
        print(f"❌ Assets directory not found: {assets_dir}")
        return

    # Start a local server for the assets
    os.chdir(assets_dir)
    port = 8081

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            pass  # Suppress log messages

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(target=httpd.serve_forever)
            server_thread.daemon = True
            server_thread.start()

            time.sleep(0.5)  # Give the server time to start

            # Test with the Twitter-like virtual scroll asset, served locally
            url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"

            payload = {
                "urls": [url],
                "browser_config": {
                    "type": "BrowserConfig",
                    "params": {
                        "headless": True,
                        "viewport_width": 1280,
                        "viewport_height": 800
                    }
                },
                "crawler_config": {
                    "type": "CrawlerRunConfig",
                    "params": {
                        "virtual_scroll_config": {
                            "type": "VirtualScrollConfig",
                            "params": {
                                "container_selector": "#timeline",
                                "scroll_count": 10,
                                "scroll_by": "container_height",
                                "wait_after_scroll": 0.3
                            }
                        },
                        "cache_mode": "bypass",
                        "extraction_strategy": {
                            "type": "NoExtractionStrategy",
                            "params": {}
                        }
                    }
                }
            }

            print("Testing VirtualScrollConfig with /crawl endpoint...")
            print(f"Test URL: {url}")
            print("Payload:")
            print(json.dumps(payload, indent=2))

            response = requests.post(
                "http://localhost:11235/crawl",  # default Crawl4AI port, matching the other tests
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=60  # Longer timeout for virtual scroll
            )

            print(f"\nResponse Status: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                print("✅ Success! VirtualScrollConfig is working with the API.")
                print(f"Content length: {len(result[0]['content']['raw_content'])} characters")

                # Check whether we captured multiple posts (indicating virtual scroll worked)
                content = result[0]['content']['raw_content']
                post_count = content.count("Post #")
                print(f"Found {post_count} posts in the content")

                if post_count > 5:  # Should capture more than just the initial posts
                    print("✅ Virtual scroll successfully captured additional content!")
                else:
                    print("⚠️ Virtual scroll may not have captured much additional content")

                # Print a snippet of the content
                content_preview = content[:1000] + "..." if len(content) > 1000 else content
                print(f"\nContent preview:\n{content_preview}")

            else:
                print(f"❌ Error: {response.status_code}")
                print(f"Response: {response.text}")

    except requests.exceptions.Timeout:
        print("❌ Request timed out - virtual scroll may be taking too long")
    except Exception as e:
        print(f"❌ Test failed with error: {e}")


if __name__ == "__main__":
    test_virtual_scroll_api()