feat: Add comprehensive tests for URL discovery and virtual scroll functionality

AHMET YILMAZ
2025-10-16 10:35:48 +08:00
parent 674d0741da
commit 74eeff4c51
6 changed files with 287 additions and 0 deletions

View File

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.
This script demonstrates how to use the new URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""
import asyncio
import httpx
import json
from typing import List, Dict, Any
# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"
async def discover_urls_basic_example():
"""Basic example of URL discovery."""
print("🔍 Basic URL Discovery Example")
print("=" * 50)
# Basic discovery request
request_data = {
"domain": EXAMPLE_DOMAIN,
"seeding_config": {
"source": "sitemap", # Use sitemap for fast discovery
"max_urls": 10 # Limit to 10 URLs
}
}
async with httpx.AsyncClient() as client:
try:
response = await client.post(
f"{BASE_URL}/urls/discover",
json=request_data,
timeout=30.0
)
response.raise_for_status()
urls = response.json()
print(f"✅ Found {len(urls)} URLs")
# Display first few URLs
for i, url_obj in enumerate(urls[:3]):
print(f" {i+1}. {url_obj.get('url', 'N/A')}")
return urls
except httpx.HTTPStatusError as e:
print(f"❌ HTTP Error: {e.response.status_code}")
print(f"Response: {e.response.text}")
return []
except Exception as e:
print(f"❌ Error: {e}")
return []
async def discover_urls_advanced_example():
"""Advanced example with filtering and metadata extraction."""
print("\n🎯 Advanced URL Discovery Example")
print("=" * 50)
# Advanced discovery with filtering
request_data = {
"domain": EXAMPLE_DOMAIN,
"seeding_config": {
"source": "sitemap+cc", # Use both sitemap and Common Crawl
"pattern": "*/news/*", # Filter to news articles only
"extract_head": True, # Extract page metadata
"max_urls": 5,
"live_check": True, # Verify URLs are accessible
"verbose": True
}
}
async with httpx.AsyncClient() as client:
try:
response = await client.post(
f"{BASE_URL}/urls/discover",
json=request_data,
timeout=60.0 # Longer timeout for advanced features
)
response.raise_for_status()
urls = response.json()
print(f"✅ Found {len(urls)} news URLs with metadata")
# Display URLs with metadata
for i, url_obj in enumerate(urls[:3]):
print(f"\n {i+1}. URL: {url_obj.get('url', 'N/A')}")
print(f" Status: {url_obj.get('status', 'unknown')}")
head_data = url_obj.get('head_data', {})
if head_data:
title = head_data.get('title', 'No title')
description = head_data.get('description', 'No description')
print(f" Title: {title[:60]}...")
print(f" Description: {description[:60]}...")
return urls
except httpx.HTTPStatusError as e:
print(f"❌ HTTP Error: {e.response.status_code}")
print(f"Response: {e.response.text}")
return []
except Exception as e:
print(f"❌ Error: {e}")
return []
async def discover_urls_with_scoring_example():
"""Example using BM25 relevance scoring."""
print("\n🏆 URL Discovery with Relevance Scoring")
print("=" * 50)
# Discovery with relevance scoring
request_data = {
"domain": EXAMPLE_DOMAIN,
"seeding_config": {
"source": "sitemap",
"extract_head": True, # Required for BM25 scoring
"query": "politics election", # Search for political content
"scoring_method": "bm25",
"score_threshold": 0.1, # Minimum relevance score
"max_urls": 5
}
}
async with httpx.AsyncClient() as client:
try:
response = await client.post(
f"{BASE_URL}/urls/discover",
json=request_data,
timeout=60.0
)
response.raise_for_status()
urls = response.json()
print(f"✅ Found {len(urls)} relevant URLs")
# Display URLs sorted by relevance score
for i, url_obj in enumerate(urls[:3]):
score = url_obj.get('score', 0)
print(f"\n {i+1}. Score: {score:.3f}")
print(f" URL: {url_obj.get('url', 'N/A')}")
head_data = url_obj.get('head_data', {})
if head_data:
title = head_data.get('title', 'No title')
print(f" Title: {title[:60]}...")
return urls
except httpx.HTTPStatusError as e:
print(f"❌ HTTP Error: {e.response.status_code}")
print(f"Response: {e.response.text}")
return []
except Exception as e:
print(f"❌ Error: {e}")
return []
def demonstrate_request_schema():
"""Show the complete request schema with all options."""
print("\n📋 Complete Request Schema")
print("=" * 50)
complete_schema = {
"domain": "example.com", # Required: Domain to discover URLs from
"seeding_config": { # Optional: Configuration object
# Discovery sources
"source": "sitemap+cc", # "sitemap", "cc", or "sitemap+cc"
# Filtering options
"pattern": "*/blog/*", # URL pattern filter (glob style)
"max_urls": 50, # Maximum URLs to return (-1 = no limit)
"filter_nonsense_urls": True, # Filter out nonsense URLs
# Metadata and validation
"extract_head": True, # Extract <head> metadata
"live_check": True, # Verify URL accessibility
# Performance and rate limiting
"concurrency": 100, # Concurrent requests
"hits_per_sec": 10, # Rate limit (requests/second)
"force": False, # Bypass cache
# Relevance scoring (requires extract_head=True)
"query": "search terms", # Query for BM25 scoring
"scoring_method": "bm25", # Scoring algorithm
"score_threshold": 0.2, # Minimum score threshold
# Debugging
"verbose": True # Enable verbose logging
}
}
print("Full request schema:")
print(json.dumps(complete_schema, indent=2))
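
# Hedged sketch (not called by main() below): main() lists "Use discovered
# URLs with the /crawl endpoint" as a next step. The payload shape here
# mirrors the virtual scroll tests in this commit ("urls" plus optional
# browser/crawler config); the response schema is an assumption, so adapt
# it to your server version.
async def crawl_discovered_urls(urls: List[Dict[str, Any]], limit: int = 3):
    """Feed the first few discovered URLs into the /crawl endpoint."""
    targets = [u["url"] for u in urls[:limit] if u.get("url")]
    if not targets:
        print("No discovered URLs to crawl")
        return None
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{BASE_URL}/crawl",
            json={"urls": targets},
            timeout=120.0,
        )
        response.raise_for_status()
        return response.json()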
async def main():
"""Run all examples."""
print("🚀 URL Discovery API Examples")
print("=" * 50)
print(f"Server: {BASE_URL}")
print(f"Domain: {EXAMPLE_DOMAIN}")
# Check if server is running
async with httpx.AsyncClient() as client:
try:
response = await client.get(f"{BASE_URL}/health", timeout=5.0)
response.raise_for_status()
print("✅ Server is running\n")
except Exception as e:
print(f"❌ Server not available: {e}")
print("Please start the Crawl4AI server first:")
print(" docker compose up crawl4ai -d")
return
# Run examples
await discover_urls_basic_example()
await discover_urls_advanced_example()
await discover_urls_with_scoring_example()
# Show schema
demonstrate_request_schema()
print("\n🎉 Examples complete!")
print("\nNext steps:")
print("1. Use discovered URLs with the /crawl endpoint")
print("2. Filter URLs based on your specific needs")
print("3. Combine with other API endpoints for complete workflows")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Test script for the new URL discovery functionality.
This tests the handler function directly without running the full server.
"""
import asyncio
import sys
import os
from pathlib import Path
# Add the repo to Python path
repo_root = Path(__file__).parent
sys.path.insert(0, str(repo_root))
sys.path.insert(0, str(repo_root / "deploy" / "docker"))
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
console = Console()
async def test_url_discovery_handler():
"""Test the URL discovery handler function directly."""
try:
# Import the handler function and dependencies
from api import handle_url_discovery
from crawl4ai.async_configs import SeedingConfig
console.print("[bold cyan]Testing URL Discovery Handler Function[/bold cyan]")
# Test 1: Basic functionality
console.print("\n[cyan]Test 1: Basic URL discovery[/cyan]")
domain = "docs.crawl4ai.com"
seeding_config = {
"source": "sitemap",
"max_urls": 3,
"verbose": True
}
console.print(f"[blue]Domain:[/blue] {domain}")
console.print(f"[blue]Config:[/blue] {seeding_config}")
# Call the handler directly
result = await handle_url_discovery(domain, seeding_config)
console.print(f"[green]✓ Handler executed successfully[/green]")
console.print(f"[green]✓ Result type: {type(result)}[/green]")
console.print(f"[green]✓ Result length: {len(result)}[/green]")
# Print first few results if any
if result and len(result) > 0:
console.print("\n[blue]Sample results:[/blue]")
for i, url_obj in enumerate(result[:2]):
console.print(f" {i+1}. {url_obj}")
return True
except ImportError as e:
console.print(f"[red]✗ Import error: {e}[/red]")
console.print("[yellow]This suggests missing dependencies or module structure issues[/yellow]")
return False
except Exception as e:
console.print(f"[red]✗ Handler error: {e}[/red]")
return False
async def test_seeding_config_validation():
"""Test SeedingConfig validation."""
try:
from crawl4ai.async_configs import SeedingConfig
console.print("\n[cyan]Test 2: SeedingConfig validation[/cyan]")
# Test valid config
valid_config = {
"source": "sitemap",
"max_urls": 5,
"pattern": "*"
}
config = SeedingConfig(**valid_config)
console.print(f"[green]✓ Valid config created: {config.source}, max_urls={config.max_urls}[/green]")
# Test invalid config
try:
invalid_config = {
"source": "invalid_source",
"max_urls": 5
}
config = SeedingConfig(**invalid_config)
console.print(f"[yellow]? Invalid config unexpectedly accepted[/yellow]")
except Exception as e:
console.print(f"[green]✓ Invalid config correctly rejected: {str(e)[:50]}...[/green]")
return True
except Exception as e:
console.print(f"[red]✗ SeedingConfig test error: {e}[/red]")
return False
async def test_schema_validation():
"""Test the URLDiscoveryRequest schema."""
try:
from schemas import URLDiscoveryRequest
console.print("\n[cyan]Test 3: URLDiscoveryRequest schema validation[/cyan]")
# Test valid request
valid_request_data = {
"domain": "example.com",
"seeding_config": {
"source": "sitemap",
"max_urls": 10
}
}
request = URLDiscoveryRequest(**valid_request_data)
console.print(f"[green]✓ Valid request created: domain={request.domain}[/green]")
# Test request with default config
minimal_request_data = {
"domain": "example.com"
}
request = URLDiscoveryRequest(**minimal_request_data)
console.print(f"[green]✓ Minimal request created with defaults[/green]")
return True
except Exception as e:
console.print(f"[red]✗ Schema test error: {e}[/red]")
return False
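
# Hedged sketch (not run by main()): the endpoint under test wraps crawl4ai's
# URL seeding. Assuming the library exposes AsyncUrlSeeder with an async
# urls(domain, config) method, as described in the crawl4ai docs, the
# handler's behaviour can be reproduced directly:
async def seed_urls_directly(domain: str = "docs.crawl4ai.com"):
    from crawl4ai import AsyncUrlSeeder
    from crawl4ai.async_configs import SeedingConfig

    seeder = AsyncUrlSeeder()
    config = SeedingConfig(source="sitemap", max_urls=3)
    return await seeder.urls(domain, config)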
async def main():
"""Run all tests."""
console.print("[bold blue]🔍 URL Discovery Implementation Tests[/bold blue]")
results = []
# Test the implementation components
results.append(await test_seeding_config_validation())
results.append(await test_schema_validation())
results.append(await test_url_discovery_handler())
# Summary
console.print("\n[bold cyan]Test Summary[/bold cyan]")
passed = sum(results)
total = len(results)
if passed == total:
console.print(f"[bold green]✓ All {total} implementation tests passed![/bold green]")
console.print("[green]The URL discovery endpoint is ready for integration testing[/green]")
else:
console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")
return passed == total
if __name__ == "__main__":
asyncio.run(main())

tests/test_url_discovery.py Normal file
View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Test script for the new /urls/discover endpoint in Crawl4AI Docker API.
"""
import asyncio
import httpx
import json
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
console = Console()
# Configuration
BASE_URL = "http://localhost:11235"
TEST_DOMAIN = "docs.crawl4ai.com"
async def check_server_health(client: httpx.AsyncClient) -> bool:
"""Check if the server is healthy."""
console.print("[bold cyan]Checking server health...[/]", end="")
try:
response = await client.get("/health", timeout=10.0)
response.raise_for_status()
console.print(" [bold green]✓ Server is healthy![/]")
return True
except Exception as e:
console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
console.print(f"Is the server running at {BASE_URL}?")
return False
def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Pretty print the request."""
    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    # Print the endpoint line and the panel separately: embedding a Syntax
    # object in an f-string stringifies it instead of rendering it.
    console.print(f"[cyan]POST {endpoint}[/cyan]")
    console.print(Panel.fit(
        syntax,
        title=f"[bold blue]{title}[/]",
        border_style="blue"
    ))
def print_response(response_data: dict, title: str = "Response"):
"""Pretty print the response."""
syntax = Syntax(json.dumps(response_data, indent=2), "json", theme="monokai")
console.print(Panel.fit(
syntax,
title=f"[bold green]{title}[/]",
border_style="green"
))
async def test_urls_discover_basic():
"""Test basic URL discovery functionality."""
console.print("\n[bold yellow]Testing URL Discovery Endpoint[/bold yellow]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
# Check server health first
if not await check_server_health(client):
return False
# Test 1: Basic discovery with sitemap
console.print("\n[cyan]Test 1: Basic URL discovery from sitemap[/cyan]")
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "sitemap",
"max_urls": 5
}
}
print_request("/urls/discover", payload, "Basic Discovery Request")
try:
response = await client.post("/urls/discover", json=payload)
response.raise_for_status()
response_data = response.json()
print_response(response_data, "Basic Discovery Response")
# Validate response structure
if isinstance(response_data, list):
console.print(f"[green]✓ Discovered {len(response_data)} URLs[/green]")
return True
else:
console.print(f"[red]✗ Expected list, got {type(response_data)}[/red]")
return False
except httpx.HTTPStatusError as e:
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
return False
except Exception as e:
console.print(f"[red]✗ Error: {e}[/red]")
return False
async def test_urls_discover_invalid_config():
"""Test URL discovery with invalid configuration."""
console.print("\n[cyan]Test 2: URL discovery with invalid configuration[/cyan]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "invalid_source", # Invalid source
"max_urls": 5
}
}
print_request("/urls/discover", payload, "Invalid Config Request")
try:
response = await client.post("/urls/discover", json=payload)
if response.status_code == 500:
console.print("[green]✓ Server correctly rejected invalid config with 500 error[/green]")
return True
else:
console.print(f"[yellow]? Expected 500 error, got {response.status_code}[/yellow]")
response_data = response.json()
print_response(response_data, "Unexpected Response")
return False
except Exception as e:
console.print(f"[red]✗ Unexpected error: {e}[/red]")
return False
async def test_urls_discover_with_filtering():
"""Test URL discovery with advanced filtering."""
console.print("\n[cyan]Test 3: URL discovery with filtering and metadata[/cyan]")
async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
payload = {
"domain": TEST_DOMAIN,
"seeding_config": {
"source": "sitemap",
"pattern": "*/docs/*", # Filter to docs URLs only
"extract_head": True, # Extract metadata
"max_urls": 3
}
}
print_request("/urls/discover", payload, "Filtered Discovery Request")
try:
response = await client.post("/urls/discover", json=payload)
response.raise_for_status()
response_data = response.json()
print_response(response_data, "Filtered Discovery Response")
# Validate response structure with metadata
if isinstance(response_data, list) and len(response_data) > 0:
sample_url = response_data[0]
if "url" in sample_url:
console.print(f"[green]✓ Discovered {len(response_data)} filtered URLs with metadata[/green]")
return True
else:
console.print(f"[red]✗ URL objects missing expected fields[/red]")
return False
else:
console.print(f"[yellow]? No URLs found with filter pattern[/yellow]")
return True # This could be expected
except httpx.HTTPStatusError as e:
console.print(f"[red]✗ HTTP Error: {e.response.status_code} - {e.response.text}[/red]")
return False
except Exception as e:
console.print(f"[red]✗ Error: {e}[/red]")
return False
async def main():
"""Run all tests."""
console.print("[bold cyan]🔍 URL Discovery Endpoint Tests[/bold cyan]")
results = []
# Run tests
results.append(await test_urls_discover_basic())
results.append(await test_urls_discover_invalid_config())
results.append(await test_urls_discover_with_filtering())
# Summary
console.print("\n[bold cyan]Test Summary[/bold cyan]")
passed = sum(results)
total = len(results)
if passed == total:
console.print(f"[bold green]✓ All {total} tests passed![/bold green]")
else:
console.print(f"[bold yellow]⚠ {passed}/{total} tests passed[/bold yellow]")
return passed == total
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
End-to-end tests for the URL Discovery endpoint.
This test suite verifies the complete functionality of the /urls/discover endpoint
including happy path scenarios and error handling.
"""
import asyncio
import httpx
import json
import pytest
from typing import Dict, Any
# Test configuration
BASE_URL = "http://localhost:11235"
TEST_TIMEOUT = 30.0
class TestURLDiscoveryEndpoint:
"""End-to-end test suite for URL Discovery endpoint."""
@pytest.fixture
async def client(self):
"""Create an async HTTP client for testing."""
async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
yield client
async def test_server_health(self, client):
"""Test that the server is healthy before running other tests."""
response = await client.get("/health")
assert response.status_code == 200
data = response.json()
assert data["status"] == "ok"
async def test_endpoint_exists(self, client):
"""Test that the /urls/discover endpoint exists and is documented."""
# Check OpenAPI spec includes our endpoint
response = await client.get("/openapi.json")
assert response.status_code == 200
openapi_spec = response.json()
assert "/urls/discover" in openapi_spec["paths"]
endpoint_spec = openapi_spec["paths"]["/urls/discover"]
assert "post" in endpoint_spec
assert endpoint_spec["post"]["summary"] == "URL Discovery and Seeding"
async def test_basic_url_discovery_happy_path(self, client):
"""Test basic URL discovery with minimal configuration."""
request_data = {
"domain": "example.com",
"seeding_config": {
"source": "sitemap",
"max_urls": 5
}
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
# Note: We don't assert length > 0 because URL discovery
# may legitimately return empty results
async def test_minimal_request_with_defaults(self, client):
"""Test that minimal request works with default seeding_config."""
request_data = {
"domain": "example.com"
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
async def test_advanced_configuration(self, client):
"""Test advanced configuration options."""
request_data = {
"domain": "example.com",
"seeding_config": {
"source": "sitemap+cc",
"pattern": "*/docs/*",
"extract_head": True,
"max_urls": 3,
"live_check": True,
"concurrency": 50,
"hits_per_sec": 5,
"verbose": True
}
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
# If URLs are returned, they should have the expected structure
for url_obj in data:
assert isinstance(url_obj, dict)
# Should have at least a URL field
assert "url" in url_obj
async def test_bm25_scoring_configuration(self, client):
"""Test BM25 relevance scoring configuration."""
request_data = {
"domain": "example.com",
"seeding_config": {
"source": "sitemap",
"extract_head": True, # Required for scoring
"query": "documentation",
"scoring_method": "bm25",
"score_threshold": 0.1,
"max_urls": 5
}
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
# If URLs are returned with scoring, check structure
for url_obj in data:
assert isinstance(url_obj, dict)
assert "url" in url_obj
# Scoring may or may not add score field depending on implementation
async def test_missing_required_domain_field(self, client):
"""Test error handling when required domain field is missing."""
request_data = {
"seeding_config": {
"source": "sitemap",
"max_urls": 5
}
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 422 # Validation error
error_data = response.json()
assert "detail" in error_data
assert any("domain" in str(error).lower() for error in error_data["detail"])
async def test_invalid_request_body_structure(self, client):
"""Test error handling with completely invalid request body."""
invalid_request = {
"invalid_field": "test_value",
"another_invalid": 123
}
response = await client.post("/urls/discover", json=invalid_request)
assert response.status_code == 422 # Validation error
error_data = response.json()
assert "detail" in error_data
async def test_invalid_seeding_config_parameters(self, client):
"""Test handling of invalid seeding configuration parameters."""
request_data = {
"domain": "example.com",
"seeding_config": {
"source": "invalid_source", # Invalid source
"max_urls": "not_a_number" # Invalid type
}
}
response = await client.post("/urls/discover", json=request_data)
# The endpoint should handle this gracefully
# It may return 200 with empty results or 500 with error details
assert response.status_code in [200, 500]
if response.status_code == 200:
data = response.json()
assert isinstance(data, list)
# May be empty due to invalid config
else:
# Should have error details
error_data = response.json()
assert "detail" in error_data
async def test_empty_seeding_config(self, client):
"""Test with empty seeding_config object."""
request_data = {
"domain": "example.com",
"seeding_config": {}
}
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
async def test_response_structure_consistency(self, client):
"""Test that response structure is consistent."""
request_data = {
"domain": "example.com",
"seeding_config": {
"source": "sitemap",
"max_urls": 1
}
}
# Make multiple requests to ensure consistency
for _ in range(3):
response = await client.post("/urls/discover", json=request_data)
assert response.status_code == 200
data = response.json()
assert isinstance(data, list)
# If there are results, check they have consistent structure
for url_obj in data:
assert isinstance(url_obj, dict)
assert "url" in url_obj
async def test_content_type_validation(self, client):
"""Test that endpoint requires JSON content type."""
# Test with wrong content type
response = await client.post(
"/urls/discover",
content="domain=example.com",
headers={"Content-Type": "application/x-www-form-urlencoded"}
)
assert response.status_code == 422
# Standalone test runner for when pytest is not available
async def run_tests_standalone():
"""Run tests without pytest framework."""
print("🧪 Running URL Discovery Endpoint Tests")
print("=" * 50)
# Check server health first
async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
try:
response = await client.get("/health")
assert response.status_code == 200
print("✅ Server health check passed")
except Exception as e:
print(f"❌ Server health check failed: {e}")
return False
test_suite = TestURLDiscoveryEndpoint()
# Run tests manually
tests = [
("Endpoint exists", test_suite.test_endpoint_exists),
("Basic URL discovery", test_suite.test_basic_url_discovery_happy_path),
("Minimal request", test_suite.test_minimal_request_with_defaults),
("Advanced configuration", test_suite.test_advanced_configuration),
("BM25 scoring", test_suite.test_bm25_scoring_configuration),
("Missing domain error", test_suite.test_missing_required_domain_field),
("Invalid request body", test_suite.test_invalid_request_body_structure),
("Invalid config handling", test_suite.test_invalid_seeding_config_parameters),
("Empty config", test_suite.test_empty_seeding_config),
("Response consistency", test_suite.test_response_structure_consistency),
("Content type validation", test_suite.test_content_type_validation),
]
passed = 0
failed = 0
async with httpx.AsyncClient(base_url=BASE_URL, timeout=TEST_TIMEOUT) as client:
for test_name, test_func in tests:
try:
await test_func(client)
print(f"{test_name}")
passed += 1
except Exception as e:
print(f"{test_name}: {e}")
failed += 1
print(f"\n📊 Test Results: {passed} passed, {failed} failed")
return failed == 0
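
# To run this suite under pytest instead of the standalone runner (assuming
# pytest-asyncio is installed and configured for async tests):
#   pytest -v <path-to-this-file>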
if __name__ == "__main__":
# Run tests standalone
success = asyncio.run(run_tests_standalone())
exit(0 if success else 1)

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Test script for VirtualScrollConfig with the /crawl API endpoint
"""
import requests
import json
def test_virtual_scroll_api():
"""Test the /crawl endpoint with VirtualScrollConfig"""
# Create a simple HTML page with virtual scroll for testing
test_html = '''
<html>
<head>
<style>
#container {
height: 300px;
overflow-y: auto;
border: 1px solid #ccc;
}
.item {
height: 30px;
padding: 5px;
border-bottom: 1px solid #eee;
}
</style>
</head>
<body>
<h1>Virtual Scroll Test</h1>
<div id="container">
<div class="item">Item 1</div>
<div class="item">Item 2</div>
<div class="item">Item 3</div>
<div class="item">Item 4</div>
<div class="item">Item 5</div>
</div>
<script>
// Simple script to simulate virtual scroll
const container = document.getElementById('container');
let itemCount = 5;
// Add more items when scrolling
container.addEventListener('scroll', function() {
if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
for (let i = 0; i < 5; i++) {
itemCount++;
const newItem = document.createElement('div');
newItem.className = 'item';
newItem.textContent = `Item ${itemCount}`;
container.appendChild(newItem);
}
}
});
// Initial scroll to trigger loading
setTimeout(() => {
container.scrollTop = container.scrollHeight;
}, 100);
</script>
</body>
</html>
'''
# Save the HTML to a temporary file and serve it
import tempfile
import os
import http.server
import socketserver
import threading
import time
# Create temporary HTML file
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
f.write(test_html)
temp_file = f.name
# Start local server
os.chdir(os.path.dirname(temp_file))
port = 8080
class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
def log_message(self, format, *args):
pass # Suppress log messages
try:
with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
server_thread = threading.Thread(target=httpd.serve_forever)
server_thread.daemon = True
server_thread.start()
time.sleep(0.5) # Give server time to start
            # Now test the API against the locally served test page
            url = f"http://localhost:{port}/{os.path.basename(temp_file)}"
payload = {
"urls": [url],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
"viewport_width": 1920,
"viewport_height": 1080
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"virtual_scroll_config": {
"type": "VirtualScrollConfig",
"params": {
"container_selector": "#container",
"scroll_count": 3,
"scroll_by": "container_height",
"wait_after_scroll": 0.5
}
},
"cache_mode": "bypass",
"extraction_strategy": {
"type": "NoExtractionStrategy",
"params": {}
}
}
}
}
print("Testing VirtualScrollConfig with /crawl endpoint...")
print(f"Test URL: {url}")
print("Payload:")
print(json.dumps(payload, indent=2))
            response = requests.post(
                "http://localhost:11235/crawl",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=60  # avoid hanging indefinitely if the server stalls
            )
print(f"\nResponse Status: {response.status_code}")
if response.status_code == 200:
result = response.json()
print("✅ Success! VirtualScrollConfig is working.")
print(f"Content length: {len(result[0]['content']['raw_content'])} characters")
# Check if virtual scroll captured more content
if "Item 10" in result[0]['content']['raw_content']:
print("✅ Virtual scroll successfully captured additional content!")
else:
print("⚠️ Virtual scroll may not have worked as expected")
# Print a snippet of the content
content_preview = result[0]['content']['raw_content'][:500] + "..."
print(f"\nContent preview:\n{content_preview}")
else:
print(f"❌ Error: {response.status_code}")
print(f"Response: {response.text}")
except Exception as e:
print(f"❌ Test failed with error: {e}")
finally:
# Cleanup
        try:
            os.unlink(temp_file)
        except OSError:
            pass  # Temp file may already be gone; nothing else to clean up
if __name__ == "__main__":
test_virtual_scroll_api()
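
# For comparison, a minimal sketch of the same scroll settings via the Python
# SDK rather than the HTTP API. Assumes crawl4ai exports AsyncWebCrawler,
# CrawlerRunConfig, CacheMode, and VirtualScrollConfig with the parameter
# names used in the JSON payload above.
async def sdk_equivalent(url: str):
    from crawl4ai import (AsyncWebCrawler, CacheMode, CrawlerRunConfig,
                          VirtualScrollConfig)

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        virtual_scroll_config=VirtualScrollConfig(
            container_selector="#container",
            scroll_count=3,
            scroll_by="container_height",
            wait_after_scroll=0.5,
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=config)
        return result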

View File

@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Test VirtualScrollConfig with the /crawl API using existing test assets
"""
import requests
import json
import os
import http.server
import socketserver
import threading
import time
from pathlib import Path
def test_virtual_scroll_api():
"""Test the /crawl endpoint with VirtualScrollConfig using test assets"""
# Use the existing test assets
assets_dir = Path(__file__).parent / "docs" / "examples" / "assets"
if not assets_dir.exists():
print(f"❌ Assets directory not found: {assets_dir}")
return
# Start local server for assets
os.chdir(assets_dir)
port = 8081
class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
def log_message(self, format, *args):
pass # Suppress log messages
try:
with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
server_thread = threading.Thread(target=httpd.serve_forever)
server_thread.daemon = True
server_thread.start()
time.sleep(0.5) # Give server time to start
# Test with Twitter-like virtual scroll
url = f"http://docs.crawl4ai.com/examples/assets/virtual_scroll_twitter_like.html"
payload = {
"urls": [url],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
"viewport_width": 1280,
"viewport_height": 800
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"virtual_scroll_config": {
"type": "VirtualScrollConfig",
"params": {
"container_selector": "#timeline",
"scroll_count": 10,
"scroll_by": "container_height",
"wait_after_scroll": 0.3
}
},
"cache_mode": "bypass",
"extraction_strategy": {
"type": "NoExtractionStrategy",
"params": {}
}
}
}
}
print("Testing VirtualScrollConfig with /crawl endpoint...")
print(f"Test URL: {url}")
print("Payload:")
print(json.dumps(payload, indent=2))
response = requests.post(
"http://localhost:11234/crawl",
json=payload,
headers={"Content-Type": "application/json"},
timeout=60 # Longer timeout for virtual scroll
)
print(f"\nResponse Status: {response.status_code}")
if response.status_code == 200:
result = response.json()
print("✅ Success! VirtualScrollConfig is working with the API.")
print(f"Content length: {len(result[0]['content']['raw_content'])} characters")
# Check if we captured multiple posts (indicating virtual scroll worked)
content = result[0]['content']['raw_content']
post_count = content.count("Post #")
print(f"Found {post_count} posts in the content")
if post_count > 5: # Should capture more than just the initial posts
print("✅ Virtual scroll successfully captured additional content!")
else:
print("⚠️ Virtual scroll may not have captured much additional content")
# Print a snippet of the content
content_preview = content[:1000] + "..." if len(content) > 1000 else content
print(f"\nContent preview:\n{content_preview}")
else:
print(f"❌ Error: {response.status_code}")
print(f"Response: {response.text}")
except requests.exceptions.Timeout:
print("❌ Request timed out - virtual scroll may be taking too long")
except Exception as e:
print(f"❌ Test failed with error: {e}")
if __name__ == "__main__":
test_virtual_scroll_api()