Files
crawl4ai/tests/docker/test_server_requests.py
AHMET YILMAZ 8cca9704eb feat: add comprehensive type definitions and improve test coverage
Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
2025-10-13 18:49:01 +08:00

986 lines
43 KiB
Python

import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv()
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON.
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
BFSDeepCrawlStrategy,
FilterChain,
ContentTypeFilter,
DomainFilter,
CompositeScorer,
KeywordRelevanceScorer,
PathDepthScorer,
JsonCssExtractionStrategy,
LLMExtractionStrategy,
LLMConfig
)
# --- Test Configuration ---
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
# Use a known simple HTML page for basic tests
SIMPLE_HTML_URL = "https://httpbin.org/html"
# Use a site suitable for scraping tests
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
# Use a site with internal links for deep crawl tests
DEEP_CRAWL_URL = "https://python.org"
# --- Pytest Fixtures ---
# Use the built-in event_loop fixture from pytest_asyncio
# The custom implementation was causing issues with closing the loop
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
"""Provides an async HTTP client"""
client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
yield client
await client.aclose()
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
"""Check if the server is healthy before running tests."""
try:
response = await client.get("/health")
response.raise_for_status()
print(f"\nServer healthy: {response.json()}")
return True
except (httpx.RequestError, httpx.HTTPStatusError) as e:
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any]):
"""Asserts the basic structure of a single crawl result."""
assert isinstance(result, dict)
assert "url" in result
assert "success" in result
assert "html" in result
# Add more common checks if needed
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
"""Processes an NDJSON streaming response."""
results = []
completed = False
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
if data.get("status") == "completed":
completed = True
break # Stop processing after completion marker
else:
results.append(data)
except json.JSONDecodeError:
pytest.fail(f"Failed to decode JSON line: {line}")
assert completed, "Streaming response did not end with a completion marker."
return results
# --- Test Class ---
@pytest.mark.asyncio
class TestCrawlEndpoints:
@pytest_asyncio.fixture(autouse=True)
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
"""Fixture to ensure server is healthy before each test in the class."""
await check_server_health(async_client)
# 1. Simple Requests (Primitives)
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
"""Test /crawl with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False, # Explicitly false for /crawl
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value # Use enum value
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error: {e}")
print(f"Response content: {e.response.text}")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
"""Test that /crawl endpoint handles stream=True directly without redirect."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Set stream to True for direct streaming
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
# Send a request to the /crawl endpoint - should handle streaming directly
async with async_client.stream("POST", "/crawl", json=payload) as response:
assert response.status_code == 200
assert response.headers["content-type"] == "application/x-ndjson"
assert response.headers.get("x-stream-status") == "active"
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Must be true for /crawl/stream
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# 2. Multi-URL and Dispatcher
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == len(urls)
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with multiple URLs."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == len(urls)
processed_urls = set()
for result in results:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
processed_urls.add(result["url"])
assert processed_urls == set(urls) # Ensure all URLs were processed
# 3. Class Values and Nested Classes (Markdown Generator)
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
"""Test /crawl with MarkdownGenerator using PruningContentFilter."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.ENABLED.value, # Test different cache mode
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.5, # Example param
"threshold_type": "relative"
}
}
}
}
}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "markdown" in result
assert isinstance(result["markdown"], dict)
assert "raw_markdown" in result["markdown"]
assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
assert "Moby-Dick" in result["markdown"]["raw_markdown"]
# Fit markdown content might be different/shorter due to pruning
assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
"""Test /crawl with MarkdownGenerator using BM25ContentFilter."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "BM25ContentFilter",
"params": {
"user_query": "Herman Melville", # Query for BM25
"bm25_threshold": 0.1, # Lower threshold to increase matches
"language": "english" # Valid parameters
}
}
}
}
}
}
}
try:
print(f"Payload for BM25 test: {json.dumps(payload)}")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "markdown" in result
assert isinstance(result["markdown"], dict)
assert "raw_markdown" in result["markdown"]
assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
# Print values for debug
print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
# Either fit_markdown has content (possibly including our query terms)
# or it might be empty if no good BM25 matches were found
# Don't assert specific content since it can be environment-dependent
# 4. Deep Crawling
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with a deep crawl strategy."""
payload = {
"urls": [DEEP_CRAWL_URL], # Start URL
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": CacheMode.BYPASS.value,
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1, # Limit depth for testing speed
"max_pages": 5, # Limit pages to crawl
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
{
"type": "DomainFilter",
"params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
}
]
}
},
"url_scorer": {
"type": "CompositeScorer",
"params": {
"scorers": [
{
"type": "KeywordRelevanceScorer",
"params": {"keywords": ["documentation", "tutorial"]}
},
{
"type": "PathDepthScorer",
"params": {"weight": 0.5, "optimal_depth": 2}
}
]
}
}
}
}
}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
# Expect more than 1 result due to deep crawl (start URL + crawled links)
assert len(data["results"]) > 1
assert len(data["results"]) <= 6 # Start URL + max_links=5
start_url_found = False
crawled_urls_found = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
# Print URL for debugging
print(f"Crawled URL: {result['url']}")
# Allow URLs that contain python.org (including subdomains like docs.python.org)
assert "python.org" in result["url"]
if result["url"] == DEEP_CRAWL_URL:
start_url_found = True
else:
crawled_urls_found = True
assert start_url_found
assert crawled_urls_found
# 5. Extraction without LLM (JSON/CSS)
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
"""Test /crawl with JsonCssExtractionStrategy."""
payload = {
"urls": [SCRAPE_TARGET_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
"value": {
"name": "BookList",
"baseSelector": "ol.row li.col-xs-6", # Select each book item
"fields": [
{"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
{"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
]
}
}
}
}
}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
# Extracted content should be a JSON string representing a list of dicts
try:
extracted_data = json.loads(result["extracted_content"])
assert isinstance(extracted_data, list)
assert len(extracted_data) > 0 # Should find some books
# Check structure of the first extracted item
first_item = extracted_data[0]
assert "title" in first_item
assert "price" in first_item
assert "rating" in first_item
assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
except (json.JSONDecodeError, AssertionError) as e:
pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
# 6. Extraction with LLM
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
"""
Test /crawl with LLMExtractionStrategy.
NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
configured via .llm.env or environment variables.
This test uses the default provider configured in the server's config.yml.
"""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract the main title and the author mentioned in the text into JSON.",
# LLMConfig is implicitly defined by server's config.yml and .llm.env
# If you needed to override provider/token PER REQUEST:
"llm_config": {
"type": "LLMConfig",
"params": {
"provider": "openai/gpt-4o", # Example override
"api_token": os.getenv("OPENAI_API_KEY") # Example override
}
},
"schema": { # Optional: Provide a schema for structured output
"type": "dict", # IMPORTANT: Wrap schema dict
"value": {
"title": "Book Info",
"type": "object",
"properties": {
"title": {"type": "string", "description": "The main title of the work"},
"author": {"type": "string", "description": "The author of the work"}
},
"required": ["title", "author"]
}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
data = response.json()
except httpx.HTTPStatusError as e:
# Catch potential server errors (like 500 due to missing/invalid API keys)
pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
except httpx.RequestError as e:
pytest.fail(f"LLM extraction request failed: {e}.")
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
# Extracted content should be JSON (because we provided a schema)
try:
extracted_data = json.loads(result["extracted_content"])
print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
# Handle both dict and list formats (server returns a list)
if isinstance(extracted_data, list):
assert len(extracted_data) > 0
extracted_item = extracted_data[0] # Take first item
assert isinstance(extracted_item, dict)
assert "title" in extracted_item
assert "author" in extracted_item
assert "Moby-Dick" in extracted_item.get("title", "")
assert "Herman Melville" in extracted_item.get("author", "")
else:
assert isinstance(extracted_data, dict)
assert "title" in extracted_data
assert "author" in extracted_data
assert "Moby-Dick" in extracted_data.get("title", "")
assert "Herman Melville" in extracted_data.get("author", "")
except (json.JSONDecodeError, AssertionError) as e:
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
except Exception as e: # Catch any other unexpected error
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
# 7. Error Handling Tests
async def test_invalid_url_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for invalid URLs."""
payload = {
"urls": ["invalid-url", "https://nonexistent-domain-12345.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}}
}
response = await async_client.post("/crawl", json=payload)
# Should return 200 with failed results, not 500
print(f"Status code: {response.status_code}")
print(f"Response: {response.text}")
assert response.status_code == 500
data = response.json()
assert data["detail"].startswith("Crawl request failed:")
async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient):
"""Test handling of mixed success/failure URLs."""
payload = {
"urls": [
SIMPLE_HTML_URL, # Should succeed
"https://nonexistent-domain-12345.com", # Should fail
"https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail
],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {"threshold": 0.5}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
assert response.status_code == 200
data = response.json()
assert data["success"] is True
assert len(data["results"]) == 3
success_count = 0
failure_count = 0
for result in data["results"]:
if result["success"]:
success_count += 1
else:
failure_count += 1
assert "error_message" in result
assert len(result["error_message"]) > 0
assert success_count >= 1 # At least one should succeed
assert failure_count >= 1 # At least one should fail
async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient):
"""Test streaming with mixed success/failure URLs."""
payload = {
"urls": [
SIMPLE_HTML_URL, # Should succeed
"https://nonexistent-domain-12345.com", # Should fail
],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 2
success_count = 0
failure_count = 0
for result in results:
if result["success"]:
success_count += 1
assert result["url"] == SIMPLE_HTML_URL
else:
failure_count += 1
assert "error_message" in result
assert result["error_message"] is not None
assert success_count == 1
assert failure_count == 1
async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for markdown endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url", "f": "fit"}
response = await async_client.post("/md", json=invalid_payload)
# Should return 400 for invalid URL format
assert response.status_code == 400
# Test non-existent URL
nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"}
response = await async_client.post("/md", json=nonexistent_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for HTML endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/html", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for screenshot endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/screenshot", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for PDF endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/pdf", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for execute_js endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]}
response = await async_client.post("/execute_js", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for LLM endpoint."""
# Test missing query parameter
response = await async_client.get("/llm/https://example.com")
assert response.status_code == 422 # FastAPI validation error, not 400
# Test invalid URL
response = await async_client.get("/llm/invalid-url?q=test")
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for ask endpoint."""
# Test invalid context_type
response = await async_client.get("/ask?context_type=invalid")
assert response.status_code == 422 # Validation error
# Test invalid score_ratio
response = await async_client.get("/ask?score_ratio=2.0") # > 1.0
assert response.status_code == 422 # Validation error
# Test invalid max_results
response = await async_client.get("/ask?max_results=0") # < 1
assert response.status_code == 422 # Validation error
async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for config dump endpoint."""
# Test invalid code
invalid_payload = {"code": "invalid_code"}
response = await async_client.post("/config/dump", json=invalid_payload)
assert response.status_code == 400
# Test nested function calls (not allowed)
nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"}
response = await async_client.post("/config/dump", json=nested_payload)
assert response.status_code == 400
async def test_llm_job_with_chunking_strategy(self, async_client: httpx.AsyncClient):
"""Test LLM job endpoint with chunking strategy."""
payload = {
"url": SIMPLE_HTML_URL,
"q": "Extract the main title and any headings from the content",
"chunking_strategy": {
"type": "RegexChunking",
"params": {
"patterns": ["\\n\\n+"],
"overlap": 50
}
}
}
try:
# Submit the job
response = await async_client.post("/llm/job", json=payload)
response.raise_for_status()
job_data = response.json()
assert "task_id" in job_data
task_id = job_data["task_id"]
# Poll for completion (simple implementation)
max_attempts = 10 # Reduced for testing
attempt = 0
while attempt < max_attempts:
status_response = await async_client.get(f"/llm/job/{task_id}")
# Check if response is valid JSON
try:
status_data = status_response.json()
except:
print(f"Non-JSON response: {status_response.text}")
attempt += 1
await asyncio.sleep(1)
continue
if status_data.get("status") == "completed":
# Verify we got a result
assert "result" in status_data
result = status_data["result"]
# Result can be string, dict, or list depending on extraction
assert result is not None
print(f"✓ LLM job with chunking completed successfully. Result type: {type(result)}")
break
elif status_data.get("status") == "failed":
pytest.fail(f"LLM job failed: {status_data.get('error', 'Unknown error')}")
break
else:
attempt += 1
await asyncio.sleep(1) # Wait 1 second before checking again
if attempt >= max_attempts:
# For testing purposes, just verify the job was submitted
print("✓ LLM job with chunking submitted successfully (completion check timed out)")
except httpx.HTTPStatusError as e:
pytest.fail(f"LLM job request failed: {e}. Response: {e.response.text}")
except Exception as e:
pytest.fail(f"LLM job test failed: {e}")
async def test_chunking_strategies_supported(self, async_client: httpx.AsyncClient):
"""Test that all chunking strategies are supported by the API."""
from deploy.docker.utils import create_chunking_strategy
# Test all supported chunking strategies
strategies_to_test = [
{"type": "IdentityChunking", "params": {}},
{"type": "RegexChunking", "params": {"patterns": ["\\n\\n"]}},
{"type": "FixedLengthWordChunking", "params": {"chunk_size": 50}},
{"type": "SlidingWindowChunking", "params": {"window_size": 100, "step": 50}},
{"type": "OverlappingWindowChunking", "params": {"window_size": 100, "overlap": 20}},
]
for strategy_config in strategies_to_test:
try:
# Test that the strategy can be created
strategy = create_chunking_strategy(strategy_config)
assert strategy is not None
print(f"{strategy_config['type']} strategy created successfully")
# Test basic chunking functionality
test_text = "This is a test document with multiple sentences. It should be split appropriately."
chunks = strategy.chunk(test_text)
assert isinstance(chunks, list)
assert len(chunks) > 0
print(f"{strategy_config['type']} chunking works: {len(chunks)} chunks")
except Exception as e:
# Some strategies may fail due to missing dependencies (NLTK), but that's OK
if "NlpSentenceChunking" in strategy_config["type"] or "TopicSegmentationChunking" in strategy_config["type"]:
print(f"{strategy_config['type']} requires NLTK dependencies: {e}")
else:
pytest.fail(f"Unexpected error with {strategy_config['type']}: {e}")
async def test_malformed_request_handling(self, async_client: httpx.AsyncClient):
"""Test handling of malformed requests."""
# Test missing required fields
malformed_payload = {"urls": []} # Missing browser_config and crawler_config
response = await async_client.post("/crawl", json=malformed_payload)
print(f"Response: {response.text}")
assert response.status_code == 422 # Validation error
# Test empty URLs list
empty_urls_payload = {
"urls": [],
"browser_config": {"type": "BrowserConfig", "params": {}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {}}
}
response = await async_client.post("/crawl", json=empty_urls_payload)
assert response.status_code == 422 # "At least one URL required"
if __name__ == "__main__":
# Define arguments for pytest programmatically
# -v: verbose output
# -s: show print statements immediately (useful for debugging)
# __file__: tells pytest to run tests in the current file
pytest_args = ["-v", "-s", __file__]
# You can add more pytest arguments here if needed, for example:
# '-k test_llm_extraction': Run only the LLM test function
# pytest_args.append("-k test_llm_extraction")
print(f"Running pytest with args: {pytest_args}")
# Execute pytest
exit_code = pytest.main(pytest_args)
print(f"Pytest finished with exit code: {exit_code}")