Files
crawl4ai/tests/docker/test_server_requests.py
AHMET YILMAZ 674d0741da feat: Add HTTP-only crawling endpoints and related models
- Introduced HTTPCrawlRequest and HTTPCrawlRequestWithHooks models for HTTP-only crawling.
- Implemented /crawl/http and /crawl/http/stream endpoints for fast, lightweight crawling without browser rendering.
- Enhanced server.py to handle HTTP crawl requests and streaming responses.
- Updated utils.py to disable memory wait timeout for testing.
- Expanded API documentation to include new HTTP crawling features.
- Added tests for HTTP crawling endpoints, including error handling and streaming responses.
2025-10-15 17:45:58 +08:00

1121 lines
48 KiB
Python

import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv()
# Optional: Import crawl4ai classes directly for reference and to aid payload creation.
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON.
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
BFSDeepCrawlStrategy,
FilterChain,
ContentTypeFilter,
DomainFilter,
CompositeScorer,
KeywordRelevanceScorer,
PathDepthScorer,
JsonCssExtractionStrategy,
LLMExtractionStrategy,
LLMConfig
)
# --- Test Configuration ---
# The target server can be overridden via the CRAWL4AI_TEST_URL environment variable.
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://0.0.0.0:11234") # Make base URL configurable
# Use a known simple HTML page for basic tests
SIMPLE_HTML_URL = "https://docs.crawl4ai.com"
# Use a site suitable for scraping tests
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
# Use a site with internal links for deep crawl tests
DEEP_CRAWL_URL = "https://python.org"
# --- Pytest Fixtures ---
# Use the built-in event_loop fixture from pytest_asyncio
# The custom implementation was causing issues with closing the loop
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Yield an async HTTP client bound to BASE_URL; closed after each test.

    Function scope means a fresh client (and connection pool) per test, which
    avoids event-loop teardown issues with session-scoped async fixtures.
    """
    client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
    yield client
    # Explicitly release the connection pool once the test is done.
    await client.aclose()
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
    """Abort the current test if the server's /health endpoint is unreachable.

    Returns True when GET /health responds with a 2xx status; on any request
    or HTTP error the test is failed via pytest.fail (so this never returns
    False).
    """
    try:
        response = await client.get("/health")
        response.raise_for_status()
        print(f"\nServer healthy: {response.json()}")
        return True
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False keeps the output short: the cause is an environment
        # problem (server not running), not a bug in the test itself.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any]):
    """Validate the minimal shape shared by every crawl result record."""
    assert isinstance(result, dict)
    # Every crawl result must carry at least these three keys.
    for required_key in ("url", "success", "html"):
        assert required_key in result
    # Extend with further shared checks here as the result schema grows.
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Consume an NDJSON streaming response and return its result records.

    Text chunks are accumulated in a buffer and split on newlines; each
    complete line is JSON-decoded.  Records are collected until a completion
    marker ({"status": "completed"} or {"status": "error"}) arrives.

    Fails the test (via pytest.fail / assert) if a line is not valid JSON or
    the stream ends without a completion marker.
    """
    results: List[Dict[str, Any]] = []
    completed = False
    buffer = ""
    async for chunk in response.aiter_text():
        buffer += chunk
        lines = buffer.split('\n')
        # str.split('\n') never leaves a trailing '\n' on any element, so the
        # previous `endswith('\n')` check could never be True.  The last
        # element is always the (possibly empty) incomplete tail; keep it
        # buffered until more data arrives.
        buffer = lines.pop()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                pytest.fail(f"Failed to decode JSON line: {line}")
            if data.get("status") in ["completed", "error"]:
                completed = True
                print(f"DEBUG: Received completion marker: {data}") # Debug output
                break
            results.append(data)
        if completed:
            break
    print(f"DEBUG: Final results count: {len(results)}, completed: {completed}") # Debug output
    assert completed, "Streaming response did not end with a completion marker."
    return results
# --- Test Class ---
@pytest.mark.asyncio
class TestCrawlEndpoints:
@pytest_asyncio.fixture(autouse=True)
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
    """Autouse fixture: abort early if the server is down before each test.

    check_server_health calls pytest.fail on failure, so no return value is
    needed here.
    """
    await check_server_health(async_client)
# 1. Simple Requests (Primitives)
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
    """Test /crawl with a single URL and simple config values."""
    # Payloads mirror the server's serialized-config format: each config is a
    # {"type": ..., "params": {...}} envelope that the server deserializes.
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False, # Explicitly false for /crawl
                "screenshot": False,
                "cache_mode": CacheMode.BYPASS.value # Use enum value
            }
        }
    }
    try:
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        # Dump the server's error body for debugging before re-raising.
        print(f"Server error: {e}")
        print(f"Response content: {e.response.text}")
        raise
    assert data["success"] is True
    assert isinstance(data["results"], list)
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert result["url"] == SIMPLE_HTML_URL
    assert "Crawl4AI Documentation" in result["html"]
    # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
    # It might be null, missing, or populated depending on the server's default behavior
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
    """Test that /crawl endpoint handles stream=True directly without redirect."""
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": True, # Set stream to True for direct streaming
                "screenshot": False,
                "cache_mode": CacheMode.BYPASS.value
            }
        }
    }
    # Send a request to the /crawl endpoint - should handle streaming directly
    async with async_client.stream("POST", "/crawl", json=payload) as response:
        assert response.status_code == 200
        # NDJSON content type plus the custom header mark an active stream
        # (i.e. /crawl served the stream itself, no redirect to /crawl/stream).
        assert response.headers["content-type"] == "application/x-ndjson"
        assert response.headers.get("x-stream-status") == "active"
        results = await process_streaming_response(response)
        assert len(results) == 1
        result = results[0]
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        assert result["url"] == SIMPLE_HTML_URL
        assert "Crawl4AI Documentation" in result["html"]
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
    """Test /crawl/stream with a single URL and simple config values."""
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": True, # Must be true for /crawl/stream
                "screenshot": False,
                "cache_mode": CacheMode.BYPASS.value
            }
        }
    }
    # The streaming endpoint emits NDJSON records followed by a completion
    # marker; process_streaming_response handles the framing.
    async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
        response.raise_for_status()
        results = await process_streaming_response(response)
        assert len(results) == 1
        result = results[0]
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        assert result["url"] == SIMPLE_HTML_URL
        assert "Crawl4AI Documentation" in result["html"]
# 2. Multi-URL and Dispatcher
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
    """Test /crawl with multiple URLs, implicitly testing the dispatcher.

    Every requested URL must come back as a successful result.
    """
    urls = [SIMPLE_HTML_URL, "https://www.geeksforgeeks.org/"]
    payload = {
        "urls": urls,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
        }
    }
    try:
        # Fixed debug message (was copy-pasted from the deep-crawl test).
        print("Sending multi-URL crawl request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); a non-JSON body raises
            # json.JSONDecodeError, a ValueError subclass.
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert isinstance(data["results"], list)
    assert len(data["results"]) == len(urls)
    for result in data["results"]:
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        assert result["url"] in urls
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
    """Stream /crawl/stream over several URLs and verify each one appears."""
    urls = [SIMPLE_HTML_URL, "https://www.geeksforgeeks.org/"]
    payload = {
        "urls": urls,
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value},
        },
    }
    async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
        response.raise_for_status()
        # Must be consumed while the stream is still open.
        results = await process_streaming_response(response)
    assert len(results) == len(urls)
    for record in results:
        await assert_crawl_result_structure(record)
        assert record["success"] is True
        assert record["url"] in urls
    # Every requested URL must show up in the stream (set comparison also
    # guards against duplicates replacing a missing URL).
    assert {record["url"] for record in results} == set(urls)
# 3. Class Values and Nested Classes (Markdown Generator)
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using PruningContentFilter.

    Verifies both raw_markdown and fit_markdown are produced, and that the
    pruned (fit) markdown is no longer than the raw markdown.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.ENABLED.value, # Test different cache mode
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "PruningContentFilter",
                            "params": {
                                "threshold": 0.5, # Example param
                                "threshold_type": "relative"
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed debug message (was copy-pasted from the deep-crawl test).
        print("Sending markdown pruning crawl request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:
            # Narrowed from a bare `except:`; non-JSON bodies raise
            # json.JSONDecodeError (a ValueError subclass).
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
    assert "Crawl4AI" in result["markdown"]["raw_markdown"]
    # Fit markdown content might be different/shorter due to pruning
    assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using BM25ContentFilter.

    Only structural assertions are made on fit_markdown: BM25 may legitimately
    produce empty output when no chunks match the query.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "BM25ContentFilter",
                            "params": {
                                "user_query": "Herman Melville", # Query for BM25
                                "bm25_threshold": 0.1, # Lower threshold to increase matches
                                "language": "english" # Valid parameters
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print(f"Payload for BM25 test: {json.dumps(payload)}")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:
            # Narrowed from a bare `except:`; non-JSON bodies raise
            # json.JSONDecodeError (a ValueError subclass).
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
    # Print values for debug
    print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
    print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
    # Either fit_markdown has content (possibly including our query terms)
    # or it might be empty if no good BM25 matches were found
    # Don't assert specific content since it can be environment-dependent
# 4. Deep Crawling
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
    """Test /crawl with a BFS deep crawl strategy.

    Starts at DEEP_CRAWL_URL with depth/page limits, a domain/content-type
    filter chain, and a composite URL scorer; expects the start URL plus at
    least one discovered page, all within python.org domains.
    """
    payload = {
        "urls": [DEEP_CRAWL_URL], # Start URL
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": CacheMode.BYPASS.value,
                "deep_crawl_strategy": {
                    "type": "BFSDeepCrawlStrategy",
                    "params": {
                        "max_depth": 1, # Limit depth for testing speed
                        "max_pages": 5, # Limit pages to crawl
                        "filter_chain": {
                            "type": "FilterChain",
                            "params": {
                                "filters": [
                                    {
                                        "type": "ContentTypeFilter",
                                        "params": {"allowed_types": ["text/html"]}
                                    },
                                    {
                                        "type": "DomainFilter",
                                        "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
                                    }
                                ]
                            }
                        },
                        "url_scorer": {
                            "type": "CompositeScorer",
                            "params": {
                                "scorers": [
                                    {
                                        "type": "KeywordRelevanceScorer",
                                        "params": {"keywords": ["documentation", "tutorial"]}
                                    },
                                    {
                                        "type": "PathDepthScorer",
                                        "params": {"weight": 0.5, "optimal_depth": 2}
                                    }
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print("Sending deep crawl request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:
            # Narrowed from a bare `except:`; non-JSON bodies raise
            # json.JSONDecodeError (a ValueError subclass).
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert isinstance(data["results"], list)
    # Expect more than 1 result due to deep crawl (start URL + crawled links)
    assert len(data["results"]) > 1
    assert len(data["results"]) <= 6 # Start URL + max_pages=5 (comment fixed: the param is max_pages, not max_links)
    start_url_found = False
    crawled_urls_found = False
    for result in data["results"]:
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        # Print URL for debugging
        print(f"Crawled URL: {result['url']}")
        # Allow URLs that contain python.org (including subdomains like docs.python.org)
        assert "python.org" in result["url"]
        if result["url"] == DEEP_CRAWL_URL:
            start_url_found = True
        else:
            crawled_urls_found = True
    assert start_url_found
    assert crawled_urls_found
# 5. Extraction without LLM (JSON/CSS)
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
    """Test /crawl with JsonCssExtractionStrategy.

    Scrapes books.toscrape.com and validates the extracted title/price/rating
    fields of the first book record.
    """
    payload = {
        "urls": [SCRAPE_TARGET_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "JsonCssExtractionStrategy",
                    "params": {
                        "schema": {
                            "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
                            "value": {
                                "name": "BookList",
                                "baseSelector": "ol.row li.col-xs-6", # Select each book item
                                "fields": [
                                    {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
                                    {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
                                    {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed debug message (was copy-pasted from the deep-crawl test).
        print("Sending JSON/CSS extraction request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:
            # Narrowed from a bare `except:`; non-JSON bodies raise
            # json.JSONDecodeError (a ValueError subclass).
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be a JSON string representing a list of dicts
    try:
        extracted_data = json.loads(result["extracted_content"])
        assert isinstance(extracted_data, list)
        assert len(extracted_data) > 0 # Should find some books
        # Check structure of the first extracted item
        first_item = extracted_data[0]
        assert "title" in first_item
        assert "price" in first_item
        assert "rating" in first_item
        assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
# 6. Extraction with LLM
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
    """
    Test /crawl with LLMExtractionStrategy.
    NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
    configured via .llm.env or environment variables.
    This test uses the default provider configured in the server's config.yml.
    """
    # Skip test if no OpenAI API key is configured
    if not os.getenv("OPENAI_API_KEY"):
        pytest.skip("OPENAI_API_KEY not configured, skipping LLM extraction test")
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "LLMExtractionStrategy",
                    "params": {
                        "instruction": "Extract the main title and any key information about Crawl4AI from the text into JSON.",
                        # LLMConfig is implicitly defined by server's config.yml and .llm.env
                        # If you needed to override provider/token PER REQUEST:
                        # NOTE(review): provider name says deepseek but the key env var is
                        # OPENAI_API_KEY routed through OpenRouter -- confirm this pairing.
                        "llm_config": {
                            "type": "LLMConfig",
                            "params": {
                                "provider": "deepseek/deepseek-chat-v3.1:free", # Use deepseek model from openrouter
                                "api_token": os.getenv("OPENAI_API_KEY"), # Use OPENAI_API_KEY for openrouter
                                "base_url": "https://openrouter.ai/api/v1" # OpenRouter base URL
                            }
                        },
                        "schema": { # Optional: Provide a schema for structured output
                            "type": "dict", # IMPORTANT: Wrap schema dict
                            "value": {
                                "title": "Crawl4AI Info",
                                "type": "object",
                                "properties": {
                                    "title": {"type": "string", "description": "The main title of the page"},
                                    "description": {"type": "string", "description": "Key information about Crawl4AI"}
                                },
                                "required": ["title"]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
        data = response.json()
    except httpx.HTTPStatusError as e:
        # Catch potential server errors (like 500 due to missing/invalid API keys)
        pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
    except httpx.RequestError as e:
        pytest.fail(f"LLM extraction request failed: {e}.")
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be JSON (because we provided a schema)
    try:
        extracted_data = json.loads(result["extracted_content"])
        print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
        # Handle both dict and list formats (server returns a list)
        if isinstance(extracted_data, list):
            assert len(extracted_data) > 0
            extracted_item = extracted_data[0] # Take first item
            assert isinstance(extracted_item, dict)
            assert "title" in extracted_item
            assert "Crawl4AI" in extracted_item.get("title", "")
        else:
            assert isinstance(extracted_data, dict)
            assert "title" in extracted_data
            assert "Crawl4AI" in extracted_data.get("title", "")
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
    except Exception as e: # Catch any other unexpected error
        pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
# 7. Error Handling Tests
async def test_invalid_url_handling(self, async_client: httpx.AsyncClient):
    """Invalid URLs must yield per-result failures, not a 500 from the server."""
    bad_urls = ["invalid-url", "https://nonexistent-domain-12345.com"]
    request_body = {
        "urls": bad_urls,
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}},
    }
    response = await async_client.post("/crawl", json=request_body)
    # The endpoint reports crawl failures inside the results payload, so the
    # HTTP layer itself should still answer 200 rather than 500.
    print(f"Status code: {response.status_code}")
    print(f"Response: {response.text}")
    assert response.status_code == 200
    data = response.json()
    assert data["success"] is True # Overall success, but individual results may fail
async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient):
    """Test handling of mixed success/failure URLs."""
    payload = {
        "urls": [
            SIMPLE_HTML_URL, # Should succeed
            "https://nonexistent-domain-12345.com", # Should fail
            "https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail
        ],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "PruningContentFilter",
                            "params": {"threshold": 0.5}
                        }
                    }
                }
            }
        }
    }
    response = await async_client.post("/crawl", json=payload)
    assert response.status_code == 200
    data = response.json()
    # Overall request succeeds even though individual URLs fail; failures are
    # reported per-result with an error_message.
    assert data["success"] is True
    assert len(data["results"]) == 3
    success_count = 0
    failure_count = 0
    for result in data["results"]:
        if result["success"]:
            success_count += 1
        else:
            failure_count += 1
            assert "error_message" in result
            assert len(result["error_message"]) > 0
    assert success_count >= 1 # At least one should succeed
    assert failure_count >= 1 # At least one should fail
async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient):
    """Test streaming with mixed success/failure URLs."""
    payload = {
        "urls": [
            SIMPLE_HTML_URL, # Should succeed
            "https://nonexistent-domain-12345.com", # Should fail
        ],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": True,
                "cache_mode": CacheMode.BYPASS.value
            }
        }
    }
    async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
        response.raise_for_status()
        results = await process_streaming_response(response)
        # Both URLs must produce a record in the stream -- one success and one
        # failure carrying an error_message.
        assert len(results) == 2
        success_count = 0
        failure_count = 0
        for result in results:
            if result["success"]:
                success_count += 1
                assert result["url"] == SIMPLE_HTML_URL
            else:
                failure_count += 1
                assert "error_message" in result
                assert result["error_message"] is not None
        assert success_count == 1
        assert failure_count == 1
async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for markdown endpoint."""
    # Test invalid URL
    invalid_payload = {"url": "invalid-url", "f": "fit"}
    response = await async_client.post("/md", json=invalid_payload)
    # Should return 400 for invalid URL format
    assert response.status_code == 400
    # Test non-existent URL
    nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"}
    response = await async_client.post("/md", json=nonexistent_payload)
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for HTML endpoint."""
    # Test invalid URL
    invalid_payload = {"url": "invalid-url"}
    response = await async_client.post("/html", json=invalid_payload)
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for screenshot endpoint."""
    # Test invalid URL
    invalid_payload = {"url": "invalid-url"}
    response = await async_client.post("/screenshot", json=invalid_payload)
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for PDF endpoint."""
    # Test invalid URL
    invalid_payload = {"url": "invalid-url"}
    response = await async_client.post("/pdf", json=invalid_payload)
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for execute_js endpoint."""
    # Test invalid URL
    invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]}
    response = await async_client.post("/execute_js", json=invalid_payload)
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for LLM endpoint."""
    # Test missing query parameter
    response = await async_client.get("/llm/https://example.com")
    assert response.status_code == 422 # FastAPI validation error, not 400
    # Test invalid URL
    response = await async_client.get("/llm/invalid-url?q=test")
    # Should return 500 for crawl failure
    assert response.status_code == 500

async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for ask endpoint."""
    # Each out-of-range query parameter should be rejected by validation.
    # Test invalid context_type
    response = await async_client.get("/ask?context_type=invalid")
    assert response.status_code == 422 # Validation error
    # Test invalid score_ratio
    response = await async_client.get("/ask?score_ratio=2.0") # > 1.0
    assert response.status_code == 422 # Validation error
    # Test invalid max_results
    response = await async_client.get("/ask?max_results=0") # < 1
    assert response.status_code == 422 # Validation error

async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient):
    """Test error handling for config dump endpoint."""
    # Test invalid code
    invalid_payload = {"code": "invalid_code"}
    response = await async_client.post("/config/dump", json=invalid_payload)
    assert response.status_code == 400
    # Test nested function calls (not allowed)
    nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"}
    response = await async_client.post("/config/dump", json=nested_payload)
    assert response.status_code == 400
async def test_llm_job_with_chunking_strategy(self, async_client: httpx.AsyncClient):
    """Test LLM job endpoint with a chunking strategy.

    Submits a job to /llm/job and polls /llm/job/{task_id} until it completes,
    fails, or the polling budget is exhausted (in which case only submission
    is verified).
    """
    payload = {
        "url": SIMPLE_HTML_URL,
        "q": "Extract the main title and any headings from the content",
        "chunking_strategy": {
            "type": "RegexChunking",
            "params": {
                "patterns": ["\\n\\n+"],
                "overlap": 50
            }
        }
    }
    try:
        # Submit the job
        response = await async_client.post("/llm/job", json=payload)
        response.raise_for_status()
        job_data = response.json()
        assert "task_id" in job_data
        task_id = job_data["task_id"]
        # Poll for completion (simple implementation)
        max_attempts = 10 # Reduced for testing
        attempt = 0
        while attempt < max_attempts:
            status_response = await async_client.get(f"/llm/job/{task_id}")
            # Check if response is valid JSON
            try:
                status_data = status_response.json()
            except ValueError:
                # Narrowed from a bare `except:`; a non-JSON body raises
                # json.JSONDecodeError (a ValueError subclass).
                print(f"Non-JSON response: {status_response.text}")
                attempt += 1
                await asyncio.sleep(1)
                continue
            if status_data.get("status") == "completed":
                # Verify we got a result
                assert "result" in status_data
                result = status_data["result"]
                # Result can be string, dict, or list depending on extraction
                assert result is not None
                print(f"✓ LLM job with chunking completed successfully. Result type: {type(result)}")
                break
            elif status_data.get("status") == "failed":
                # pytest.fail raises, so the `break` that used to follow this
                # line was unreachable and has been removed.
                pytest.fail(f"LLM job failed: {status_data.get('error', 'Unknown error')}")
            else:
                attempt += 1
                await asyncio.sleep(1) # Wait 1 second before checking again
        if attempt >= max_attempts:
            # For testing purposes, just verify the job was submitted
            print("✓ LLM job with chunking submitted successfully (completion check timed out)")
    except httpx.HTTPStatusError as e:
        pytest.fail(f"LLM job request failed: {e}. Response: {e.response.text}")
    except Exception as e:
        pytest.fail(f"LLM job test failed: {e}")
async def test_chunking_strategies_supported(self, async_client: httpx.AsyncClient):
    """Test that all chunking strategies are supported by the API."""
    # Imported locally: this test exercises the server-side factory directly
    # rather than going over HTTP.
    from deploy.docker.utils import create_chunking_strategy
    # Test all supported chunking strategies
    strategies_to_test = [
        {"type": "IdentityChunking", "params": {}},
        {"type": "RegexChunking", "params": {"patterns": ["\\n\\n"]}},
        {"type": "FixedLengthWordChunking", "params": {"chunk_size": 50}},
        {"type": "SlidingWindowChunking", "params": {"window_size": 100, "step": 50}},
        {"type": "OverlappingWindowChunking", "params": {"window_size": 100, "overlap": 20}},
    ]
    for strategy_config in strategies_to_test:
        try:
            # Test that the strategy can be created
            strategy = create_chunking_strategy(strategy_config)
            assert strategy is not None
            print(f"{strategy_config['type']} strategy created successfully")
            # Test basic chunking functionality
            test_text = "This is a test document with multiple sentences. It should be split appropriately."
            chunks = strategy.chunk(test_text)
            assert isinstance(chunks, list)
            assert len(chunks) > 0
            print(f"{strategy_config['type']} chunking works: {len(chunks)} chunks")
        except Exception as e:
            # Some strategies may fail due to missing dependencies (NLTK), but that's OK
            # NOTE(review): neither NlpSentenceChunking nor TopicSegmentationChunking
            # appears in strategies_to_test above, so this branch can never match
            # and every exception falls through to pytest.fail -- confirm intent.
            if "NlpSentenceChunking" in strategy_config["type"] or "TopicSegmentationChunking" in strategy_config["type"]:
                print(f"{strategy_config['type']} requires NLTK dependencies: {e}")
            else:
                pytest.fail(f"Unexpected error with {strategy_config['type']}: {e}")
async def test_malformed_request_handling(self, async_client: httpx.AsyncClient):
    """Malformed /crawl payloads should be rejected with a 422 validation error."""
    # Case 1: browser_config and crawler_config missing entirely.
    no_configs = {"urls": []}
    response = await async_client.post("/crawl", json=no_configs)
    print(f"Response: {response.text}")
    assert response.status_code == 422 # Validation error
    # Case 2: configs present but the URL list is empty.
    no_urls = {
        "urls": [],
        "browser_config": {"type": "BrowserConfig", "params": {}},
        "crawler_config": {"type": "CrawlerRunConfig", "params": {}},
    }
    response = await async_client.post("/crawl", json=no_urls)
    assert response.status_code == 422 # "At least one URL required"
# 8. HTTP-only Crawling Tests
async def test_http_crawl_single_url(self, async_client: httpx.AsyncClient):
"""Test /crawl/http with a single URL using HTTP-only strategy."""
payload = {
"urls": [SIMPLE_HTML_URL],
"http_config": {
"method": "GET",
"headers": {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
"follow_redirects": True,
"verify_ssl": True
},
"crawler_config": {
"cache_mode": CacheMode.BYPASS.value,
"screenshot": False
}
}
try:
response = await async_client.post("/crawl/http", json=payload)
print(f"HTTP Response status: {response.status_code}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"HTTP Server error: {e}")
print(f"Response content: {e.response.text}")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "Crawl4AI Documentation" in result["html"]
# Check that processing was fast (HTTP should be much faster than browser)
assert data["server_processing_time_s"] < 5.0 # Should complete in under 5 seconds
async def test_http_crawl_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/http/stream with HTTP-only strategy."""
payload = {
"urls": [SIMPLE_HTML_URL],
"http_config": {
"method": "GET",
"headers": {"Accept": "text/html"},
"follow_redirects": True
},
"crawler_config": {
"cache_mode": CacheMode.BYPASS.value,
"screenshot": False
}
}
async with async_client.stream("POST", "/crawl/http/stream", json=payload) as response:
response.raise_for_status()
assert response.headers["content-type"] == "application/x-ndjson"
assert response.headers.get("x-stream-status") == "active"
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "Crawl4AI Documentation" in result["html"]
async def test_http_crawl_api_endpoint(self, async_client: httpx.AsyncClient):
"""Test HTTP crawling with a JSON API endpoint."""
payload = {
"urls": ["https://httpbin.org/json"],
"http_config": {
"method": "GET",
"headers": {"Accept": "application/json"},
"follow_redirects": True
},
"crawler_config": {
"cache_mode": CacheMode.BYPASS.value
}
}
try:
response = await async_client.post("/crawl/http", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"HTTP API test error: {e}")
print(f"Response: {e.response.text}")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
assert result["success"] is True
assert result["url"] == "https://httpbin.org/json"
# Should contain JSON response
assert "slideshow" in result["html"] or "application/json" in result.get("content_type", "")
async def test_http_crawl_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for HTTP crawl endpoints."""
# Test invalid URL
invalid_payload = {
"urls": ["invalid-url"],
"http_config": {"method": "GET"},
"crawler_config": {"cache_mode": CacheMode.BYPASS.value}
}
response = await async_client.post("/crawl/http", json=invalid_payload)
# HTTP crawler handles invalid URLs gracefully, returns 200 with failed results
assert response.status_code == 200
# Test non-existent domain
nonexistent_payload = {
"urls": ["https://nonexistent-domain-12345.com"],
"http_config": {"method": "GET"},
"crawler_config": {"cache_mode": CacheMode.BYPASS.value}
}
response = await async_client.post("/crawl/http", json=nonexistent_payload)
# HTTP crawler handles unreachable hosts gracefully, returns 200 with failed results
assert response.status_code == 200
if __name__ == "__main__":
    # Run this module's tests directly against the configured server.
    # -v: verbose output
    # -s: show print statements immediately (useful for debugging)
    # __file__: tells pytest to run tests in the current file
    pytest_args = ["-v", "-s", __file__]
    # You can add more pytest arguments here if needed, for example:
    # '-k test_llm_extraction': Run only the LLM test function
    # pytest_args.append("-k test_llm_extraction")
    print(f"Running pytest with args: {pytest_args}")
    # Execute pytest; pytest.main returns an ExitCode (IntEnum), 0 on success.
    exit_code = pytest.main(pytest_args)
    print(f"Pytest finished with exit code: {exit_code}")
    # Bug fix: propagate pytest's status to the shell/CI. Previously the
    # script always exited 0 even when tests failed.
    raise SystemExit(int(exit_code))