feat(docker): improve docker error handling

- Return comprehensive error messages along with status codes for api internal errors.
- Fix fit_html property serialization issue in both /crawl and /crawl/stream endpoints
- Add sanitization to ensure fit_html is always JSON-serializable (string or None)
- Add comprehensive error handling test suite.
This commit is contained in:
Soham Kukreti
2025-08-26 23:18:35 +05:30
parent cce3390a2d
commit 2ad3fb5fc8
3 changed files with 296 additions and 38 deletions

View File

@@ -413,6 +413,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb()
result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb
# Ensure fit_html is JSON-serializable
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
result_dict["fit_html"] = None
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
@@ -493,6 +496,9 @@ async def handle_crawl_request(
processed_results = []
for result in results:
result_dict = result.model_dump()
# if fit_html is not a string, set it to None to avoid serialization errors
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
result_dict["fit_html"] = None
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')

View File

@@ -267,12 +267,26 @@ async def generate_html(
Use when you need sanitized HTML structures for building schemas or further processing.
"""
cfg = CrawlerRunConfig()
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
raw_html = results[0].html
from crawl4ai.utils import preprocess_html_for_schema
processed_html = preprocess_html_for_schema(raw_html)
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
try:
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
# Check if the crawl was successful
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
)
raw_html = results[0].html
from crawl4ai.utils import preprocess_html_for_schema
processed_html = preprocess_html_for_schema(raw_html)
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
except Exception as e:
# Log and raise as HTTP 500 for other exceptions
raise HTTPException(
status_code=500,
detail=str(e)
)
# Screenshot endpoint
@@ -290,18 +304,29 @@ async def generate_screenshot(
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
Then in result instead of the screenshot you will get a path to the saved file.
"""
cfg = CrawlerRunConfig(
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
screenshot_data = results[0].screenshot
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(base64.b64decode(screenshot_data))
return {"success": True, "path": abs_path}
return {"success": True, "screenshot": screenshot_data}
try:
cfg = CrawlerRunConfig(
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
)
screenshot_data = results[0].screenshot
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(base64.b64decode(screenshot_data))
return {"success": True, "path": abs_path}
return {"success": True, "screenshot": screenshot_data}
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
# PDF endpoint
@@ -319,17 +344,28 @@ async def generate_pdf(
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
Then in result instead of the PDF you will get a path to the saved file.
"""
cfg = CrawlerRunConfig(pdf=True)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
pdf_data = results[0].pdf
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(pdf_data)
return {"success": True, "path": abs_path}
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
try:
cfg = CrawlerRunConfig(pdf=True)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
)
pdf_data = results[0].pdf
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(pdf_data)
return {"success": True, "path": abs_path}
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
@app.post("/execute_js")
@@ -385,12 +421,23 @@ async def execute_js(
```
"""
cfg = CrawlerRunConfig(js_code=body.scripts)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
# Return JSON-serializable dict of the first CrawlResult
data = results[0].model_dump()
return JSONResponse(data)
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
)
# Return JSON-serializable dict of the first CrawlResult
data = results[0].model_dump()
return JSONResponse(data)
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
@app.get("/llm/{url:path}")
@@ -438,13 +485,16 @@ async def crawl(
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
res = await handle_crawl_request(
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
)
return JSONResponse(res)
# check if all of the results are not successful
if all(not result["success"] for result in results["results"]):
raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
return JSONResponse(results)
@app.post("/crawl/stream")

View File

@@ -636,6 +636,208 @@ class TestCrawlEndpoints:
except Exception as e: # Catch any other unexpected error
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
# 7. Error Handling Tests
async def test_invalid_url_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for invalid URLs."""
payload = {
"urls": ["invalid-url", "https://nonexistent-domain-12345.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}}
}
response = await async_client.post("/crawl", json=payload)
# Should return 200 with failed results, not 500
print(f"Status code: {response.status_code}")
print(f"Response: {response.text}")
assert response.status_code == 500
data = response.json()
assert data["detail"].startswith("Crawl request failed:")
async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient):
"""Test handling of mixed success/failure URLs."""
payload = {
"urls": [
SIMPLE_HTML_URL, # Should succeed
"https://nonexistent-domain-12345.com", # Should fail
"https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail
],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {"threshold": 0.5}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
assert response.status_code == 200
data = response.json()
assert data["success"] is True
assert len(data["results"]) == 3
success_count = 0
failure_count = 0
for result in data["results"]:
if result["success"]:
success_count += 1
else:
failure_count += 1
assert "error_message" in result
assert len(result["error_message"]) > 0
assert success_count >= 1 # At least one should succeed
assert failure_count >= 1 # At least one should fail
async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient):
"""Test streaming with mixed success/failure URLs."""
payload = {
"urls": [
SIMPLE_HTML_URL, # Should succeed
"https://nonexistent-domain-12345.com", # Should fail
],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 2
success_count = 0
failure_count = 0
for result in results:
if result["success"]:
success_count += 1
assert result["url"] == SIMPLE_HTML_URL
else:
failure_count += 1
assert "error_message" in result
assert result["error_message"] is not None
assert success_count == 1
assert failure_count == 1
async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for markdown endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url", "f": "fit"}
response = await async_client.post("/md", json=invalid_payload)
# Should return 400 for invalid URL format
assert response.status_code == 400
# Test non-existent URL
nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"}
response = await async_client.post("/md", json=nonexistent_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for HTML endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/html", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for screenshot endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/screenshot", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for PDF endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url"}
response = await async_client.post("/pdf", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for execute_js endpoint."""
# Test invalid URL
invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]}
response = await async_client.post("/execute_js", json=invalid_payload)
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for LLM endpoint."""
# Test missing query parameter
response = await async_client.get("/llm/https://example.com")
assert response.status_code == 422 # FastAPI validation error, not 400
# Test invalid URL
response = await async_client.get("/llm/invalid-url?q=test")
# Should return 500 for crawl failure
assert response.status_code == 500
async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for ask endpoint."""
# Test invalid context_type
response = await async_client.get("/ask?context_type=invalid")
assert response.status_code == 422 # Validation error
# Test invalid score_ratio
response = await async_client.get("/ask?score_ratio=2.0") # > 1.0
assert response.status_code == 422 # Validation error
# Test invalid max_results
response = await async_client.get("/ask?max_results=0") # < 1
assert response.status_code == 422 # Validation error
async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient):
"""Test error handling for config dump endpoint."""
# Test invalid code
invalid_payload = {"code": "invalid_code"}
response = await async_client.post("/config/dump", json=invalid_payload)
assert response.status_code == 400
# Test nested function calls (not allowed)
nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"}
response = await async_client.post("/config/dump", json=nested_payload)
assert response.status_code == 400
async def test_malformed_request_handling(self, async_client: httpx.AsyncClient):
"""Test handling of malformed requests."""
# Test missing required fields
malformed_payload = {"urls": []} # Missing browser_config and crawler_config
response = await async_client.post("/crawl", json=malformed_payload)
print(f"Response: {response.text}")
assert response.status_code == 422 # Validation error
# Test empty URLs list
empty_urls_payload = {
"urls": [],
"browser_config": {"type": "BrowserConfig", "params": {}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {}}
}
response = await async_client.post("/crawl", json=empty_urls_payload)
assert response.status_code == 422 # "At least one URL required"
if __name__ == "__main__":
# Define arguments for pytest programmatically
# -v: verbose output