diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 53359e1f..850f5fd6 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -413,6 +413,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) server_memory_mb = _get_memory_mb() result_dict = result.model_dump() result_dict['server_memory_mb'] = server_memory_mb + # Ensure fit_html is JSON-serializable + if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)): + result_dict["fit_html"] = None # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') @@ -493,6 +496,9 @@ async def handle_crawl_request( processed_results = [] for result in results: result_dict = result.model_dump() + # if fit_html is not a string, set it to None to avoid serialization errors + if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)): + result_dict["fit_html"] = None # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') diff --git a/deploy/docker/server.py b/deploy/docker/server.py index e453758a..b79324da 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -267,12 +267,26 @@ async def generate_html( Use when you need sanitized HTML structures for building schemas or further processing. 
""" cfg = CrawlerRunConfig() - async with AsyncWebCrawler(config=BrowserConfig()) as crawler: - results = await crawler.arun(url=body.url, config=cfg) - raw_html = results[0].html - from crawl4ai.utils import preprocess_html_for_schema - processed_html = preprocess_html_for_schema(raw_html) - return JSONResponse({"html": processed_html, "url": body.url, "success": True}) + try: + async with AsyncWebCrawler(config=BrowserConfig()) as crawler: + results = await crawler.arun(url=body.url, config=cfg) + # Check if the crawl was successful + if not results[0].success: + raise HTTPException( + status_code=500, + detail=results[0].error_message or "Crawl failed" + ) + + raw_html = results[0].html + from crawl4ai.utils import preprocess_html_for_schema + processed_html = preprocess_html_for_schema(raw_html) + return JSONResponse({"html": processed_html, "url": body.url, "success": True}) + except Exception as e: + # Report as HTTP 500; keep an inner HTTPException's own detail, stringify anything else + raise HTTPException( + status_code=500, + detail=e.detail if isinstance(e, HTTPException) else str(e) + ) # Screenshot endpoint @@ -290,18 +304,29 @@ async def generate_screenshot( Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot. Then in result instead of the screenshot you will get a path to the saved file. 
""" - cfg = CrawlerRunConfig( - screenshot=True, screenshot_wait_for=body.screenshot_wait_for) - async with AsyncWebCrawler(config=BrowserConfig()) as crawler: - results = await crawler.arun(url=body.url, config=cfg) - screenshot_data = results[0].screenshot - if body.output_path: - abs_path = os.path.abspath(body.output_path) - os.makedirs(os.path.dirname(abs_path), exist_ok=True) - with open(abs_path, "wb") as f: - f.write(base64.b64decode(screenshot_data)) - return {"success": True, "path": abs_path} - return {"success": True, "screenshot": screenshot_data} + try: + cfg = CrawlerRunConfig( + screenshot=True, screenshot_wait_for=body.screenshot_wait_for) + async with AsyncWebCrawler(config=BrowserConfig()) as crawler: + results = await crawler.arun(url=body.url, config=cfg) + if not results[0].success: + raise HTTPException( + status_code=500, + detail=results[0].error_message or "Crawl failed" + ) + screenshot_data = results[0].screenshot + if body.output_path: + abs_path = os.path.abspath(body.output_path) + os.makedirs(os.path.dirname(abs_path), exist_ok=True) + with open(abs_path, "wb") as f: + f.write(base64.b64decode(screenshot_data)) + return {"success": True, "path": abs_path} + return {"success": True, "screenshot": screenshot_data} + except Exception as e: + raise HTTPException( + status_code=500, + detail=e.detail if isinstance(e, HTTPException) else str(e) # keep the inner HTTPException's own detail + ) # PDF endpoint @@ -319,17 +344,28 @@ async def generate_pdf( Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF. Then in result instead of the PDF you will get a path to the saved file. 
""" - cfg = CrawlerRunConfig(pdf=True) - async with AsyncWebCrawler(config=BrowserConfig()) as crawler: - results = await crawler.arun(url=body.url, config=cfg) - pdf_data = results[0].pdf - if body.output_path: - abs_path = os.path.abspath(body.output_path) - os.makedirs(os.path.dirname(abs_path), exist_ok=True) - with open(abs_path, "wb") as f: - f.write(pdf_data) - return {"success": True, "path": abs_path} - return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} + try: + cfg = CrawlerRunConfig(pdf=True) + async with AsyncWebCrawler(config=BrowserConfig()) as crawler: + results = await crawler.arun(url=body.url, config=cfg) + if not results[0].success: + raise HTTPException( + status_code=500, + detail=results[0].error_message or "Crawl failed" + ) + pdf_data = results[0].pdf + if body.output_path: + abs_path = os.path.abspath(body.output_path) + os.makedirs(os.path.dirname(abs_path), exist_ok=True) + with open(abs_path, "wb") as f: + f.write(pdf_data) + return {"success": True, "path": abs_path} + return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} + except Exception as e: + raise HTTPException( + status_code=500, + detail=e.detail if isinstance(e, HTTPException) else str(e) # keep the inner HTTPException's own detail + ) @app.post("/execute_js") @@ -385,12 +421,23 @@ async def execute_js( ``` """ - cfg = CrawlerRunConfig(js_code=body.scripts) - async with AsyncWebCrawler(config=BrowserConfig()) as crawler: - results = await crawler.arun(url=body.url, config=cfg) - # Return JSON-serializable dict of the first CrawlResult - data = results[0].model_dump() - return JSONResponse(data) + try: + cfg = CrawlerRunConfig(js_code=body.scripts) + async with AsyncWebCrawler(config=BrowserConfig()) as crawler: + results = await crawler.arun(url=body.url, config=cfg) + if not results[0].success: + raise HTTPException( + status_code=500, + detail=results[0].error_message or "Crawl failed" + ) + # Return JSON-serializable dict of the first CrawlResult + data = results[0].model_dump() + return JSONResponse(data) + except Exception as 
e: + raise HTTPException( + status_code=500, + detail=e.detail if isinstance(e, HTTPException) else str(e) # keep the inner HTTPException's own detail + ) @app.get("/llm/{url:path}") @@ -438,13 +485,16 @@ async def crawl( """ if not crawl_request.urls: raise HTTPException(400, "At least one URL required") - res = await handle_crawl_request( + results = await handle_crawl_request( urls=crawl_request.urls, browser_config=crawl_request.browser_config, crawler_config=crawl_request.crawler_config, config=config, ) - return JSONResponse(res) + # Fail the request only when there is at least one result and every result failed + if results["results"] and all(not result["success"] for result in results["results"]): + raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}") + return JSONResponse(results) @app.post("/crawl/stream") diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py index 56d2ada4..034a2e86 100644 --- a/tests/docker/test_server_requests.py +++ b/tests/docker/test_server_requests.py @@ -635,7 +635,209 @@ class TestCrawlEndpoints: pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") except Exception as e: # Catch any other unexpected error pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}") - + + + # 7. 
Error Handling Tests + async def test_invalid_url_handling(self, async_client: httpx.AsyncClient): + """Test error handling for invalid URLs.""" + payload = { + "urls": ["invalid-url", "https://nonexistent-domain-12345.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}} + } + + response = await async_client.post("/crawl", json=payload) + # Every URL fails, so the endpoint should answer 500 rather than 200 with failed results + print(f"Status code: {response.status_code}") + print(f"Response: {response.text}") + assert response.status_code == 500 + data = response.json() + assert data["detail"].startswith("Crawl request failed:") + + async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient): + """Test handling of mixed success/failure URLs.""" + payload = { + "urls": [ + SIMPLE_HTML_URL, # Should succeed + "https://nonexistent-domain-12345.com", # Should fail + "https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail + ], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": {"threshold": 0.5} + } + } + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert len(data["results"]) == 3 + + success_count = 0 + failure_count = 0 + + for result in data["results"]: + if result["success"]: + success_count += 1 + else: + failure_count += 1 + assert "error_message" in result + assert len(result["error_message"]) > 0 + + assert success_count >= 1 # At least one should succeed + assert failure_count >= 1 # At least one should fail + + async def 
test_streaming_mixed_urls(self, async_client: httpx.AsyncClient): + """Test streaming with mixed success/failure URLs.""" + payload = { + "urls": [ + SIMPLE_HTML_URL, # Should succeed + "https://nonexistent-domain-12345.com", # Should fail + ], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, + "cache_mode": CacheMode.BYPASS.value + } + } + } + + async with async_client.stream("POST", "/crawl/stream", json=payload) as response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == 2 + + success_count = 0 + failure_count = 0 + + for result in results: + if result["success"]: + success_count += 1 + assert result["url"] == SIMPLE_HTML_URL + else: + failure_count += 1 + assert "error_message" in result + assert result["error_message"] is not None + + assert success_count == 1 + assert failure_count == 1 + + async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for markdown endpoint.""" + # Test invalid URL + invalid_payload = {"url": "invalid-url", "f": "fit"} + response = await async_client.post("/md", json=invalid_payload) + # Should return 400 for invalid URL format + assert response.status_code == 400 + + # Test non-existent URL + nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"} + response = await async_client.post("/md", json=nonexistent_payload) + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for HTML endpoint.""" + # Test invalid URL + invalid_payload = {"url": "invalid-url"} + response = await async_client.post("/html", json=invalid_payload) + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def 
test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for screenshot endpoint.""" + # Test invalid URL + invalid_payload = {"url": "invalid-url"} + response = await async_client.post("/screenshot", json=invalid_payload) + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for PDF endpoint.""" + # Test invalid URL + invalid_payload = {"url": "invalid-url"} + response = await async_client.post("/pdf", json=invalid_payload) + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for execute_js endpoint.""" + # Test invalid URL + invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]} + response = await async_client.post("/execute_js", json=invalid_payload) + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for LLM endpoint.""" + # Test missing query parameter + response = await async_client.get("/llm/https://example.com") + assert response.status_code == 422 # FastAPI validation error, not 400 + + # Test invalid URL + response = await async_client.get("/llm/invalid-url?q=test") + # Should return 500 for crawl failure + assert response.status_code == 500 + + async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for ask endpoint.""" + # Test invalid context_type + response = await async_client.get("/ask?context_type=invalid") + assert response.status_code == 422 # Validation error + + # Test invalid score_ratio + response = await async_client.get("/ask?score_ratio=2.0") # > 1.0 + assert response.status_code == 422 # Validation error + + 
# Test invalid max_results + response = await async_client.get("/ask?max_results=0") # < 1 + assert response.status_code == 422 # Validation error + + async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient): + """Test error handling for config dump endpoint.""" + # Test invalid code + invalid_payload = {"code": "invalid_code"} + response = await async_client.post("/config/dump", json=invalid_payload) + assert response.status_code == 400 + + # Test nested function calls (not allowed) + nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"} + response = await async_client.post("/config/dump", json=nested_payload) + assert response.status_code == 400 + + async def test_malformed_request_handling(self, async_client: httpx.AsyncClient): + """Test handling of malformed requests.""" + # Test missing required fields + malformed_payload = {"urls": []} # Missing browser_config and crawler_config + response = await async_client.post("/crawl", json=malformed_payload) + print(f"Response: {response.text}") + assert response.status_code == 422 # Validation error + + # Test empty URLs list + empty_urls_payload = { + "urls": [], + "browser_config": {"type": "BrowserConfig", "params": {}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {}} + } + response = await async_client.post("/crawl", json=empty_urls_payload) + assert response.status_code == 422 # NOTE(review): the handler's "At least one URL required" check raises 400; 422 presumably comes from request validation running first - confirm + if __name__ == "__main__": # Define arguments for pytest programmatically # -v: verbose output