feat(docker): improve docker error handling

- Return comprehensive error messages along with status codes for API internal errors.
- Fix the fit_html property serialization issue in both the /crawl and /crawl/stream endpoints.
- Add sanitization to ensure fit_html is always JSON-serializable (string or None).
- Add a comprehensive error-handling test suite.
2025-08-26 23:18:35 +05:30
parent cce3390a2d
commit 2ad3fb5fc8
3 changed files with 296 additions and 38 deletions
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -413,6 +413,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
                server_memory_mb = _get_memory_mb()
                result_dict = result.model_dump()
                result_dict['server_memory_mb'] = server_memory_mb
+                # Ensure fit_html is JSON-serializable
+                if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+                    result_dict["fit_html"] = None
                # If PDF exists, encode it to base64
                if result_dict.get('pdf') is not None:
                    result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
@@ -493,6 +496,9 @@ async def handle_crawl_request(
        processed_results = []
        for result in results:
            result_dict = result.model_dump()
+            # if fit_html is not a string, set it to None to avoid serialization errors
+            if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+                result_dict["fit_html"] = None
            # If PDF exists, encode it to base64
            if result_dict.get('pdf') is not None:
                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -267,12 +267,26 @@ async def generate_html(
    Use when you need sanitized HTML structures for building schemas or further processing.
    """
    cfg = CrawlerRunConfig()
-    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
-        results = await crawler.arun(url=body.url, config=cfg)
-    raw_html = results[0].html
-    from crawl4ai.utils import preprocess_html_for_schema
-    processed_html = preprocess_html_for_schema(raw_html)
-    return JSONResponse({"html": processed_html, "url": body.url, "success": True})
+    try:
+        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+            results = await crawler.arun(url=body.url, config=cfg)
+        # Check if the crawl was successful
+        if not results[0].success:
+            raise HTTPException(
+                status_code=500,
+                detail=results[0].error_message or "Crawl failed"
+            )
+
+        raw_html = results[0].html
+        from crawl4ai.utils import preprocess_html_for_schema
+        processed_html = preprocess_html_for_schema(raw_html)
+        return JSONResponse({"html": processed_html, "url": body.url, "success": True})
+    except HTTPException:
+        # Re-raise untouched so the crawl-failure status/detail is not re-wrapped
+        raise
+    except Exception as e:
+        # Any other failure is reported as HTTP 500 with the error text
+        raise HTTPException(status_code=500, detail=str(e)) from e

 # Screenshot endpoint

@@ -290,18 +304,29 @@ async def generate_screenshot(
    Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
    Then in result instead of the screenshot you will get a path to the saved file.
    """
-    cfg = CrawlerRunConfig(
-        screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
-    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
-        results = await crawler.arun(url=body.url, config=cfg)
-    screenshot_data = results[0].screenshot
-    if body.output_path:
-        abs_path = os.path.abspath(body.output_path)
-        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
-        with open(abs_path, "wb") as f:
-            f.write(base64.b64decode(screenshot_data))
-        return {"success": True, "path": abs_path}
-    return {"success": True, "screenshot": screenshot_data}
+    try:
+        cfg = CrawlerRunConfig(
+            screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
+        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+            results = await crawler.arun(url=body.url, config=cfg)
+        if not results[0].success:
+            raise HTTPException(
+                status_code=500,
+                detail=results[0].error_message or "Crawl failed"
+            )
+        screenshot_data = results[0].screenshot
+        if body.output_path:
+            abs_path = os.path.abspath(body.output_path)
+            os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+            with open(abs_path, "wb") as f:
+                f.write(base64.b64decode(screenshot_data))
+            return {"success": True, "path": abs_path}
+        return {"success": True, "screenshot": screenshot_data}
+    except HTTPException:
+        raise  # already carries the intended status and detail; do not re-wrap
+    except Exception as e:
+        # Any other failure is reported as HTTP 500 with the error text
+        raise HTTPException(status_code=500, detail=str(e)) from e

 # PDF endpoint

@@ -319,17 +344,28 @@ async def generate_pdf(
    Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
    Then in result instead of the PDF you will get a path to the saved file.
    """
-    cfg = CrawlerRunConfig(pdf=True)
-    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
-        results = await crawler.arun(url=body.url, config=cfg)
-    pdf_data = results[0].pdf
-    if body.output_path:
-        abs_path = os.path.abspath(body.output_path)
-        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
-        with open(abs_path, "wb") as f:
-            f.write(pdf_data)
-        return {"success": True, "path": abs_path}
-    return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
+    try:
+        cfg = CrawlerRunConfig(pdf=True)
+        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+            results = await crawler.arun(url=body.url, config=cfg)
+        if not results[0].success:
+            raise HTTPException(
+                status_code=500,
+                detail=results[0].error_message or "Crawl failed"
+            )
+        pdf_data = results[0].pdf
+        if body.output_path:
+            abs_path = os.path.abspath(body.output_path)
+            os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+            with open(abs_path, "wb") as f:
+                f.write(pdf_data)
+            return {"success": True, "path": abs_path}
+        return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
+    except HTTPException:
+        raise  # already carries the intended status and detail; do not re-wrap
+    except Exception as e:
+        # Any other failure is reported as HTTP 500 with the error text
+        raise HTTPException(status_code=500, detail=str(e)) from e


@app.post("/execute_js")
@@ -385,12 +421,23 @@ async def execute_js(
        ```

    """
-    cfg = CrawlerRunConfig(js_code=body.scripts)
-    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
-        results = await crawler.arun(url=body.url, config=cfg)
-    # Return JSON-serializable dict of the first CrawlResult
-    data = results[0].model_dump()
-    return JSONResponse(data)
+    try:
+        cfg = CrawlerRunConfig(js_code=body.scripts)
+        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+            results = await crawler.arun(url=body.url, config=cfg)
+        if not results[0].success:
+            raise HTTPException(
+                status_code=500,
+                detail=results[0].error_message or "Crawl failed"
+            )
+        # Return JSON-serializable dict of the first CrawlResult
+        data = results[0].model_dump()
+        return JSONResponse(data)
+    except HTTPException:
+        raise  # already carries the intended status and detail; do not re-wrap
+    except Exception as e:
+        # Any other failure is reported as HTTP 500 with the error text
+        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/llm/{url:path}")
@@ -438,13 +485,16 @@ async def crawl(
    """
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
-    res = await handle_crawl_request(
+    results = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
    )
-    return JSONResponse(res)
+    # check if all of the results are not successful
+    if all(not result["success"] for result in results["results"]):
+        raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
+    return JSONResponse(results)


@app.post("/crawl/stream")
--- a/tests/docker/test_server_requests.py
+++ b/tests/docker/test_server_requests.py
@@ -636,6 +636,208 @@ class TestCrawlEndpoints:
        except Exception as e: # Catch any other unexpected error
            pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")

+
+    # 7. Error Handling Tests
+    async def test_invalid_url_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for invalid URLs."""
+        payload = {
+            "urls": ["invalid-url", "https://nonexistent-domain-12345.com"],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}}
+        }
+        
+        response = await async_client.post("/crawl", json=payload)
+        # Every URL fails, so the endpoint responds with 500 rather than 200 with failed results
+        print(f"Status code: {response.status_code}")
+        print(f"Response: {response.text}")
+        assert response.status_code == 500
+        data = response.json()
+        assert data["detail"].startswith("Crawl request failed:")
+
+    async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient):
+        """Test handling of mixed success/failure URLs."""
+        payload = {
+            "urls": [
+                SIMPLE_HTML_URL,  # Should succeed
+                "https://nonexistent-domain-12345.com",  # Should fail
+                "https://invalid-url-with-special-chars-!@#$%^&*()",  # Should fail
+            ],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig", 
+                "params": {
+                    "cache_mode": CacheMode.BYPASS.value,
+                    "markdown_generator": {
+                        "type": "DefaultMarkdownGenerator",
+                        "params": {
+                            "content_filter": {
+                                "type": "PruningContentFilter",
+                                "params": {"threshold": 0.5}
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        
+        response = await async_client.post("/crawl", json=payload)
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert len(data["results"]) == 3
+        
+        success_count = 0
+        failure_count = 0
+        
+        for result in data["results"]:
+            if result["success"]:
+                success_count += 1
+            else:
+                failure_count += 1
+                assert "error_message" in result
+                assert len(result["error_message"]) > 0
+                
+        assert success_count >= 1  # At least one should succeed
+        assert failure_count >= 1  # At least one should fail
+
+    async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient):
+        """Test streaming with mixed success/failure URLs."""
+        payload = {
+            "urls": [
+                SIMPLE_HTML_URL,  # Should succeed
+                "https://nonexistent-domain-12345.com",  # Should fail
+            ],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig", 
+                "params": {
+                    "stream": True,
+                    "cache_mode": CacheMode.BYPASS.value
+                }
+            }
+        }
+        
+        async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+            response.raise_for_status()
+            results = await process_streaming_response(response)
+        
+        assert len(results) == 2
+        
+        success_count = 0
+        failure_count = 0
+        
+        for result in results:
+            if result["success"]:
+                success_count += 1
+                assert result["url"] == SIMPLE_HTML_URL
+            else:
+                failure_count += 1
+                assert "error_message" in result
+                assert result["error_message"] is not None
+        
+        assert success_count == 1
+        assert failure_count == 1
+
+    async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for markdown endpoint."""
+        # Test invalid URL
+        invalid_payload = {"url": "invalid-url", "f": "fit"}
+        response = await async_client.post("/md", json=invalid_payload)
+        # Should return 400 for invalid URL format
+        assert response.status_code == 400
+        
+        # Test non-existent URL
+        nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"}
+        response = await async_client.post("/md", json=nonexistent_payload)
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for HTML endpoint."""
+        # Test invalid URL
+        invalid_payload = {"url": "invalid-url"}
+        response = await async_client.post("/html", json=invalid_payload)
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for screenshot endpoint."""
+        # Test invalid URL
+        invalid_payload = {"url": "invalid-url"}
+        response = await async_client.post("/screenshot", json=invalid_payload)
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for PDF endpoint."""
+        # Test invalid URL
+        invalid_payload = {"url": "invalid-url"}
+        response = await async_client.post("/pdf", json=invalid_payload)
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for execute_js endpoint."""
+        # Test invalid URL
+        invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]}
+        response = await async_client.post("/execute_js", json=invalid_payload)
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for LLM endpoint."""
+        # Test missing query parameter
+        response = await async_client.get("/llm/https://example.com")
+        assert response.status_code == 422  # FastAPI validation error, not 400
+        
+        # Test invalid URL
+        response = await async_client.get("/llm/invalid-url?q=test")
+        # Should return 500 for crawl failure
+        assert response.status_code == 500
+
+    async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for ask endpoint."""
+        # Test invalid context_type
+        response = await async_client.get("/ask?context_type=invalid")
+        assert response.status_code == 422  # Validation error
+        
+        # Test invalid score_ratio
+        response = await async_client.get("/ask?score_ratio=2.0")  # > 1.0
+        assert response.status_code == 422  # Validation error
+        
+        # Test invalid max_results
+        response = await async_client.get("/ask?max_results=0")  # < 1
+        assert response.status_code == 422  # Validation error
+
+    async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient):
+        """Test error handling for config dump endpoint."""
+        # Test invalid code
+        invalid_payload = {"code": "invalid_code"}
+        response = await async_client.post("/config/dump", json=invalid_payload)
+        assert response.status_code == 400
+        
+        # Test nested function calls (not allowed)
+        nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"}
+        response = await async_client.post("/config/dump", json=nested_payload)
+        assert response.status_code == 400
+
+    async def test_malformed_request_handling(self, async_client: httpx.AsyncClient):
+        """Test handling of malformed requests."""
+        # Test missing required fields
+        malformed_payload = {"urls": []}  # Missing browser_config and crawler_config
+        response = await async_client.post("/crawl", json=malformed_payload)
+        print(f"Response: {response.text}")
+        assert response.status_code == 422  # Validation error
+        
+        # Test empty URLs list
+        empty_urls_payload = {
+            "urls": [],
+            "browser_config": {"type": "BrowserConfig", "params": {}},
+            "crawler_config": {"type": "CrawlerRunConfig", "params": {}}
+        }
+        response = await async_client.post("/crawl", json=empty_urls_payload)
+        assert response.status_code == 422  # NOTE(review): the handler raises 400 "At least one URL required"; 422 implies request validation rejects [] first - confirm
+
 if __name__ == "__main__":
    # Define arguments for pytest programmatically
    # -v: verbose output