feat(docker): add Docker deployment configuration and API server
Add Docker deployment setup with FastAPI server implementation for Crawl4AI:
- Create Dockerfile with Python 3.10 and Playwright dependencies
- Implement FastAPI server with streaming and non-streaming endpoints
- Add request/response models and JSON serialization
- Include test script for API verification

Also includes:
- Update .gitignore for Continue development files
- Add project rules in .continuerules
- Clean up async_dispatcher.py formatting
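The server below imports CrawlRequest and CrawlResponse from a sibling models module (deploy/docker/models.py) that is not part of this diff. A minimal sketch of what those models might look like, assuming Pydantic models with a get_configs() helper that builds BrowserConfig and CrawlerRunConfig from plain keyword arguments; the field names and payload shape here are illustrative, not confirmed by the commit:

# deploy/docker/models.py -- hypothetical sketch, not the committed file.
from typing import Any, Dict, List

from pydantic import BaseModel, Field

from crawl4ai import BrowserConfig, CrawlerRunConfig


class CrawlRequest(BaseModel):
    """Incoming crawl job: a list of URLs plus optional config overrides."""
    urls: List[str]
    browser_config: Dict[str, Any] = Field(default_factory=dict)
    crawler_config: Dict[str, Any] = Field(default_factory=dict)

    def get_configs(self):
        # server.py unpacks this as (browser_config, crawler_config);
        # assumes both crawl4ai config classes accept keyword arguments.
        return (
            BrowserConfig(**self.browser_config),
            CrawlerRunConfig(**self.crawler_config),
        )


class CrawlResponse(BaseModel):
    """Envelope returned by POST /crawl in non-streaming mode."""
    success: bool
    results: List[Dict[str, Any]] = Field(default_factory=list)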
148 deploy/docker/server.py Normal file
@@ -0,0 +1,148 @@
# pyright: ignore
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import json
import asyncio
from typing import AsyncGenerator
from datetime import datetime
from crawl4ai import (
    BrowserConfig,
    CrawlerRunConfig,
    AsyncWebCrawler,
    MemoryAdaptiveDispatcher,
    RateLimiter,
)
from .models import CrawlRequest, CrawlResponse


class CrawlJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder for crawler results"""
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        if hasattr(obj, '__dict__'):
            return {k: v for k, v in obj.__dict__.items() if not k.startswith('_')}
        return str(obj)  # Fallback to string representation


def serialize_result(result) -> dict:
    """Safely serialize a crawler result"""
    try:
        # Convert to dict handling special cases
        if hasattr(result, 'model_dump'):
            result_dict = result.model_dump()
        else:
            result_dict = {
                k: v for k, v in result.__dict__.items()
                if not k.startswith('_')
            }

        # Remove known non-serializable objects
        result_dict.pop('ssl_certificate', None)
        result_dict.pop('downloaded_files', None)

        return result_dict
    except Exception as e:
        print(f"Error serializing result: {e}")
        return {"error": str(e), "url": getattr(result, 'url', 'unknown')}


app = FastAPI(title="Crawl4AI API")


async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream results and manage crawler lifecycle"""
    try:
        async for result in results_gen:
            try:
                # Handle serialization of result
                result_dict = serialize_result(result)
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                yield (json.dumps(result_dict, cls=CrawlJSONEncoder) + "\n").encode('utf-8')
            except Exception as e:
                # Log error but continue streaming
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        # Handle client disconnection gracefully
        print("Client disconnected, cleaning up...")
    finally:
        # Ensure crawler cleanup happens in all cases
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")


@app.post("/crawl")
async def crawl(request: CrawlRequest):
    browser_config, crawler_config = request.get_configs()

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
        # monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )

    try:
        if crawler_config.stream:
            # For streaming, manage crawler lifecycle manually
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()

            results_gen = await crawler.arun_many(
                urls=request.urls,
                config=crawler_config,
                dispatcher=dispatcher
            )

            return StreamingResponse(
                stream_results(crawler, results_gen),
                media_type='application/x-ndjson'
            )
        else:
            # For non-streaming, use context manager
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Handle serialization of results
                results_dict = []
                for result in results:
                    try:
                        result_dict = {
                            k: v for k, v in (result.model_dump() if hasattr(result, 'model_dump')
                                              else result.__dict__).items()
                            if not k.startswith('_')
                        }
                        result_dict.pop('ssl_certificate', None)
                        result_dict.pop('downloaded_files', None)
                        results_dict.append(result_dict)
                    except Exception as e:
                        print(f"Error serializing result: {e}")
                        continue

            return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/schema")
async def get_schema():
    """Return config schemas for client validation"""
    return {
        "browser": BrowserConfig.model_json_schema(),
        "crawler": CrawlerRunConfig.model_json_schema()
    }


if __name__ == "__main__":
    import uvicorn
    # Run in auto reload mode
    # WARNING: You must pass the application as an import string to enable 'reload' or 'workers'.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
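The commit message mentions a test script for API verification, which is not shown in this diff. A minimal sketch of what such a check might look like, assuming the server is reachable at http://localhost:8000, that the request payload follows the hypothetical models sketch above, and that setting "stream": true in crawler_config switches POST /crawl to NDJSON streaming (inferred from the server code, not from the committed test file):

# test_server.py -- hypothetical verification sketch, not the committed test script.
import json

import httpx

BASE_URL = "http://localhost:8000"  # assumed local deployment
URLS = ["https://example.com", "https://example.org"]


def check_schema() -> None:
    # GET /schema should return JSON schemas for both config objects.
    resp = httpx.get(f"{BASE_URL}/schema", timeout=30)
    resp.raise_for_status()
    schema = resp.json()
    assert "browser" in schema and "crawler" in schema


def check_crawl_batch() -> None:
    # Non-streaming: the server collects all results and returns one JSON body.
    payload = {"urls": URLS, "crawler_config": {"stream": False}}
    resp = httpx.post(f"{BASE_URL}/crawl", json=payload, timeout=120)
    resp.raise_for_status()
    body = resp.json()
    assert body["success"] is True
    print(f"Batch mode returned {len(body['results'])} results")


def check_crawl_stream() -> None:
    # Streaming: results arrive as newline-delimited JSON (application/x-ndjson).
    payload = {"urls": URLS, "crawler_config": {"stream": True}}
    with httpx.stream("POST", f"{BASE_URL}/crawl", json=payload, timeout=120) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            result = json.loads(line)
            print(f"Streamed result for {result.get('url')}: success={result.get('success')}")


if __name__ == "__main__":
    check_schema()
    check_crawl_batch()
    check_crawl_stream()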