feat(docker): add Docker deployment configuration and API server
Add Docker deployment setup with FastAPI server implementation for Crawl4AI: - Create Dockerfile with Python 3.10 and Playwright dependencies - Implement FastAPI server with streaming and non-streaming endpoints - Add request/response models and JSON serialization - Include test script for API verification Also includes: - Update .gitignore for Continue development files - Add project rules in .continuerules - Clean up async_dispatcher.py formatting
This commit is contained in:
18
deploy/docker/Dockerfile
Normal file
18
deploy/docker/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
||||
# Crawl4AI API server image.
FROM python:3.10-slim

# System dependencies needed to build wheels and fetch browser binaries.
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# BUG FIX: the `playwright` CLI only exists after `pip install` above, so the
# browser download must happen AFTER the requirements are installed — the
# original ran it before any pip install and the build failed.
RUN playwright install --with-deps chromium

COPY . .

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
78
deploy/docker/models.py
Normal file
78
deploy/docker/models.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from typing import List, Optional, Any, Dict
|
||||
from pydantic import BaseModel
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
BM25ContentFilter,
|
||||
LLMContentFilter,
|
||||
# Add other strategy classes as needed
|
||||
)
|
||||
|
||||
class StrategyConfig(BaseModel):
    """Serializable description of a strategy: a class name plus its kwargs.

    Values inside ``params`` may themselves be nested strategy descriptions
    (dicts carrying a ``'type'`` key); those are instantiated recursively.
    """
    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Instantiate the strategy class named by ``self.type`` with ``self.params``.

        Raises:
            ValueError: if ``self.type`` is not a known strategy name.
        """
        registry = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,

            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,

            # Extend this registry as more strategies are exposed.
        }

        target_cls = registry.get(self.type)
        if target_cls is None:
            raise ValueError(f"Unknown strategy type: {self.type}")

        def _materialize(raw):
            # A dict carrying a 'type' key is a nested strategy description;
            # anything else is passed through untouched.
            if isinstance(raw, dict) and 'type' in raw:
                nested = StrategyConfig(type=raw['type'], params=raw.get('params', {}))
                return nested.create_instance()
            return raw

        kwargs = {name: _materialize(raw) for name, raw in self.params.items()}
        return target_cls(**kwargs)
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Payload for the /crawl endpoint.

    ``browser_config`` / ``crawler_config`` are plain dicts mirroring the
    kwargs of BrowserConfig / CrawlerRunConfig; values shaped like
    ``{"type": ..., "params": {...}}`` are resolved into strategy instances.
    """
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Convert the raw request dicts into config objects.

        Returns:
            Tuple of (BrowserConfig, CrawlerRunConfig).

        Raises:
            ValueError: if a nested strategy dict names an unknown type.
        """
        browser_config = BrowserConfig.from_kwargs(self.browser_config or {})

        # BUG FIX: copy before rewriting values — the original assigned the
        # model's own dict and then mutated it, replacing JSON data with live
        # strategy instances and making get_configs() non-idempotent.
        crawler_dict = dict(self.crawler_config or {})

        # Process strategy configurations
        for key, value in crawler_dict.items():
            if isinstance(value, dict) and 'type' in value:
                # Convert strategy configuration to actual instance
                crawler_dict[key] = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {})
                ).create_instance()

        crawler_config = CrawlerRunConfig.from_kwargs(crawler_dict)
        return browser_config, crawler_config
|
||||
|
||||
class CrawlResponse(BaseModel):
    """Response envelope for non-streaming /crawl calls."""
    # True when the crawl batch completed without a fatal error.
    success: bool
    results: List[dict]  # Will contain serialized CrawlResults

    class Config:
        # Result dicts may carry values pydantic cannot natively validate.
        arbitrary_types_allowed = True
|
||||
3
deploy/docker/requirements.txt
Normal file
3
deploy/docker/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
crawl4ai
|
||||
fastapi
|
||||
uvicorn
|
||||
148
deploy/docker/server.py
Normal file
148
deploy/docker/server.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# pyright: ignore
import asyncio
import json
from datetime import datetime
from typing import AsyncGenerator

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    MemoryAdaptiveDispatcher,
    RateLimiter,
)

# BUG FIX: the Dockerfile launches this file as a top-level module
# (`uvicorn server:app`), where the relative import `from .models import ...`
# raises "attempted relative import with no known parent package".
# Fall back to an absolute import when not running inside a package.
try:
    from .models import CrawlRequest, CrawlResponse
except ImportError:
    from models import CrawlRequest, CrawlResponse
|
||||
|
||||
class CrawlJSONEncoder(json.JSONEncoder):
    """JSON encoder that knows how to flatten crawler result objects.

    Handles datetimes (ISO-8601), raw bytes (lossy UTF-8 decode), pydantic
    models (via ``model_dump``) and plain objects (public attributes only);
    anything else falls back to ``str()``.
    """

    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        dumper = getattr(obj, 'model_dump', None)
        if dumper is not None:
            return dumper()
        state = getattr(obj, '__dict__', None)
        if state is not None:
            # Only expose public attributes.
            return {name: value for name, value in state.items() if not name.startswith('_')}
        # Fallback: a readable, if lossy, representation.
        return str(obj)
|
||||
|
||||
def serialize_result(result) -> dict:
    """Convert a crawler result into a JSON-friendly dict, never raising.

    Prefers pydantic's ``model_dump()``; otherwise snapshots the object's
    public attributes. Known non-serializable fields are stripped. On any
    failure a minimal error record is returned instead of propagating.
    """
    try:
        if hasattr(result, 'model_dump'):
            payload = result.model_dump()
        else:
            payload = {
                name: value
                for name, value in vars(result).items()
                if not name.startswith('_')
            }

        # These fields hold objects json.dumps cannot handle.
        for unserializable in ('ssl_certificate', 'downloaded_files'):
            payload.pop(unserializable, None)

        return payload
    except Exception as exc:
        print(f"Error serializing result: {exc}")
        return {"error": str(exc), "url": getattr(result, 'url', 'unknown')}
|
||||
|
||||
# FastAPI application object served by uvicorn (see Dockerfile CMD).
app = FastAPI(title="Crawl4AI API")
|
||||
|
||||
async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream crawl results as NDJSON and manage the crawler's lifecycle.

    Yields one UTF-8 JSON line per result. A serialization failure for a
    single result is reported inline as an error record and does not stop
    the stream. The crawler passed in is always closed — on normal
    completion, on error, and on client disconnect (CancelledError).
    """
    try:
        async for result in results_gen:
            try:
                # Handle serialization of result (strips non-serializable fields).
                result_dict = serialize_result(result)
                # NOTE(review): if serialize_result returned its error record,
                # the 'success' key access below raises KeyError, which the
                # except branch turns into an inline error line — the stream
                # continues either way.
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                yield (json.dumps(result_dict, cls=CrawlJSONEncoder) + "\n").encode('utf-8')
            except Exception as e:
                # Log error but continue streaming remaining results.
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        # Handle client disconnection gracefully; fall through to cleanup.
        print("Client disconnected, cleaning up...")
    finally:
        # Ensure crawler cleanup happens in all cases.
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")
|
||||
|
||||
@app.post("/crawl")
async def crawl(request: CrawlRequest):
    """Crawl one or more URLs.

    When ``crawler_config.stream`` is true, responds with NDJSON where each
    line is one serialized result; otherwise returns a single CrawlResponse
    containing every result. Configuration and crawl failures surface as
    HTTP 500 with the error message in ``detail``.
    """
    browser_config, crawler_config = request.get_configs()

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
        # monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )

    try:
        if crawler_config.stream:
            # For streaming, manage the crawler lifecycle manually: it must
            # outlive this handler and is closed by stream_results() when the
            # stream ends or the client disconnects.
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()

            try:
                results_gen = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
            except Exception:
                # BUG FIX: without this, a failure between start() and the
                # StreamingResponse hand-off leaked the running browser.
                await crawler.close()
                raise

            return StreamingResponse(
                stream_results(crawler, results_gen),
                media_type='application/x-ndjson'
            )
        else:
            # For non-streaming, the context manager handles start/close.
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Reuse the shared serializer instead of duplicating its
                # field-stripping logic inline; a result that cannot be
                # serialized yields an inline error record, matching the
                # streaming path's behavior.
                results_dict = [serialize_result(result) for result in results]

            return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/schema")
async def get_schema():
    """Expose the config JSON schemas so clients can validate requests."""
    config_classes = {
        "browser": BrowserConfig,
        "crawler": CrawlerRunConfig,
    }
    return {name: cfg.model_json_schema() for name, cfg in config_classes.items()}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn
    # Run in auto reload mode (local development convenience).
    # WARNING: You must pass the application as an import string to enable 'reload' or 'workers'.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
|
||||
108
deploy/docker/test.py
Normal file
108
deploy/docker/test.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import httpx
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
async def test_regular():
    """Exercise the non-streaming /crawl endpoint and print each result."""
    payload = {
        "urls": ["https://example.com"] * 3,  # Test with 3 identical URLs
        "browser_config": {
            "headless": True,
            "verbose": False
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "stream": False
        }
    }
    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/crawl", json=payload)

    results = response.json()
    print("\nRegular Response:")
    print(f"Got {len(results['results'])} results at once")
    for result in results['results']:
        print(f"URL: {result['url']}, Success: {result['success']}")
|
||||
|
||||
async def test_streaming():
    """Test streaming API call line by line.

    BUG FIX: a plain ``client.post()`` reads and buffers the entire response
    body before ``aiter_lines()`` ever runs, so the old version never
    exercised incremental streaming. ``client.stream()`` keeps the response
    open and consumes the NDJSON lines as the server produces them.
    """
    payload = {
        "urls": ["https://example.com"] * 3,
        "browser_config": {
            "headless": True,
            "verbose": False
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "stream": True
        }
    }
    async with httpx.AsyncClient() as client:
        try:
            async with client.stream(
                "POST",
                "http://localhost:8000/crawl",
                json=payload,
                timeout=30.0
            ) as response:
                print("\nStreaming Response:")
                async for line in response.aiter_lines():
                    if not line.strip():
                        continue
                    try:
                        result = json.loads(line)
                        print(f"Received result for URL: {result['url']}, Success: {result['success']}")
                    except json.JSONDecodeError as e:
                        # Skip malformed lines but keep consuming the stream.
                        print(f"Error decoding response: {e}")
                        continue
        except Exception as e:
            print(f"Error during streaming: {e}")
|
||||
|
||||
async def test_complex_config():
    """Exercise /crawl with nested strategy configurations."""
    # Nested {"type", "params"} dicts are resolved server-side into real
    # strategy instances (markdown generator wrapping a content filter).
    payload = {
        "urls": ["https://en.wikipedia.org/wiki/Apple"],
        "browser_config": {
            "headless": True,
            "verbose": False
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "excluded_tags": ["nav", "footer", "aside"],
            "remove_overlay_elements": True,
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {
                            "threshold": 0.48,
                            "threshold_type": "fixed",
                            "min_word_threshold": 0
                        }
                    },
                    "options": {"ignore_links": True}
                }
            }
        }
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl", timeout=30.0, json=payload
        )

    result = response.json()
    if result['success']:
        for r in result['results']:
            print(f"Full Markdown Length: {len(r['markdown_v2']['raw_markdown'])}")
            print(f"Fit Markdown Length: {len(r['markdown_v2']['fit_markdown'])}")
|
||||
|
||||
async def main():
    """Run the API tests (only the complex-config test is currently enabled)."""
    print("Testing Crawl4AI API...")

    # Tests 1 and 2 are intentionally disabled; re-enable as needed.
    # print("\n1. Testing regular (non-streaming) endpoint...")
    # await test_regular()

    # print("\n2. Testing streaming endpoint...")
    # await test_streaming()

    print("\n3. Testing complex configuration...")
    await test_complex_config()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async tests on a fresh event loop.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user