feat(docker): enhance Docker deployment setup and configuration

Add comprehensive Docker deployment configuration with:
- New .dockerignore and .llm.env.example files
- Enhanced Dockerfile with multi-stage build and optimizations
- Detailed README with setup instructions and environment configurations
- Improved requirements.txt with Gunicorn
- Better error handling in async_configs.py

BREAKING CHANGE: Docker deployment now requires .llm.env file for API keys
This commit is contained in:
UncleCode
2025-02-01 19:33:27 +08:00
parent 20920fa17b
commit 2f15976b34
10 changed files with 324 additions and 121 deletions

1
.gitignore vendored
View File

@@ -250,3 +250,4 @@ continue_config.json
.prompts/
.llm.env

View File

@@ -1,3 +1,3 @@
# crawl4ai/_version.py
# __version__ = "0.4.3b3"
__version__ = "0.4.300b4"
__version__ = "0.4.300"

View File

@@ -1,3 +1,4 @@
from regex import B
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -359,7 +360,7 @@ class BrowserConfig():
@staticmethod
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
return from_serializable_dict(data)
return from_serializable_dict(data) if data else BrowserConfig()
class CrawlerRunConfig():
@@ -794,7 +795,7 @@ class CrawlerRunConfig():
@staticmethod
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
return from_serializable_dict(data)
return from_serializable_dict(data) if data else CrawlerRunConfig()
def to_dict(self):
return {

View File

@@ -0,0 +1,31 @@
# .dockerignore
*
# Allow specific files and directories when using local installation
!crawl4ai/
!docs/
!deploy/docker/
!setup.py
!pyproject.toml
!README.md
!LICENSE
!MANIFEST.in
!setup.cfg
!mkdocs.yml
.git/
__pycache__/
*.pyc
*.pyo
*.pyd
.DS_Store
.env
.venv
venv/
tests/
coverage.xml
*.log
*.swp
*.egg-info/
dist/
build/

View File

@@ -0,0 +1,8 @@
# LLM Provider Keys
OPENAI_API_KEY=your_openai_key_here
DEEPSEEK_API_KEY=your_deepseek_key_here
ANTHROPIC_API_KEY=your_anthropic_key_here
GROQ_API_KEY=your_groq_key_here
TOGETHER_API_KEY=your_together_key_here
MISTRAL_API_KEY=your_mistral_key_here
GEMINI_API_TOKEN=your_gemini_key_here

View File

@@ -1,18 +1,174 @@
FROM python:3.10-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
ARG GITHUB_BRANCH=main
ARG USE_LOCAL=false
# 🤓 Environment variables - because who doesn't love a good ENV party?
ENV PYTHONFAULTHANDLER=1 \
PYTHONHASHSEED=random \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_DEFAULT_TIMEOUT=100 \
DEBIAN_FRONTEND=noninteractive
# Other build arguments
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
ARG TARGETARCH
# 🎯 Platform-specific labels - because even containers need ID badges
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
# 📦 Installing system dependencies... please hold, your package is being delivered
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
wget \
gnupg \
git \
cmake \
pkg-config \
python3-dev \
libjpeg-dev \
&& rm -rf /var/lib/apt/lists/*
# Install Playwright dependencies
# 🎭 Playwright dependencies - because browsers need their vitamins too
RUN apt-get update && apt-get install -y --no-install-recommends \
libglib2.0-0 \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libdbus-1-3 \
libxcb1 \
libxkbcommon0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# 🎮 GPU support - because sometimes CPU just doesn't cut it
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
# 🏗️ Platform-specific optimizations - because one size doesn't fit all
RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \
&& rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \
&& rm -rf /var/lib/apt/lists/*; \
fi
WORKDIR ${APP_HOME}
# 🔄 Installation script - now with retry logic because sometimes Git needs a coffee break
RUN echo '#!/bin/bash\n\
if [ "$USE_LOCAL" = "true" ]; then\n\
echo "📦 Installing from local source..."\n\
pip install --no-cache-dir /tmp/project/\n\
else\n\
echo "🌐 Installing from GitHub..."\n\
for i in {1..3}; do \n\
git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
{ echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
done\n\
pip install --no-cache-dir /tmp/crawl4ai\n\
fi' > /tmp/install.sh && chmod +x /tmp/install.sh
# Copy local project if USE_LOCAL is true
COPY . /tmp/project/
# Copy and install other requirements
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install ML dependencies first for better layer caching
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
scikit-learn \
nltk \
transformers \
tokenizers && \
python -m nltk.downloader punkt stopwords ; \
fi
# Install the package
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install "/tmp/project/[all]" && \
python -m crawl4ai.model_loader ; \
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
pip install "/tmp/project/[torch]" ; \
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
pip install "/tmp/project/[transformer]" && \
python -m crawl4ai.model_loader ; \
else \
pip install "/tmp/project" ; \
fi
# 🚀 Installation validation - trust but verify!
RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy application files
COPY deploy/docker/* ${APP_HOME}/
COPY . .
# 🏥 Health check - now with memory validation!
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
if [ $MEM -lt 2048 ]; then \
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
exit 1; \
fi && \
curl -f http://localhost:8000/health || exit 1'
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
# Entrypoint script
COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["docker-entrypoint.sh"]
# Default command - may the server be with you! 🚀
CMD ["gunicorn", \
"--bind", "0.0.0.0:8000", \
"--workers", "4", \
"--threads", "2", \
"--timeout", "120", \
"--graceful-timeout", "30", \
"--log-level", "info", \
"--worker-class", "uvicorn.workers.UvicornWorker", \
"server:app"]

113
deploy/docker/README.md Normal file
View File

@@ -0,0 +1,113 @@
# Crawl4AI Docker Setup
## Quick Start
1. Build the Docker image:
```bash
docker build -t crawl4ai-server:prod .
```
2. Run the container:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
--name crawl4ai \
crawl4ai-server:prod
```
---
## Configuration Options
### 1. **Using .llm.env File**
Create a `.llm.env` file with your API keys:
```bash
OPENAI_API_KEY=sk-your-key
DEEPSEEK_API_KEY=your-deepseek-key
```
Run with:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
crawl4ai-server:prod
```
### 2. **Direct Environment Variables**
Pass keys directly:
```bash
docker run -d -p 8000:8000 \
-e OPENAI_API_KEY="sk-your-key" \
-e DEEPSEEK_API_KEY="your-deepseek-key" \
crawl4ai-server:prod
```
### 3. **Copy Host Environment Variables**
Pass the `--copy-env` flag to the container's entrypoint to copy `.llm.env` from the host. Note that it must come *after* the image name — anything before the image name is parsed by `docker run` itself, which would reject `--copy-env` as an unknown option:
```bash
docker run -d -p 8000:8000 \
  crawl4ai-server:prod \
  --copy-env
```
### 4. **Advanced: Docker Compose**
Create a `docker-compose.yml`:
```yaml
version: '3.8'
services:
crawl4ai:
image: crawl4ai-server:prod
ports:
- "8000:8000"
env_file:
- .llm.env
restart: unless-stopped
```
Run with:
```bash
docker-compose up -d
```
---
## Supported Environment Variables
| Variable | Description |
|------------------------|--------------------------------------|
| `OPENAI_API_KEY` | OpenAI API key |
| `DEEPSEEK_API_KEY` | DeepSeek API key |
| `ANTHROPIC_API_KEY` | Anthropic API key |
| `GROQ_API_KEY` | Groq API key |
| `TOGETHER_API_KEY` | Together API key |
| `LLAMA_CLOUD_API_KEY` | Llama Cloud API key |
| `COHERE_API_KEY` | Cohere API key |
| `MISTRAL_API_KEY` | Mistral API key |
| `PERPLEXITY_API_KEY` | Perplexity API key |
| `VERTEXAI_PROJECT_ID` | Google Vertex AI project ID |
| `VERTEXAI_LOCATION` | Google Vertex AI location |
---
## Healthcheck
The container includes a healthcheck:
```bash
curl http://localhost:8000/health
```
---
## Troubleshooting
1. **Missing Keys**: Ensure all required keys are set in `.llm.env`.
2. **Permissions**: Run `chmod +x docker-entrypoint.sh` if permissions are denied.
3. **Logs**: Check logs with:
```bash
docker logs crawl4ai
```
---
## Security Best Practices
- Never commit `.llm.env` to version control.
- Use Docker secrets in production (Swarm/K8s).
- Rotate keys regularly.

View File

@@ -1,3 +1,4 @@
crawl4ai
fastapi
uvicorn
uvicorn
gunicorn>=23.0.0

View File

@@ -1,108 +0,0 @@
import httpx
import asyncio
import json
async def test_regular():
    """Test the /crawl endpoint in non-streaming (batch) mode.

    Sends 3 identical URLs in one request and expects all results back in a
    single JSON response once the whole batch has been crawled.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            # Match the 30s timeout used by the other tests; httpx's 5s
            # default total timeout is too short for a real 3-URL crawl.
            timeout=30.0,
            json={
                "urls": ["https://example.com"] * 3,  # Test with 3 identical URLs
                "browser_config": {
                    "headless": True,
                    "verbose": False
                },
                "crawler_config": {
                    "cache_mode": "BYPASS",
                    "stream": False  # batch mode: one JSON body with all results
                }
            },
        )
        # Fail loudly on HTTP errors instead of a confusing KeyError below.
        response.raise_for_status()
        results = response.json()
        print("\nRegular Response:")
        print(f"Got {len(results['results'])} results at once")
        for result in results['results']:
            print(f"URL: {result['url']}, Success: {result['success']}")
async def test_streaming():
    """Test the /crawl endpoint in streaming (line-delimited JSON) mode.

    Uses ``client.stream()`` so each line is consumed as the server emits
    it. A plain ``client.post()`` reads and buffers the ENTIRE response
    body before returning, so iterating ``aiter_lines()`` on its result
    never streams incrementally — which defeats the purpose of this test.
    """
    async with httpx.AsyncClient() as client:
        try:
            async with client.stream(
                "POST",
                "http://localhost:8000/crawl",
                json={
                    "urls": ["https://example.com"] * 3,
                    "browser_config": {
                        "headless": True,
                        "verbose": False
                    },
                    "crawler_config": {
                        "cache_mode": "BYPASS",
                        "stream": True  # server emits one JSON object per line
                    }
                },
                timeout=30.0,
            ) as response:
                print("\nStreaming Response:")
                async for line in response.aiter_lines():
                    if not line.strip():
                        continue  # skip blank keep-alive lines
                    try:
                        result = json.loads(line)
                        print(f"Received result for URL: {result['url']}, Success: {result['success']}")
                    except json.JSONDecodeError as e:
                        # Tolerate a malformed line; keep consuming the stream.
                        print(f"Error decoding response: {e}")
                        continue
        except Exception as e:
            # Broad catch is acceptable in a smoke test: report and move on.
            print(f"Error during streaming: {e}")
async def test_complex_config():
    """Test API with complex nested configurations.

    Exercises serialization of nested strategy objects: a
    DefaultMarkdownGenerator wrapping a PruningContentFilter, expressed as
    nested ``{"type": ..., "params": ...}`` dicts for the server to
    deserialize back into crawler strategy instances.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/crawl",
            timeout=30.0, json={
                "urls": ["https://en.wikipedia.org/wiki/Apple"],
                "browser_config": {
                    "headless": True,
                    "verbose": False
                },
                "crawler_config": {
                    "cache_mode": "BYPASS",
                    # Strip page chrome so the content filter only sees article text.
                    "excluded_tags": ["nav", "footer", "aside"],
                    "remove_overlay_elements": True,
                    "markdown_generator": {
                        "type": "DefaultMarkdownGenerator",
                        "params": {
                            "content_filter": {
                                "type": "PruningContentFilter",
                                "params": {
                                    "threshold": 0.48,
                                    "threshold_type": "fixed",
                                    "min_word_threshold": 0
                                }
                            },
                            "options": {"ignore_links": True}
                        }
                    }
                }
            })
        result = response.json()
        # NOTE(review): assumes the response body always carries a top-level
        # 'success' flag and per-result 'markdown_v2' with raw_markdown /
        # fit_markdown keys — confirm against the server's response schema.
        if result['success']:
            for r in result['results']:
                print(f"Full Markdown Length: {len(r['markdown_v2']['raw_markdown'])}")
                print(f"Fit Markdown Length: {len(r['markdown_v2']['fit_markdown'])}")
async def main():
    """Driver for the API smoke tests.

    Only the complex-configuration test is currently active; the regular
    and streaming tests are kept below, disabled, for quick re-enabling.
    """
    print("Testing Crawl4AI API...")
    # print("\n1. Testing regular (non-streaming) endpoint...")
    # await test_regular()
    # print("\n2. Testing streaming endpoint...")
    # await test_streaming()
    print("\n3. Testing complex configuration...")
    await test_complex_config()
# Script entry point: run the async test driver on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -164,7 +164,7 @@ async def main():
"""Run all tests"""
# Test direct API
print("Testing direct API calls...")
# await test_direct_api()
await test_direct_api()
# Test client SDK
print("\nTesting client SDK...")