refactor(docker): improve server architecture and configuration

Complete overhaul of the Docker deployment setup with an improved architecture:

- Add Redis integration for task management
- Implement rate limiting and security middleware
- Add Prometheus metrics and health checks
- Improve error handling and logging
- Add support for streaming responses
- Implement proper configuration management
- Add platform-specific optimizations for ARM64/AMD64

BREAKING CHANGE: Docker deployment now requires Redis and the new config.yml structure
2025-02-02 20:19:51 +08:00
parent 7b1ef07c41
commit 33a21d6a7a
16 changed files with 1918 additions and 344 deletions
--- a/148
+++ b/148
@@ -1,32 +1,31 @@
-# syntax=docker/dockerfile:1.4
+FROM python:3.10-slim

-ARG TARGETPLATFORM
-ARG BUILDPLATFORM
+# Set build arguments
+ARG APP_HOME=/app
+ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
+ARG GITHUB_BRANCH=main
+ARG USE_LOCAL=true

-# Other build arguments
-ARG PYTHON_VERSION=3.10
-
-# Base stage with system dependencies
-FROM python:${PYTHON_VERSION}-slim as base
-
-# Declare ARG variables again within the build stage
-ARG INSTALL_TYPE=all
-ARG ENABLE_GPU=false
-
-# Platform-specific labels
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-# Environment setup
-ENV PYTHONUNBUFFERED=1 \
-    PYTHONDONTWRITEBYTECODE=1 \
+ENV PYTHONFAULTHANDLER=1 \
+    PYTHONHASHSEED=random \
+    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100 \
-    DEBIAN_FRONTEND=noninteractive
+    DEBIAN_FRONTEND=noninteractive \
+    REDIS_HOST=localhost \
+    REDIS_PORT=6379
+
+ARG PYTHON_VERSION=3.10
+ARG INSTALL_TYPE=default
+ARG ENABLE_GPU=false
+ARG TARGETARCH
+
+LABEL maintainer="unclecode" \
+      description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" \
+      version="1.0"

-# Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
@@ -37,10 +36,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    pkg-config \
    python3-dev \
    libjpeg-dev \
-    libpng-dev \
+    redis-server \
    && rm -rf /var/lib/apt/lists/*

-# Playwright system dependencies for Linux
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
@@ -65,8 +63,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*

-# GPU support if enabled and architecture is supported
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
    apt-get update && apt-get install -y --no-install-recommends \
    nvidia-cuda-toolkit \
    && rm -rf /var/lib/apt/lists/* ; \
@@ -74,19 +71,40 @@ else \
    echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
 fi

-# Create and set working directory
-WORKDIR /app
+RUN case "$TARGETARCH" in \
+    arm64) \
+        echo "🦾 Installing ARM-specific optimizations"; \
+        apt-get update && apt-get install -y --no-install-recommends \
+            libopenblas-dev \
+        && rm -rf /var/lib/apt/lists/* ;; \
+    amd64) \
+        echo "🖥️ Installing AMD64-specific optimizations"; \
+        apt-get update && apt-get install -y --no-install-recommends \
+            libomp-dev \
+        && rm -rf /var/lib/apt/lists/* ;; \
+    *) echo "Skipping platform-specific optimizations (unsupported platform)" ;; \
+esac

-# Copy the entire project
-COPY . .
+WORKDIR ${APP_HOME}

-# Install base requirements
+RUN echo '#!/bin/bash\n\
+if [ "$USE_LOCAL" = "true" ]; then\n\
+    echo "📦 Installing from local source..."\n\
+    pip install --no-cache-dir /tmp/project/\n\
+else\n\
+    echo "🌐 Installing from GitHub..."\n\
+    for i in {1..3}; do \n\
+        git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
+        { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
+    done\n\
+    pip install --no-cache-dir /tmp/crawl4ai\n\
+fi' > /tmp/install.sh && chmod +x /tmp/install.sh
+
+COPY deploy/docker/requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . /tmp/project/

-# Install required library for FastAPI
-RUN pip install fastapi uvicorn psutil
-
-# Install ML dependencies first for better layer caching
 RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install --no-cache-dir \
            torch \
@@ -99,38 +117,50 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        python -m nltk.downloader punkt stopwords ; \
    fi

-# Install the package
 RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-        pip install ".[all]" && \
+        pip install "/tmp/project/[all]" && \
        python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
-        pip install ".[torch]" ; \
+        pip install "/tmp/project/[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
-        pip install ".[transformer]" && \
+        pip install "/tmp/project/[transformer]" && \
        python -m crawl4ai.model_loader ; \
    else \
-        pip install "." ; \
+        pip install "/tmp/project" ; \
    fi
+    
+RUN pip install --no-cache-dir --upgrade pip && \
+    /tmp/install.sh && \
+    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
+    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
+    
+RUN playwright install --with-deps chromium

-    # Install MkDocs and required plugins
-RUN pip install --no-cache-dir \
-    mkdocs \
-    mkdocs-material \
-    mkdocs-terminal \
-    pymdown-extensions
+COPY deploy/docker/* ${APP_HOME}/

-# Build MkDocs documentation
-RUN mkdocs build
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD bash -c '\
+    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
+    if [ "${MEM:-2048}" -lt 2048 ]; then \
+        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
+        exit 1; \
+    fi && \
+    redis-cli ping > /dev/null && \
+    curl -f http://localhost:8000/health || exit 1'

-# Install Playwright and browsers
-RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
-    playwright install chromium; \
-    elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-    playwright install chromium; \
-    fi
+COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh

-# Expose port
-EXPOSE 8000 11235 9222 8080
+EXPOSE 8000 6379

-# Start the FastAPI server
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
+ENTRYPOINT ["docker-entrypoint.sh"]
+
+CMD service redis-server start && exec gunicorn \
+    --bind 0.0.0.0:8000 \
+    --workers 4 \
+    --threads 2 \
+    --timeout 120 \
+    --graceful-timeout 30 \
+    --log-level info \
+    --worker-class uvicorn.workers.UvicornWorker \
+    server:app