# Crawl4AI image, part 1: base image, build configuration, OS packages,
# the non-root user, the install helper script and the build-context copies.
FROM python:3.12-slim-bookworm AS build

# C4ai version
ARG C4AI_VER=0.7.8
ENV C4AI_VERSION=$C4AI_VER
LABEL c4ai.version=$C4AI_VER

# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
ARG GITHUB_BRANCH=main
ARG USE_LOCAL=true

# Build-time only: keeps apt/debconf non-interactive for every RUN in this
# stage without leaking the setting into the runtime environment (an ARG is
# visible to later RUN steps but is not persisted in the image's env).
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment for Python, pip and the Redis connection settings.
ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100 \
    REDIS_HOST=localhost \
    REDIS_PORT=6379

ARG PYTHON_VERSION=3.12
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
# Target CPU architecture (e.g. amd64 / arm64); used below to pick
# platform-specific packages.
ARG TARGETARCH

LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"

# Build toolchain and general utilities, plus redis-server and supervisor.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        wget \
        gnupg \
        git \
        cmake \
        pkg-config \
        python3-dev \
        libjpeg-dev \
        redis-server \
        supervisor \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Shared libraries (GLib, NSS, ATK, X11, GBM, Pango, Cairo, ALSA, ...);
# presumably the runtime dependencies of the headless browser — confirm
# against the Playwright dependency list.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libglib2.0-0 \
        libnss3 \
        libnspr4 \
        libatk1.0-0 \
        libatk-bridge2.0-0 \
        libcups2 \
        libdrm2 \
        libdbus-1-3 \
        libxcb1 \
        libxkbcommon0 \
        libx11-6 \
        libxcomposite1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxrandr2 \
        libgbm1 \
        libpango-1.0-0 \
        libcairo2 \
        libasound2 \
        libatspi2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Bring already-installed packages up to the latest available versions.
RUN apt-get update && apt-get dist-upgrade -y \
    && rm -rf /var/lib/apt/lists/*

# Optional CUDA toolkit: only when ENABLE_GPU=true on an amd64 target.
# NOTE(review): nvidia-cuda-toolkit is presumably shipped in Debian's
# non-free component — confirm the base image's apt sources provide it.
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
        apt-get update && apt-get install -y --no-install-recommends \
            nvidia-cuda-toolkit \
        && apt-get clean \
        && rm -rf /var/lib/apt/lists/* ; \
    else \
        echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
    fi

# Architecture-specific numeric libraries: OpenBLAS on arm64, OpenMP on amd64.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
        echo "🦾 Installing ARM-specific optimizations"; \
        apt-get update && apt-get install -y --no-install-recommends \
            libopenblas-dev \
        && apt-get clean \
        && rm -rf /var/lib/apt/lists/*; \
    elif [ "$TARGETARCH" = "amd64" ]; then \
        echo "🖥️ Installing AMD64-specific optimizations"; \
        apt-get update && apt-get install -y --no-install-recommends \
            libomp-dev \
        && apt-get clean \
        && rm -rf /var/lib/apt/lists/*; \
    else \
        echo "Skipping platform-specific optimizations (unsupported platform)"; \
    fi

# Create a non-root system user and group, then create its home directory
# (useradd -r does not create one) and hand ownership to that user.
RUN groupadd -r appuser \
    && useradd --no-log-init -r -g appuser appuser \
    && mkdir -p /home/appuser \
    && chown -R appuser:appuser /home/appuser

WORKDIR ${APP_HOME}

# Generate /tmp/install.sh, which installs crawl4ai either from the local
# copy in /tmp/project (USE_LOCAL=true) or from a GitHub clone (up to three
# attempts, 5 seconds apart). printf '%s\n' writes one script line per
# argument, so the result does not depend on how the shell's echo treats
# backslash escapes. The single quotes keep $USE_LOCAL, ${GITHUB_BRANCH},
# ${GITHUB_REPO} and $i unexpanded here; they are resolved when the script
# runs inside a later RUN step, where the build ARGs are in the environment.
RUN printf '%s\n' \
        '#!/bin/bash' \
        'if [ "$USE_LOCAL" = "true" ]; then' \
        '    echo "📦 Installing from local source..."' \
        '    pip install --no-cache-dir /tmp/project/' \
        'else' \
        '    echo "🌐 Installing from GitHub..."' \
        '    for i in {1..3}; do' \
        '        git clone --branch "${GITHUB_BRANCH}" "${GITHUB_REPO}" /tmp/crawl4ai && break ||' \
        '            { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }' \
        '    done' \
        '    pip install --no-cache-dir /tmp/crawl4ai' \
        'fi' \
        > /tmp/install.sh \
    && chmod +x /tmp/install.sh

# Full build context, used for the local-source install.
# NOTE(review): because the whole context is copied before the requirements
# are installed, any source change invalidates the dependency layers that
# follow — consider copying it after the requirements install.
COPY . /tmp/project/

# Supervisor config and Python requirements go into the working directory
# (still owned by root at this point).
COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt .
# Crawl4AI image, part 2: Python dependencies, browser setup, application
# files, ownership, health check and runtime configuration.

# Install the server requirements from requirements.txt in the working directory.
RUN pip install --no-cache-dir -r requirements.txt

# INSTALL_TYPE=all: pre-install the ML stack and download the NLTK
# punkt and stopwords data.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install --no-cache-dir \
            torch \
            torchvision \
            torchaudio \
            scikit-learn \
            nltk \
            transformers \
            tokenizers && \
        python -m nltk.downloader punkt stopwords ; \
    fi

# Install the project from /tmp/project with the extras selected by
# INSTALL_TYPE (all / torch / transformer, otherwise no extras). The "all"
# and "transformer" variants also run crawl4ai.model_loader.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install --no-cache-dir "/tmp/project/[all]" && \
        python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
        pip install --no-cache-dir "/tmp/project/[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
        pip install --no-cache-dir "/tmp/project/[transformer]" && \
        python -m crawl4ai.model_loader ; \
    else \
        pip install --no-cache-dir "/tmp/project" ; \
    fi

# Upgrade pip, run the generated install script, then smoke-test that both
# crawl4ai and Playwright's sync API can be imported.
RUN pip install --no-cache-dir --upgrade pip && \
    /tmp/install.sh && \
    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"

RUN crawl4ai-setup

# Download the Playwright browsers together with their OS dependencies.
RUN playwright install --with-deps

# The browsers were downloaded into root's cache; copy the Chromium build
# into appuser's cache so it is usable after the switch to the non-root user.
RUN mkdir -p /home/appuser/.cache/ms-playwright \
    && cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
    && chown -R appuser:appuser /home/appuser/.cache/ms-playwright

RUN crawl4ai-doctor

# Ensure all cache directories belong to appuser.
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs.
RUN mkdir -p /home/appuser/.cache \
    && chown -R appuser:appuser /home/appuser/.cache

# Copy application code
COPY deploy/docker/* ${APP_HOME}/

# Copy the playground and any future static assets
COPY deploy/docker/static ${APP_HOME}/static

# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}

# Give appuser ownership of the Redis persistence and log directories, if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis

# Health check: report unhealthy when total memory is below 2048 MB, when
# Redis does not answer a ping, or when the HTTP health endpoint on port
# 11235 fails.
# NOTE(review): the memory test relies on `free` being present in the image;
# if it is missing, MEM is empty and the memory test is effectively skipped —
# confirm procps is installed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD bash -c '\
    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
    if [ $MEM -lt 2048 ]; then \
        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
        exit 1; \
    fi && \
    redis-cli ping > /dev/null && \
    curl -f http://localhost:11235/health || exit 1'

# Redis port.
EXPOSE 6379
# HTTP port of the application; this is the port the health check probes.
EXPOSE 11235

# Switch to the non-root user before starting the application
USER appuser

# Set environment variables to production
ENV PYTHON_ENV=production

# Start the application using supervisord. The config path is relative, so
# it is resolved against the image's working directory.
CMD ["supervisord", "-c", "supervisord.conf"]