Add comprehensive Docker deployment configuration with: - New .dockerignore and .llm.env.example files - Enhanced Dockerfile with multi-stage build and optimizations - Detailed README with setup instructions and environment configurations - Improved requirements.txt with Gunicorn - Better error handling in async_configs.py BREAKING CHANGE: Docker deployment now requires .llm.env file for API keys
174 lines
5.4 KiB
Docker
# 🐳 Base image — slim variant keeps the runtime surface (and CVE count) small
FROM python:3.10-slim

# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
ARG GITHUB_BRANCH=main
ARG USE_LOCAL=false
# 🤓 Environment variables - because who doesn't love a good ENV party?
# (runtime-relevant Python/pip settings only; build-only knobs live in ARGs below)
ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100

# Build-time only: keep apt non-interactive during image build without
# polluting the final container's runtime environment (was previously ENV).
ARG DEBIAN_FRONTEND=noninteractive

# Other build arguments
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
# TARGETARCH is populated automatically by BuildKit for multi-arch builds
ARG TARGETARCH
# 🎯 Platform-specific labels - because even containers need ID badges
LABEL maintainer="unclecode" \
      description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" \
      version="1.0"
# 📦 Installing system dependencies... please hold, your package is being delivered
# (update + install in one layer, --no-install-recommends, apt lists purged in
# the same layer so the cache never reaches the image; list kept alphabetized)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    git \
    gnupg \
    libjpeg-dev \
    pkg-config \
    python3-dev \
    wget \
    && rm -rf /var/lib/apt/lists/*
# 🎭 Playwright dependencies - because browsers need their vitamins too
# Shared libraries Chromium links against; cleaned up in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*
# 🎮 GPU support - because sometimes CPU just doesn't cut it
# Only installed when explicitly requested AND building for amd64;
# every other combination falls through to the informational echo.
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
        apt-get update && apt-get install -y --no-install-recommends \
        nvidia-cuda-toolkit \
        && rm -rf /var/lib/apt/lists/* ; \
    else \
        echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
    fi
# 🏗️ Platform-specific optimizations - because one size doesn't fit all
# arm64 gets OpenBLAS, amd64 gets OpenMP; other architectures get nothing extra.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
        echo "🦾 Installing ARM-specific optimizations"; \
        apt-get update && apt-get install -y --no-install-recommends \
        libopenblas-dev \
        && rm -rf /var/lib/apt/lists/*; \
    elif [ "$TARGETARCH" = "amd64" ]; then \
        echo "🖥️ Installing AMD64-specific optimizations"; \
        apt-get update && apt-get install -y --no-install-recommends \
        libomp-dev \
        && rm -rf /var/lib/apt/lists/*; \
    fi
# Absolute application home; WORKDIR creates the directory if missing
WORKDIR ${APP_HOME}
# 🔄 Installation script - now with retry logic because sometimes Git needs a coffee break
# Written with `printf '%s\n'` instead of `echo '...\n...'`: echo's handling of
# backslash escapes differs between shells (dash interprets \n, bash does not),
# so printf makes the generated script independent of which /bin/sh is in use.
# ${GITHUB_BRANCH}/${GITHUB_REPO}/$USE_LOCAL stay literal here (single quotes)
# and are expanded when the script runs, where the build ARGs are in the env.
RUN printf '%s\n' \
    '#!/bin/bash' \
    'if [ "$USE_LOCAL" = "true" ]; then' \
    '    echo "📦 Installing from local source..."' \
    '    pip install --no-cache-dir /tmp/project/' \
    'else' \
    '    echo "🌐 Installing from GitHub..."' \
    '    for i in {1..3}; do' \
    '        git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break ||' \
    '        { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }' \
    '    done' \
    '    pip install --no-cache-dir /tmp/crawl4ai' \
    'fi' \
    > /tmp/install.sh && chmod +x /tmp/install.sh
# Copy and install server requirements FIRST: the requirements layer is cached
# until requirements.txt itself changes, instead of being invalidated by every
# source edit (which `COPY . ...` before the install would cause).
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy local project if USE_LOCAL is true
COPY . /tmp/project/
# Install ML dependencies first for better layer caching
# NOTE(review): versions are unpinned — builds are not reproducible across
# upstream releases; consider pinning if repeatability matters.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install --no-cache-dir \
        torch \
        torchvision \
        torchaudio \
        scikit-learn \
        nltk \
        transformers \
        tokenizers && \
        python -m nltk.downloader punkt stopwords ; \
    fi
# Install the package (extras selected by INSTALL_TYPE; model weights are
# pre-fetched for the variants that need them)
# NOTE(review): /tmp/install.sh (run in the validation step below) may install
# the package again — from GitHub when USE_LOCAL=false — confirm the double
# install is intentional.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install "/tmp/project/[all]" && \
        python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
        pip install "/tmp/project/[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
        pip install "/tmp/project/[transformer]" && \
        python -m crawl4ai.model_loader ; \
    else \
        pip install "/tmp/project" ; \
    fi
# 🚀 Installation validation - trust but verify!
# Runs the generated install script, then import-checks both crawl4ai and
# Playwright so a broken install fails the build instead of the first request.
RUN pip install --no-cache-dir --upgrade pip && \
    /tmp/install.sh && \
    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"

# Download Chromium plus any remaining OS dependencies Playwright needs
RUN playwright install --with-deps chromium
# Copy application files (server entrypoint, configs) into the app home
COPY deploy/docker/* ${APP_HOME}/
# 🏥 Health check - now with memory validation!
# MEM is the "Mem:" total column (MiB) from `free`. The variable is quoted
# with a :-0 default so an empty/failed `free` reports unhealthy instead of
# tripping a bash "unary operator expected" error inside the check.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD bash -c '\
    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
    if [ "${MEM:-0}" -lt 2048 ]; then \
        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
        exit 1; \
    fi && \
    curl -f http://localhost:8000/health || exit 1'
# Entrypoint script (exec form; found via PATH in /usr/local/bin)
COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["docker-entrypoint.sh"]
# 📡 Document the service port (EXPOSE is informational only; publish with -p)
EXPOSE 8000

# Default command - may the server be with you! 🚀
# Exec-form CMD so gunicorn is PID 1 and receives SIGTERM from `docker stop`.
CMD ["gunicorn", \
    "--bind", "0.0.0.0:8000", \
    "--workers", "4", \
    "--threads", "2", \
    "--timeout", "120", \
    "--graceful-timeout", "30", \
    "--log-level", "info", \
    "--worker-class", "uvicorn.workers.UvicornWorker", \
    "server:app"]