Files
crawl4ai/Dockerfile
Nasrin a87e8c1c9e Release/v0.7.8 (#1662)
* Fix: Use correct URL variable for raw HTML extraction (#1116)

- Prevents full HTML content from being passed as URL to extraction strategies
- Added unit tests to verify raw HTML and regular URL processing

Fix: Wrong URL variable used for extraction of raw html

* Fix #1181: Preserve whitespace in code blocks during HTML scraping

  The remove_empty_elements_fast() method was removing whitespace-only
  span elements inside <pre> and <code> tags, causing import statements
  like "import torch" to become "importtorch". Now skips elements inside
  code blocks where whitespace is significant.

* Refactor Pydantic model configuration to use ConfigDict for arbitrary types

* Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621

* Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638

* fix: ensure BrowserConfig.to_dict serializes proxy_config

* feat: make LLM backoff configurable end-to-end

- extend LLMConfig with backoff delay/attempt/factor fields and thread them
  through LLMExtractionStrategy, LLMContentFilter, table extraction, and
  Docker API handlers
- expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff
  and document them in the md_v2 guides

* reproduced AttributeError from #1642

* pass timeout parameter to docker client request

* added missing deep crawling objects to init

* generalized query in ContentRelevanceFilter to be a str or list

* import modules from enhanceable deserialization

* parameterized tests

* Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268

* refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

* announcement: add application form for cloud API closed beta

* Release v0.7.8: Stability & Bug Fix Release

- Updated version to 0.7.8
- Introduced focused stability release addressing 11 community-reported bugs.
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates.
- Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended.
- Updated documentation to reflect recent changes and improvements.

* docs: add section for Crawl4AI Cloud API closed beta with application link

* fix: add disk cleanup step to Docker workflow

---------

Co-authored-by: rbushria <rbushri@gmail.com>
Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com>
Co-authored-by: Soham Kukreti <kukretisoham@gmail.com>
Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com>
Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
2025-12-11 11:04:52 +01:00

205 lines
6.0 KiB
Docker

FROM python:3.12-slim-bookworm AS build
# C4ai version
ARG C4AI_VER=0.7.8
ENV C4AI_VERSION=$C4AI_VER
LABEL c4ai.version=$C4AI_VER
# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
ARG GITHUB_BRANCH=main
ARG USE_LOCAL=true
ENV PYTHONFAULTHANDLER=1 \
PYTHONHASHSEED=random \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_DEFAULT_TIMEOUT=100 \
DEBIAN_FRONTEND=noninteractive \
REDIS_HOST=localhost \
REDIS_PORT=6379
ARG PYTHON_VERSION=3.12
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
ARG TARGETARCH
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
wget \
gnupg \
git \
cmake \
pkg-config \
python3-dev \
libjpeg-dev \
redis-server \
supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \
libglib2.0-0 \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libdbus-1-3 \
libxcb1 \
libxkbcommon0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get dist-upgrade -y \
&& rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \
else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\
if [ "$USE_LOCAL" = "true" ]; then\n\
echo "📦 Installing from local source..."\n\
pip install --no-cache-dir /tmp/project/\n\
else\n\
echo "🌐 Installing from GitHub..."\n\
for i in {1..3}; do \n\
git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
{ echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
done\n\
pip install --no-cache-dir /tmp/crawl4ai\n\
fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
scikit-learn \
nltk \
transformers \
tokenizers && \
python -m nltk.downloader punkt stopwords ; \
fi
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install "/tmp/project/[all]" && \
python -m crawl4ai.model_loader ; \
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
pip install "/tmp/project/[torch]" ; \
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
pip install "/tmp/project/[transformer]" && \
python -m crawl4ai.model_loader ; \
else \
pip install "/tmp/project" ; \
fi
RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN crawl4ai-setup
RUN playwright install --with-deps
RUN mkdir -p /home/appuser/.cache/ms-playwright \
&& cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
&& chown -R appuser:appuser /home/appuser/.cache/ms-playwright
RUN crawl4ai-doctor
# Ensure all cache directories belong to appuser
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
RUN mkdir -p /home/appuser/.cache \
&& chown -R appuser:appuser /home/appuser/.cache
# Copy application code
COPY deploy/docker/* ${APP_HOME}/
# copy the playground + any future static assets
COPY deploy/docker/static ${APP_HOME}/static
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
if [ $MEM -lt 2048 ]; then \
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
exit 1; \
fi && \
redis-cli ping > /dev/null && \
curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379
# Switch to the non-root user before starting the application
USER appuser
# Set environment variables to ptoduction
ENV PYTHON_ENV=production
# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]