refactor: Update Dockerfile to install Crawl4AI with specified options

This commit updates the Dockerfile to install Crawl4AI according to the `INSTALL_OPTION` build argument, which determines which additional packages are installed: "all" installs all extras and downloads all models, "torch" installs the torch extras and downloads the torch models, "transformer" installs the transformer extras and downloads the transformer models, and if no option is specified the default installation is used. This change improves the flexibility and customization of the Crawl4AI installation process.
This commit is contained in:
unclecode
2024-08-01 17:56:19 +08:00
parent 40477493d3
commit b6713870ef
4 changed files with 89 additions and 18 deletions

View File

@@ -4,6 +4,9 @@ FROM python:3.10-slim-bookworm
# Set the working directory in the container
WORKDIR /usr/src/app
# Define build arguments
ARG INSTALL_OPTION=default
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -21,8 +24,20 @@ RUN apt-get update && \
# Copy the application code
COPY . .
# Install Crawl4AI using the local setup.py (which will use the default installation)
RUN pip install --no-cache-dir .
# Install Crawl4AI using the local setup.py with the specified option
# and download models only for torch, transformer, or all options
RUN if [ "$INSTALL_OPTION" = "all" ]; then \
pip install --no-cache-dir .[all] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "torch" ]; then \
pip install --no-cache-dir .[torch] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "transformer" ]; then \
pip install --no-cache-dir .[transformer] && \
crawl4ai-download-models; \
else \
pip install --no-cache-dir .; \
fi
# Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
@@ -30,9 +45,6 @@ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key
apt-get update && \
apt-get install -y google-chrome-stable
# Update webdriver_manager to version 4.0.2
RUN pip install --no-cache-dir webdriver_manager==4.0.2
# Set environment to use Chrome properly
ENV CHROME_BIN=/usr/bin/google-chrome \
DISPLAY=:99 \
@@ -40,14 +52,11 @@ ENV CHROME_BIN=/usr/bin/google-chrome \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH
ENV PATH=/opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Download models call cli "crawl4ai-download-models"
# RUN crawl4ai-download-models
# Install mkdocs
RUN pip install mkdocs mkdocs-terminal