From b6713870efedc0201a3ab42a9dd603c17838dea6 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 1 Aug 2024 17:56:19 +0800 Subject: [PATCH] refactor: Update Dockerfile to install Crawl4AI with specified options This commit updates the Dockerfile to install Crawl4AI with the specified options. The `INSTALL_OPTION` build argument is used to determine which additional packages to install. If the option is set to "all", all models will be downloaded. If the option is set to "torch", only torch models will be downloaded. If the option is set to "transformer", only transformer models will be downloaded. If no option is specified, the default installation will be used. This change improves the flexibility and customization of the Crawl4AI installation process. --- Dockerfile | 27 +++++++++----- crawl4ai/crawler_strategy.py | 6 ++-- docs/md/installation.md | 69 +++++++++++++++++++++++++++++++++--- requirements.txt | 5 +-- 4 files changed, 89 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index e49e2f6f..3f74a26a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,9 @@ FROM python:3.10-slim-bookworm # Set the working directory in the container WORKDIR /usr/src/app +# Define build arguments +ARG INSTALL_OPTION=default + # Install build dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -21,8 +24,20 @@ RUN apt-get update && \ # Copy the application code COPY . . -# Install Crawl4AI using the local setup.py (which will use the default installation) -RUN pip install --no-cache-dir . 
+# Install Crawl4AI using the local setup.py with the specified option +# and download models only for torch, transformer, or all options +RUN if [ "$INSTALL_OPTION" = "all" ]; then \ + pip install --no-cache-dir .[all] && \ + crawl4ai-download-models; \ + elif [ "$INSTALL_OPTION" = "torch" ]; then \ + pip install --no-cache-dir .[torch] && \ + crawl4ai-download-models; \ + elif [ "$INSTALL_OPTION" = "transformer" ]; then \ + pip install --no-cache-dir .[transformer] && \ + crawl4ai-download-models; \ + else \ + pip install --no-cache-dir .; \ + fi # Install Google Chrome RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ @@ -30,9 +45,6 @@ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key apt-get update && \ apt-get install -y google-chrome-stable -# Update webdriver_manager to version 4.0.2 -RUN pip install --no-cache-dir webdriver_manager==4.0.2 - # Set environment to use Chrome properly ENV CHROME_BIN=/usr/bin/google-chrome \ DISPLAY=:99 \ @@ -40,14 +52,11 @@ ENV CHROME_BIN=/usr/bin/google-chrome \ PYTHONUNBUFFERED=1 # Ensure the PATH environment variable includes the location of the installed packages -ENV PATH /opt/conda/bin:$PATH +ENV PATH=/opt/conda/bin:$PATH # Make port 80 available to the world outside this container EXPOSE 80 -# Download models call cli "crawl4ai-download-models" -# RUN crawl4ai-download-models - # Install mkdocs RUN pip install mkdocs mkdocs-terminal diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 6d4f0a0e..fb7980d3 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -6,9 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException, WebDriverException -from selenium.webdriver.chrome.service import Service as ChromeService 
-from webdriver_manager.chrome import ChromeDriverManager -from urllib3.exceptions import MaxRetryError +# from selenium.webdriver.chrome.service import Service as ChromeService +# from webdriver_manager.chrome import ChromeDriverManager +# from urllib3.exceptions import MaxRetryError from .config import * import logging, time diff --git a/docs/md/installation.md b/docs/md/installation.md index 1f26ed55..f4688d5f 100644 --- a/docs/md/installation.md +++ b/docs/md/installation.md @@ -59,16 +59,77 @@ crawl4ai-download-models ## Using Docker for Local Server -To run Crawl4AI as a local server using Docker: +Crawl4AI can be run as a local server using Docker. The Dockerfile supports different installation options to cater to various use cases. Here's how you can build and run the Docker image: + +### Default Installation + +The default installation includes the basic Crawl4AI package without additional dependencies or pre-downloaded models. ```bash -# For Mac users -# docker build --platform linux/amd64 -t crawl4ai . +# For Mac users (M1/M2) +docker build --platform linux/amd64 -t crawl4ai . + # For other users -# docker build -t crawl4ai . +docker build -t crawl4ai . + +# Run the container docker run -d -p 8000:80 crawl4ai ``` +### Full Installation (All Dependencies and Models) + +This option installs all dependencies and downloads the models. + +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=all -t crawl4ai:all . + +# For other users +docker build --build-arg INSTALL_OPTION=all -t crawl4ai:all . + +# Run the container +docker run -d -p 8000:80 crawl4ai:all +``` + +### Torch Installation + +This option installs torch-related dependencies and downloads the models. + +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=torch -t crawl4ai:torch . + +# For other users +docker build --build-arg INSTALL_OPTION=torch -t crawl4ai:torch . 
+ +# Run the container +docker run -d -p 8000:80 crawl4ai:torch +``` + +### Transformer Installation + +This option installs transformer-related dependencies and downloads the models. + +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=transformer -t crawl4ai:transformer . + +# For other users +docker build --build-arg INSTALL_OPTION=transformer -t crawl4ai:transformer . + +# Run the container +docker run -d -p 8000:80 crawl4ai:transformer +``` + +### Notes + +- The `--platform linux/amd64` flag builds an x86-64 image; it is necessary on Apple Silicon Macs (M1/M2 and later) to ensure compatibility. +- The `-t` flag tags the image with a name (and optionally a tag in the 'name:tag' format). +- The `-d` flag runs the container in detached mode. +- The `-p 8000:80` flag maps port 8000 on the host to port 80 in the container. + +Choose the installation option that best suits your needs. The default installation is suitable for basic usage, while the other options provide additional capabilities for more advanced use cases. + ## Using Google Colab diff --git a/requirements.txt b/requirements.txt index 359f0b7b..2574cf60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,9 +15,10 @@ scikit-learn==1.5.0 selenium==4.23.1 uvicorn==0.30.1 transformers==4.41.2 -chromedriver-autoinstaller==0.6.4 +# webdriver-manager==4.0.1 +# chromedriver-autoinstaller==0.6.4 torch==2.3.1 onnxruntime==1.18.0 tokenizers==0.19.1 pillow==10.3.0 -webdriver-manager==4.0.1 \ No newline at end of file +slowapi==0.1.9 \ No newline at end of file