diff --git a/Dockerfile b/Dockerfile index e49e2f6f..3f74a26a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,9 @@ FROM python:3.10-slim-bookworm # Set the working directory in the container WORKDIR /usr/src/app +# Define build arguments +ARG INSTALL_OPTION=default + # Install build dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -21,8 +24,20 @@ RUN apt-get update && \ # Copy the application code COPY . . -# Install Crawl4AI using the local setup.py (which will use the default installation) -RUN pip install --no-cache-dir . +# Install Crawl4AI using the local setup.py with the specified option +# and download models only for torch, transformer, or all options +RUN if [ "$INSTALL_OPTION" = "all" ]; then \ + pip install --no-cache-dir ".[all]" && \ + crawl4ai-download-models; \ + elif [ "$INSTALL_OPTION" = "torch" ]; then \ + pip install --no-cache-dir ".[torch]" && \ + crawl4ai-download-models; \ + elif [ "$INSTALL_OPTION" = "transformer" ]; then \ + pip install --no-cache-dir ".[transformer]" && \ + crawl4ai-download-models; \ + else \ + pip install --no-cache-dir .; \ + fi # Install Google Chrome RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ @@ -30,9 +45,6 @@ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key apt-get update && \ apt-get install -y google-chrome-stable -# Update webdriver_manager to version 4.0.2 -RUN pip install --no-cache-dir webdriver_manager==4.0.2 - # Set environment to use Chrome properly ENV CHROME_BIN=/usr/bin/google-chrome \ DISPLAY=:99 \ @@ -40,14 +52,11 @@ ENV CHROME_BIN=/usr/bin/google-chrome \ PYTHONUNBUFFERED=1 # Ensure the PATH environment variable includes the location of the installed packages -ENV PATH /opt/conda/bin:$PATH +ENV PATH=/opt/conda/bin:$PATH # Make port 80 available to the world outside this container EXPOSE 80 -# Download models call cli "crawl4ai-download-models" -# RUN crawl4ai-download-models - # Install 
mkdocs RUN pip install mkdocs mkdocs-terminal diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 6d4f0a0e..fb7980d3 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -6,9 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException, WebDriverException -from selenium.webdriver.chrome.service import Service as ChromeService -from webdriver_manager.chrome import ChromeDriverManager -from urllib3.exceptions import MaxRetryError +# from selenium.webdriver.chrome.service import Service as ChromeService +# from webdriver_manager.chrome import ChromeDriverManager +# from urllib3.exceptions import MaxRetryError from .config import * import logging, time diff --git a/docs/md/installation.md b/docs/md/installation.md index 1f26ed55..f4688d5f 100644 --- a/docs/md/installation.md +++ b/docs/md/installation.md @@ -59,16 +59,77 @@ crawl4ai-download-models ## Using Docker for Local Server -To run Crawl4AI as a local server using Docker: +Crawl4AI can be run as a local server using Docker. The Dockerfile supports different installation options to cater to various use cases. Here's how you can build and run the Docker image: + +### Default Installation + +The default installation includes the basic Crawl4AI package without additional dependencies or pre-downloaded models. ```bash -# For Mac users -# docker build --platform linux/amd64 -t crawl4ai . +# For Mac users (M1/M2) +docker build --platform linux/amd64 -t crawl4ai . + # For other users -# docker build -t crawl4ai . +docker build -t crawl4ai . + +# Run the container docker run -d -p 8000:80 crawl4ai ``` +### Full Installation (All Dependencies and Models) + +This option installs all dependencies and downloads the models. 
+ +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=all -t crawl4ai:all . + +# For other users +docker build --build-arg INSTALL_OPTION=all -t crawl4ai:all . + +# Run the container +docker run -d -p 8000:80 crawl4ai:all +``` + +### Torch Installation + +This option installs torch-related dependencies and downloads the models. + +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=torch -t crawl4ai:torch . + +# For other users +docker build --build-arg INSTALL_OPTION=torch -t crawl4ai:torch . + +# Run the container +docker run -d -p 8000:80 crawl4ai:torch +``` + +### Transformer Installation + +This option installs transformer-related dependencies and downloads the models. + +```bash +# For Mac users (M1/M2) +docker build --platform linux/amd64 --build-arg INSTALL_OPTION=transformer -t crawl4ai:transformer . + +# For other users +docker build --build-arg INSTALL_OPTION=transformer -t crawl4ai:transformer . + +# Run the container +docker run -d -p 8000:80 crawl4ai:transformer +``` + +### Notes + +- The `--platform linux/amd64` flag is necessary for Mac users with M1/M2 chips to ensure compatibility. +- The `-t` flag tags the image with a name (and optionally a tag in the 'name:tag' format). +- The `-d` flag runs the container in detached mode. +- The `-p 8000:80` flag maps port 8000 on the host to port 80 in the container. + +Choose the installation option that best suits your needs. The default installation is suitable for basic usage, while the other options provide additional capabilities for more advanced use cases. 
+ ## Using Google Colab diff --git a/requirements.txt b/requirements.txt index 359f0b7b..2574cf60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,9 +15,10 @@ scikit-learn==1.5.0 selenium==4.23.1 uvicorn==0.30.1 transformers==4.41.2 -chromedriver-autoinstaller==0.6.4 +# webdriver-manager==4.0.1 +# chromedriver-autoinstaller==0.6.4 torch==2.3.1 onnxruntime==1.18.0 tokenizers==0.19.1 pillow==10.3.0 -webdriver-manager==4.0.1 \ No newline at end of file +slowapi==0.1.9 \ No newline at end of file