From 4d43880cde128a30c5fb6d1e9058ec093e386836 Mon Sep 17 00:00:00 2001 From: unclecode Date: Tue, 18 Jun 2024 19:08:46 +0800 Subject: [PATCH] Playing with different Docker settings to find the best one --- DockerfileTest | 37 ++++++++++++++++++++ DockerfileTest2 | 73 ++++++++++++++++++++++++++++++++++++++++ DockerfileTest3 | 61 +++++++++++++++++++++++++++++++++ README.md | 2 +- crawl4ai/model_loader.py | 9 +++-- setup.py | 12 ++++++- 6 files changed, 187 insertions(+), 7 deletions(-) create mode 100644 DockerfileTest create mode 100644 DockerfileTest2 create mode 100644 DockerfileTest3 diff --git a/DockerfileTest b/DockerfileTest new file mode 100644 index 00000000..e9986beb --- /dev/null +++ b/DockerfileTest @@ -0,0 +1,37 @@ + +# First stage: Build and install dependencies +FROM python:3.10-slim-bookworm as builder + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + curl \ + unzip + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt && \ + pip install --no-cache-dir spacy + +# Copy the rest of the application code +COPY . . 
+ +# Set environment to use Chrome and ChromeDriver properly +ENV CHROME_BIN=/usr/bin/google-chrome \ + CHROMEDRIVER=/usr/local/bin/chromedriver \ + DISPLAY=:99 \ + DBUS_SESSION_BUS_ADDRESS=/dev/null \ + PYTHONUNBUFFERED=1 + +# Ensure the PATH environment variable includes the location of the installed packages +ENV PATH /usr/local/bin:$PATH + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Print helloworld when the container launches +CMD ["echo", "Hello, World!"] \ No newline at end of file diff --git a/DockerfileTest2 b/DockerfileTest2 new file mode 100644 index 00000000..35a06a9d --- /dev/null +++ b/DockerfileTest2 @@ -0,0 +1,73 @@ +# First stage: Build and install dependencies +FROM pytorch/pytorch:latest as builder + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + git \ + curl \ + unzip \ + gnupg \ + xvfb \ + ca-certificates \ + apt-transport-https \ + software-properties-common && \ + rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt && \ + pip install --no-cache-dir spacy onnxruntime && \ + python -m spacy download en_core_web_sm + +# Install Google Chrome and ChromeDriver +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \ + unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ + +# Second stage: Create the final image +FROM pytorch/pytorch:latest + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Copy Chromedriver and Chrome from the builder stage +COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver +COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome + +# Copy installed Python packages from builder stage +COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages +COPY --from=builder /opt/conda/bin /opt/conda/bin + +# Copy the rest of the application code +COPY . . 
+ +# Set environment to use Chrome and ChromeDriver properly +ENV CHROME_BIN=/usr/bin/google-chrome \ + CHROMEDRIVER=/usr/local/bin/chromedriver \ + DISPLAY=:99 \ + DBUS_SESSION_BUS_ADDRESS=/dev/null \ + PYTHONUNBUFFERED=1 + +# pip install -e .[all] +RUN pip install --no-cache-dir -e .[all] + +# Ensure the PATH environment variable includes the location of the installed packages +ENV PATH /opt/conda/bin:$PATH + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Download models by calling the CLI command "crawl4ai-download-models" +RUN crawl4ai-download-models +# RUN python crawl4ai/model_loader.py + +# Run uvicorn +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] diff --git a/DockerfileTest3 b/DockerfileTest3 new file mode 100644 index 00000000..dda6b4c9 --- /dev/null +++ b/DockerfileTest3 @@ -0,0 +1,61 @@ +# Single stage: Build and install dependencies +FROM pytorch/pytorch:latest + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + git \ + curl \ + unzip \ + gnupg \ + xvfb \ + ca-certificates \ + apt-transport-https \ + software-properties-common && \ + rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt && \ + pip install --no-cache-dir spacy onnxruntime && \ + python -m spacy download en_core_web_sm + +# Install Google Chrome and ChromeDriver +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \ + unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ + +# Copy the rest of the application code +COPY . . + +# Set environment to use Chrome and ChromeDriver properly +ENV CHROME_BIN=/usr/bin/google-chrome \ + CHROMEDRIVER=/usr/local/bin/chromedriver \ + DISPLAY=:99 \ + DBUS_SESSION_BUS_ADDRESS=/dev/null \ + PYTHONUNBUFFERED=1 + +# pip install -e .[all] +RUN pip install --no-cache-dir -e .[all] + +# Ensure the PATH environment variable includes the location of the installed packages +ENV PATH /opt/conda/bin:$PATH + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Download models by calling the CLI command "crawl4ai-download-models" +RUN crawl4ai-download-models +# RUN python crawl4ai/model_loader.py + +# Run uvicorn +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] + + diff --git a/README.md b/README.md index ab4cf3f6..5af07211 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ pip install -e .[all] # docker build --platform linux/amd64 -t crawl4ai . # For other users # docker build -t crawl4ai . 
-docker run -d -p 8000:80 crawl4ai +docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai ``` diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7e17f7f9..0b7838c1 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -53,7 +53,6 @@ def set_model_device(model): model.to(device) return model, device -@lru_cache() def get_home_folder(): home_folder = os.path.join(Path.home(), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) @@ -202,7 +201,7 @@ def load_spacy_model(): repo_folder = os.path.join(home_folder, "crawl4ai") model_folder = os.path.join(home_folder, name) - # print("[LOG] ⏬ Downloading Spacy model for the first time...") + print("[LOG] ⏬ Downloading Spacy model for the first time...") # Remove existing repo folder if it exists if Path(repo_folder).exists(): @@ -230,7 +229,7 @@ def load_spacy_model(): shutil.rmtree(repo_folder) # Print completion message - # print("[LOG] ✅ Spacy Model downloaded successfully") + print("[LOG] ✅ Spacy Model downloaded successfully") except subprocess.CalledProcessError as e: print(f"An error occurred while cloning the repository: {e}") except Exception as e: @@ -255,8 +254,8 @@ def download_all_models(remove_existing=False): # Load each model to trigger download # print("[LOG] Downloading BERT Base Uncased...") # load_bert_base_uncased() - # print("[LOG] Downloading BGE Small EN v1.5...") - # load_bge_small_en_v1_5() + print("[LOG] Downloading BGE Small EN v1.5...") + load_bge_small_en_v1_5() # print("[LOG] Downloading ONNX model...") # load_onnx_all_MiniLM_l6_v2() print("[LOG] Downloading text classifier...") diff --git a/setup.py b/setup.py index 168dfac6..9d9f067d 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,18 @@ from setuptools import setup, find_packages -import os +import os, sys +from pathlib import Path import subprocess from setuptools.command.install import install +def get_home_folder(): + home_folder = os.path.join(Path.home(), ".crawl4ai") + 
os.makedirs(home_folder, exist_ok=True) + os.makedirs(f"{home_folder}/cache", exist_ok=True) + os.makedirs(f"{home_folder}/models", exist_ok=True) + return home_folder + +home_folder = get_home_folder() + # Read the requirements from requirements.txt with open("requirements.txt") as f: requirements = f.read().splitlines()