Compare commits


1 commit

Author:  unclecode
Commit:  4d43880cde
Message: Playing with different Docker settings to find the best one
Date:    2024-06-18 19:08:46 +08:00
6 changed files with 187 additions and 7 deletions

DockerfileTest (new file, 37 lines)

@@ -0,0 +1,37 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm as builder

# Set the working directory in the container
WORKDIR /usr/src/app

# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    curl \
    unzip

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir spacy

# Copy the rest of the application code
COPY . .

# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
    CHROMEDRIVER=/usr/local/bin/chromedriver \
    DISPLAY=:99 \
    DBUS_SESSION_BUS_ADDRESS=/dev/null \
    PYTHONUNBUFFERED=1

# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /usr/local/bin:$PATH

# Make port 80 available to the world outside this container
EXPOSE 80

# Print "Hello, World!" when the container launches
CMD ["echo", "Hello, World!"]

DockerfileTest2 (new file, 73 lines)

@@ -0,0 +1,73 @@
# First stage: Build and install dependencies
FROM pytorch/pytorch:latest as builder

# Set the working directory in the container
WORKDIR /usr/src/app

# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    git \
    curl \
    unzip \
    gnupg \
    xvfb \
    ca-certificates \
    apt-transport-https \
    software-properties-common && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir spacy onnxruntime && \
    python -m spacy download en_core_web_sm

# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
    apt-get update && \
    apt-get install -y google-chrome-stable && \
    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/

# Second stage: Create the final image
FROM pytorch/pytorch:latest

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy ChromeDriver and Chrome from the builder stage
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome

# Copy installed Python packages from the builder stage
COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
COPY --from=builder /opt/conda/bin /opt/conda/bin

# Copy the rest of the application code
COPY . .

# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
    CHROMEDRIVER=/usr/local/bin/chromedriver \
    DISPLAY=:99 \
    DBUS_SESSION_BUS_ADDRESS=/dev/null \
    PYTHONUNBUFFERED=1

# Install the package itself with all extras
RUN pip install --no-cache-dir -e .[all]

# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH

# Make port 80 available to the world outside this container
EXPOSE 80

# Download models via the "crawl4ai-download-models" CLI entry point
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

DockerfileTest3 (new file, 61 lines)

@@ -0,0 +1,61 @@
# Single stage: build and install everything in one image
FROM pytorch/pytorch:latest

# Set the working directory in the container
WORKDIR /usr/src/app

# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    git \
    curl \
    unzip \
    gnupg \
    xvfb \
    ca-certificates \
    apt-transport-https \
    software-properties-common && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir spacy onnxruntime && \
    python -m spacy download en_core_web_sm

# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
    apt-get update && \
    apt-get install -y google-chrome-stable && \
    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/

# Copy the rest of the application code
COPY . .

# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
    CHROMEDRIVER=/usr/local/bin/chromedriver \
    DISPLAY=:99 \
    DBUS_SESSION_BUS_ADDRESS=/dev/null \
    PYTHONUNBUFFERED=1

# Install the package itself with all extras
RUN pip install --no-cache-dir -e .[all]

# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH

# Make port 80 available to the world outside this container
EXPOSE 80

# Download models via the "crawl4ai-download-models" CLI entry point
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

README.md

@@ -188,7 +188,7 @@ pip install -e .[all]
 # docker build --platform linux/amd64 -t crawl4ai .
 # For other users
 # docker build -t crawl4ai .
-docker run -d -p 8000:80 crawl4ai
+docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
 ```
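
The only change here is giving the container a fixed name. With a stable name, the running instance can be addressed directly, for example:

```bash
docker logs -f crawl4ai_container_1                        # follow the uvicorn logs
docker stop crawl4ai_container_1 && docker rm crawl4ai_container_1
```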

crawl4ai/model_loader.py

@@ -53,7 +53,6 @@ def set_model_device(model):
     model.to(device)
     return model, device
 
-@lru_cache()
 def get_home_folder():
     home_folder = os.path.join(Path.home(), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
@@ -202,7 +201,7 @@ def load_spacy_model():
     repo_folder = os.path.join(home_folder, "crawl4ai")
     model_folder = os.path.join(home_folder, name)
 
-    # print("[LOG] ⏬ Downloading Spacy model for the first time...")
+    print("[LOG] ⏬ Downloading Spacy model for the first time...")
 
     # Remove existing repo folder if it exists
     if Path(repo_folder).exists():
@@ -230,7 +229,7 @@ def load_spacy_model():
             shutil.rmtree(repo_folder)
 
         # Print completion message
-        # print("[LOG] ✅ Spacy Model downloaded successfully")
+        print("[LOG] ✅ Spacy Model downloaded successfully")
 
     except subprocess.CalledProcessError as e:
         print(f"An error occurred while cloning the repository: {e}")
@@ -255,8 +254,8 @@ def download_all_models(remove_existing=False):
     # Load each model to trigger download
     # print("[LOG] Downloading BERT Base Uncased...")
     # load_bert_base_uncased()
-    # print("[LOG] Downloading BGE Small EN v1.5...")
-    # load_bge_small_en_v1_5()
+    print("[LOG] Downloading BGE Small EN v1.5...")
+    load_bge_small_en_v1_5()
     # print("[LOG] Downloading ONNX model...")
     # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")

setup.py

@@ -1,8 +1,18 @@
 from setuptools import setup, find_packages
-import os
+import os, sys
+from pathlib import Path
 import subprocess
 from setuptools.command.install import install
 
+def get_home_folder():
+    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    os.makedirs(home_folder, exist_ok=True)
+    os.makedirs(f"{home_folder}/cache", exist_ok=True)
+    os.makedirs(f"{home_folder}/models", exist_ok=True)
+    return home_folder
+
+home_folder = get_home_folder()
+
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
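
Because setup.py now calls get_home_folder() at module import time, ~/.crawl4ai and its cache/ and models/ subfolders are created as a side effect of installation. A hedged way to confirm, assuming a local editable install:

```bash
pip install -e ".[all]"
ls ~/.crawl4ai        # expect cache/ and models/ to exist
```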