From 4d43880cde128a30c5fb6d1e9058ec093e386836 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Tue, 18 Jun 2024 19:08:46 +0800
Subject: [PATCH] Playing with different Docker settings to find the best one

---
 DockerfileTest           | 37 ++++++++++++++++++++
 DockerfileTest2          | 73 ++++++++++++++++++++++++++++++++++++++++
 DockerfileTest3          | 61 +++++++++++++++++++++++++++++++++
 README.md                |  2 +-
 crawl4ai/model_loader.py |  9 +++--
 setup.py                 | 12 ++++++-
 6 files changed, 187 insertions(+), 7 deletions(-)
 create mode 100644 DockerfileTest
 create mode 100644 DockerfileTest2
 create mode 100644 DockerfileTest3

diff --git a/DockerfileTest b/DockerfileTest
new file mode 100644
index 00000000..e9986beb
--- /dev/null
+++ b/DockerfileTest
@@ -0,0 +1,37 @@
+# Build stage: base image plus system and Python dependencies
+FROM python:3.10-slim-bookworm AS builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install build dependencies; drop the apt lists in the same layer to keep it small
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    curl \
+    unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy
+
+# Copy the rest of the application code
+COPY . .
+
+# Chrome/ChromeDriver locations; NOTE: this test image does not install either binary
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH=/usr/local/bin:$PATH
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Print "Hello, World!" when the container launches
+CMD ["echo", "Hello, World!"]
\ No newline at end of file
diff --git a/DockerfileTest2 b/DockerfileTest2
new file mode 100644
index 00000000..35a06a9d
--- /dev/null
+++ b/DockerfileTest2
@@ -0,0 +1,73 @@
+# First stage: build and install dependencies
+FROM pytorch/pytorch:latest AS builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    curl \
+    unzip \
+    gnupg \
+    xvfb \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy onnxruntime && \
+    python -m spacy download en_core_web_sm
+
+# Install Google Chrome and ChromeDriver (NOTE(review): the LATEST_RELEASE endpoint reportedly stopped at ChromeDriver 114 - confirm it still matches google-chrome-stable)
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y google-chrome-stable && \
+    wget -O /tmp/chromedriver.zip "https://chromedriver.storage.googleapis.com/$(curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip" && \
+    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
+
+# Second stage: create the final image
+FROM pytorch/pytorch:latest
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy ChromeDriver and Chrome from the builder stage (NOTE(review): only the /usr/bin/google-chrome launcher is copied; the Chrome install itself presumably lives elsewhere, e.g. /opt/google/chrome - confirm Chrome starts in this stage)
+COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
+COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
+
+# Copy installed Python packages from the builder stage (NOTE(review): assumes the base image's conda Python is 3.10 - confirm for pytorch/pytorch:latest)
+COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
+COPY --from=builder /opt/conda/bin /opt/conda/bin
+
+# Copy the rest of the application code
+COPY . .
+
+# Set environment to use Chrome and ChromeDriver properly
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Install the project in editable mode with all optional extras
+RUN pip install --no-cache-dir -e ".[all]"
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH=/opt/conda/bin:$PATH
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Download models at build time via the "crawl4ai-download-models" CLI
+RUN crawl4ai-download-models
+# RUN python crawl4ai/model_loader.py
+
+# Serve main:app with uvicorn on port 80 using 4 workers
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
diff --git a/DockerfileTest3 b/DockerfileTest3
new file mode 100644
index 00000000..dda6b4c9
--- /dev/null
+++ b/DockerfileTest3
@@ -0,0 +1,61 @@
+# Single-stage image: system, Python and browser dependencies plus the application
+FROM pytorch/pytorch:latest
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    curl \
+    unzip \
+    gnupg \
+    xvfb \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy onnxruntime && \
+    python -m spacy download en_core_web_sm
+
+# Install Google Chrome and ChromeDriver (NOTE(review): the LATEST_RELEASE endpoint reportedly stopped at ChromeDriver 114 - confirm it still matches google-chrome-stable)
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y google-chrome-stable && \
+    wget -O /tmp/chromedriver.zip "https://chromedriver.storage.googleapis.com/$(curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip" && \
+    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ && \
+    rm -rf /tmp/chromedriver.zip /var/lib/apt/lists/*
+
+# Copy the rest of the application code
+COPY . .
+
+# Set environment to use Chrome and ChromeDriver properly
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Install the project in editable mode with all optional extras
+RUN pip install --no-cache-dir -e ".[all]"
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH=/opt/conda/bin:$PATH
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Download models at build time via the "crawl4ai-download-models" CLI
+RUN crawl4ai-download-models
+# RUN python crawl4ai/model_loader.py
+
+# Serve main:app with uvicorn on port 80 using 4 workers
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+
diff --git a/README.md b/README.md
index ab4cf3f6..5af07211 100644
--- a/README.md
+++ b/README.md
@@ -188,7 +188,7 @@ pip install -e .[all]
 # docker build --platform linux/amd64 -t crawl4ai .
 # For other users
 # docker build -t crawl4ai .
-docker run -d -p 8000:80 crawl4ai
+docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
 ```
 
 
diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py
index 7e17f7f9..0b7838c1 100644
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -53,7 +53,6 @@ def set_model_device(model):
     model.to(device)    
     return model, device
 
-@lru_cache()
 def get_home_folder():
     home_folder = os.path.join(Path.home(), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
@@ -202,7 +201,7 @@ def load_spacy_model():
         repo_folder = os.path.join(home_folder, "crawl4ai")
         model_folder = os.path.join(home_folder, name)
 
-        # print("[LOG] ⏬ Downloading Spacy model for the first time...")
+        print("[LOG] ⏬ Downloading Spacy model for the first time...")
 
         # Remove existing repo folder if it exists
         if Path(repo_folder).exists():
@@ -230,7 +229,7 @@ def load_spacy_model():
             shutil.rmtree(repo_folder)
 
             # Print completion message
-            # print("[LOG] ✅ Spacy Model downloaded successfully")
+            print("[LOG] ✅ Spacy Model downloaded successfully")
         except subprocess.CalledProcessError as e:
             print(f"An error occurred while cloning the repository: {e}")
         except Exception as e:
@@ -255,8 +254,8 @@ def download_all_models(remove_existing=False):
     # Load each model to trigger download
     # print("[LOG] Downloading BERT Base Uncased...")
     # load_bert_base_uncased()
-    # print("[LOG] Downloading BGE Small EN v1.5...")
-    # load_bge_small_en_v1_5()
+    print("[LOG] Downloading BGE Small EN v1.5...")
+    load_bge_small_en_v1_5()
     # print("[LOG] Downloading ONNX model...")
     # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
diff --git a/setup.py b/setup.py
index 168dfac6..9d9f067d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,18 @@
 from setuptools import setup, find_packages
-import os
+import os, sys
+from pathlib import Path
 import subprocess
 from setuptools.command.install import install
 
+def get_home_folder():
+    """Create ~/.crawl4ai plus its cache/ and models/ subfolders; return the root path."""
+    root = os.path.join(Path.home(), ".crawl4ai")
+    for folder in (root, f"{root}/cache", f"{root}/models"):
+        os.makedirs(folder, exist_ok=True)  # exist_ok: repeated calls are harmless
+    return root
+
+home_folder = get_home_folder()
+
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()