Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-02 07:56:00 +00:00
parent 6ddccc144c ae77589a98
commit 9b0f71ba88
12 changed files with 214 additions and 71 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,5 @@ Crawl4AI.egg-info/
 requirements0.txt
 a.txt

-*.sh
+*.sh
+.idea
--- a/86
+++ b/86
@@ -1,43 +1,77 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+
+# First stage: Build and install dependencies
+FROM python:3.10-slim-bookworm AS builder

 # Set the working directory in the container
 WORKDIR /usr/src/app

-# Copy the current directory contents into the container at /usr/src/app
-COPY . .
-
-# Install dependencies for Chrome and ChromeDriver
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
    wget \
-    xvfb \
-    unzip \
    curl \
+    unzip 
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
+    python -m spacy download en_core_web_sm
+
+# Install the ChromeDriver matching current stable Chrome. The legacy
+# chromedriver.storage.googleapis.com bucket stops at v114, so use Chrome for Testing.
+RUN CHROMEDRIVER_VERSION=$(curl -fsSL https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_STABLE) && \
+    wget -q -P /tmp "https://storage.googleapis.com/chrome-for-testing-public/${CHROMEDRIVER_VERSION}/linux64/chromedriver-linux64.zip" && \
+    unzip -q /tmp/chromedriver-linux64.zip -d /tmp && \
+    install -m 0755 /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver && \
+    rm -rf /tmp/chromedriver-linux64.zip /tmp/chromedriver-linux64
+
+# Second stage: Create final runtime image
+FROM python:3.10-slim-bookworm
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    xvfb \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
-    software-properties-common \
-    && mkdir -p /etc/apt/keyrings \
-    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
-    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
-    && apt-get update \
-    && apt-get install -y google-chrome-stable \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get install -y chromium-chromedriver
+    software-properties-common && \
+    wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list

-# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install spacy torch torchvision torchaudio
+# Copy Chromedriver from the builder stage
+COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver

-# Set display port and dbus env to avoid hanging
-ENV DISPLAY=:99
-ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+# Copy installed Python packages from builder stage
+COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy the rest of the application code
+COPY . .
+
+# Set environment to use Chrome and ChromeDriver properly
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH=/usr/local/bin:$PATH

 # Make port 80 available to the world outside this container
 EXPOSE 80

-# Define environment variable
-ENV PYTHONUNBUFFERED 1
-
 # Run uvicorn
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+
+
--- a/45
+++ b/45
@@ -0,0 +1,45 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# In case you had some weird issues, try this Image
+# FROM python:3.10-slim-bookworm as builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Install Chrome (from Google's signed repository) and Debian's ChromeDriver package in one layer
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    xvfb \
+    unzip \
+    curl \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common \
+    && mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
+    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable chromium-driver \
+    # Remove the apt lists last: no package can be resolved once they are gone
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies (no pip cache, so downloaded wheels are not kept in the layers)
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir spacy torch torchvision torchaudio
+
+# Set display port and dbus env to avoid hanging
+ENV DISPLAY=:99
+ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Disable Python output buffering (key=value form; the space-separated ENV form is deprecated)
+ENV PYTHONUNBUFFERED=1
+
+# Run uvicorn
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.0 🕷️🤖
+# Crawl4AI v0.2.2 🕷️🤖

 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)

-## Recent Changes v0.2.0
+## Recent Changes 

+### v0.2.2
+- Support multiple JS scripts
+- Fixed some bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
@@ -30,8 +36,6 @@ from crawl4ai import WebCrawler
 # Create the WebCrawler instance 
 crawler = WebCrawler() 

-
-
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(url="https://www.nbcnews.com/business")
 print(result) # {url, html, markdown, extracted_content, metadata}
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
            )
            
            # Execute JS code if provided
-            if self.js_code:
+            if self.js_code and isinstance(self.js_code, str):
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )
+            elif self.js_code and isinstance(self.js_code, (list, tuple)):  # run each script in order
+                for js in self.js_code:
+                    self.driver.execute_script(js)
+                    WebDriverWait(self.driver, 10).until(
+                        lambda driver: driver.execute_script("return document.readyState") == "complete"
+                    )
            
            html = self.driver.page_source
            
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
        if self.verbose:
            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()  
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()  
+        self.get_embedding_method = "batch"
        
        self.buffer_embeddings = np.array([])

--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -2,6 +2,7 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
+import tarfile
 from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
 import urllib.request
@@ -34,8 +35,7 @@ def calculate_batch_size(device):
        else:
            return 32
    else:
-        return 16  # Default batch size
-    
+        return 16  # Default batch size   
    
@lru_cache()
 def get_device():
@@ -82,12 +82,19 @@ def load_bge_small_en_v1_5():
@lru_cache()
 def load_onnx_all_MiniLM_l6_v2():
    from crawl4ai.onnx_embedding import DefaultEmbeddingModel
-    model_path = "models/onnx/model.onnx"
-    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
-    download_path = os.path.join(__location__, model_path)

+    model_path = "models/onnx.tar.gz"
+    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
+    __location__ = os.path.realpath(
+        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    download_path = os.path.join(__location__, model_path)
+    onnx_dir = os.path.join(__location__, "models/onnx")
+    
+    # Create the models directory if it does not exist
+    os.makedirs(os.path.dirname(download_path), exist_ok=True)
+
+    # Download the tar.gz file if it does not exist
    if not os.path.exists(download_path):
-        # Define a download function with a simple progress display
        def download_with_progress(url, filename):
            def reporthook(block_num, block_size, total_size):
                downloaded = block_num * block_size
@@ -95,12 +102,22 @@ def load_onnx_all_MiniLM_l6_v2():
                if downloaded < total_size:
                    print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
                else:
-                    print("\rDownload complete!                              ")
+                    print("\rDownload complete!")

            urllib.request.urlretrieve(url, filename, reporthook)

        download_with_progress(model_url, download_path)

+    # Extract the tar.gz file if the onnx directory does not exist
+    if not os.path.exists(onnx_dir):
+        with tarfile.open(download_path, "r:gz") as tar:
+            tar.extractall(path=os.path.join(__location__, "models"))
+        
+        # remove the tar.gz file
+        os.remove(download_path)
+    
+    
+    
    model = DefaultEmbeddingModel()
    return model

@@ -240,8 +257,8 @@ def download_all_models(remove_existing=False):
    # load_bert_base_uncased()
    # print("[LOG] Downloading BGE Small EN v1.5...")
    # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
    _, device = load_text_multilabel_classifier()
    print(f"[LOG] Text classifier loaded on {device}")
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
+    multiple_scrip(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

--- a/main.py
+++ b/main.py
@@ -66,7 +66,7 @@ async def read_index(request: Request):

    for filename in os.listdir(partials_dir):
        if filename.endswith(".html"):
-            with open(os.path.join(partials_dir, filename), "r") as file:
+            with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
                partials[filename[:-5]] = file.read()

    return templates.TemplateResponse("index.html", {"request": request, **partials})
--- a/requirements.crawl.txt
+++ b/requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
-html2text==2024.2.26
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
-torch==2.3.0
-onnxruntime==1.14.1
-tokenizers==0.13.2
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
+torch
+onnxruntime
+tokenizers
--- a/setup.py
+++ b/setup.py
@@ -7,11 +7,16 @@ from setuptools.command.install import install
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()

+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req.strip() for req in requirements_crawl_only if req.strip()]  # keep the list read from the file; only drop blank lines

 class CustomInstallCommand(install):
    """Customized setuptools install command to install spacy without dependencies."""
@@ -34,7 +39,7 @@ setup(
    extras_require={
        "all": requirements,  # Include all requirements
        "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
    },
    cmdclass={
        'install': CustomInstallCommand,