chore: Add custom headers to LocalSeleniumCrawlerStrategy

chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy
2024-06-17 15:50:03 +08:00 · 2024-06-17 15:37:18 +08:00
10 changed files with 107 additions and 191 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -179,6 +179,3 @@ docs/examples/.chainlit/
 docs/examples/.chainlit/*
 .chainlit/config.toml
 .chainlit/translations/en-US.json
-
-local/
-.files/
--- a/37
+++ b/37
@@ -1,37 +0,0 @@
-
-# First stage: Build and install dependencies
-FROM python:3.10-slim-bookworm as builder
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    curl \
-    unzip 
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /usr/local/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Print helloworld when the container launches
-CMD ["echo", "Hello, World!"]
--- a/73
+++ b/73
@@ -1,73 +0,0 @@
-# First stage: Build and install dependencies
-FROM pytorch/pytorch:latest as builder
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    git \
-    curl \
-    unzip \
-    gnupg \
-    xvfb \
-    ca-certificates \
-    apt-transport-https \
-    software-properties-common && \
-    rm -rf /var/lib/apt/lists/*    
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy onnxruntime && \
-    python -m spacy download en_core_web_sm
-
-# Install Google Chrome and ChromeDriver
-RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
-    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
-    apt-get update && \
-    apt-get install -y google-chrome-stable && \
-    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
-    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
-
-# Second stage: Create the final image
-FROM pytorch/pytorch:latest
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Copy Chromedriver and Chrome from the builder stage
-COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
-COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
-
-# Copy installed Python packages from builder stage
-COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
-COPY --from=builder /opt/conda/bin /opt/conda/bin
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-#  pip install -e .[all]
-RUN pip install --no-cache-dir -e .[all]
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /opt/conda/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Download models call cli "crawl4ai-download-models"
-RUN crawl4ai-download-models
-# RUN python crawl4ai/model_loader.py
-
-# Run uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/61
+++ b/61
@@ -1,61 +0,0 @@
-# First stage: Build and install dependencies
-FROM pytorch/pytorch:latest 
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    git \
-    curl \
-    unzip \
-    gnupg \
-    xvfb \
-    ca-certificates \
-    apt-transport-https \
-    software-properties-common && \
-    rm -rf /var/lib/apt/lists/*    
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy onnxruntime && \
-    python -m spacy download en_core_web_sm
-
-# Install Google Chrome and ChromeDriver
-RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
-    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
-    apt-get update && \
-    apt-get install -y google-chrome-stable && \
-    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
-    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-#  pip install -e .[all]
-RUN pip install --no-cache-dir -e .[all]
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /opt/conda/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Download models call cli "crawl4ai-download-models"
-RUN crawl4ai-download-models
-# RUN python crawl4ai/model_loader.py
-
-# Run uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
-
-
--- a/README.md
+++ b/README.md
@@ -188,7 +188,7 @@ pip install -e .[all]
 # docker build --platform linux/amd64 -t crawl4ai .
 # For other users
 # docker build -t crawl4ai .
-docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
+docker run -d -p 8000:80 crawl4ai
 ```


--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -10,7 +10,7 @@ import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
-from typing import List
+from typing import List, Callable
 import requests
 import os
 from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
    @abstractmethod
    def update_user_agent(self, user_agent: str):
        pass
+    
+    @abstractmethod
+    def set_hook(self, hook_type: str, hook: Callable):
+        """Register ``hook`` as the callback for the ``hook_type`` event.
+
+        Abstract: the base class provides no behavior; each concrete strategy
+        supplies its own implementation.
+        """
+        # NOTE(review): as an abstract method this must be implemented by every
+        # concrete subclass before that subclass can be instantiated — confirm
+        # that CloudCrawlerStrategy (declared right after this class) has one.
+        pass

 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.use_cached_html = use_cached_html
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)
+        
+        # Hooks
+        self.hooks = {
+            'on_driver_created': None,
+            'before_get_url': None,
+            'after_get_url': None,
+            'before_return_html': None
+        }

        # chromedriver_autoinstaller.install()
        import chromedriver_autoinstaller
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

+    def set_hook(self, hook_type: str, hook: Callable):
+        """Register a callback for one of the strategy's hook points.
+
+        Args:
+            hook_type: Key of the hook to set; must already exist in ``self.hooks``.
+            hook: Callable to invoke for that hook, or ``None`` to clear it.
+
+        Raises:
+            ValueError: If ``hook_type`` is not a key of ``self.hooks``.
+            TypeError: If ``hook`` is neither callable nor ``None``.
+        """
+        if hook_type not in self.hooks:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+        # Reject non-callables here so a bad registration fails immediately
+        # instead of surfacing later, mid-crawl, when the hook is invoked.
+        if hook is not None and not callable(hook):
+            raise TypeError(
+                f"Hook for {hook_type} must be callable or None, got {type(hook).__name__}"
+            )
+        self.hooks[hook_type] = hook
+    
+    def execute_hook(self, hook_type: str, *args):
+        """Invoke the hook stored under ``hook_type``, forwarding ``*args`` to it.
+
+        Returns:
+            The hook's return value when it is a ``webdriver.Chrome`` instance;
+            ``self.driver`` when no hook is set or the hook returns ``None``.
+
+        Raises:
+            TypeError: If the hook returns anything other than a
+                ``webdriver.Chrome`` instance or ``None``.
+        """
+        callback = self.hooks.get(hook_type)
+        # No (or falsy) hook registered: keep using the current driver.
+        if not callback:
+            return self.driver
+        outcome = callback(*args)
+        # A hook that returns nothing leaves the current driver in place.
+        if outcome is None:
+            return self.driver
+        if not isinstance(outcome, webdriver.Chrome):
+            raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
+        return outcome
+
    def update_user_agent(self, user_agent: str):
        self.options.add_argument(f"user-agent={user_agent}")
        self.driver.quit()
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)
+
+    def set_custom_headers(self, headers: dict):
+        """Issue two CDP commands on the current driver, in order:
+        ``Network.enable`` and then ``Network.setExtraHTTPHeaders`` carrying
+        ``headers``.
+        """
+        cdp_calls = (
+            # Enable the Network domain first so headers can be sent
+            ('Network.enable', {}),
+            # Then install the extra HTTP headers
+            ('Network.setExtraHTTPHeaders', {'headers': headers}),
+        )
+        for command, params in cdp_calls:
+            self.driver.execute_cdp_cmd(command, params)
+

    def crawl(self, url: str) -> str:
        # Create md5 hash of the URL
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                    return f.read()

        try:
+            self.driver = self.execute_hook('before_get_url', self.driver)
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
+            self.driver = self.execute_hook('after_get_url', self.driver)
            
            # Execute JS code if provided
            if self.js_code and type(self.js_code) == str:
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                    )
            
            html = self.driver.page_source
+            self.driver = self.execute_hook('before_return_html', self.driver, html)
            
            # Store in cache
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -53,6 +53,7 @@ def set_model_device(model):
    model.to(device)    
    return model, device

+@lru_cache()
 def get_home_folder():
    home_folder = os.path.join(Path.home(), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
@@ -201,7 +202,7 @@ def load_spacy_model():
        repo_folder = os.path.join(home_folder, "crawl4ai")
        model_folder = os.path.join(home_folder, name)

-        print("[LOG] ⏬ Downloading Spacy model for the first time...")
+        # print("[LOG] ⏬ Downloading Spacy model for the first time...")

        # Remove existing repo folder if it exists
        if Path(repo_folder).exists():
@@ -229,7 +230,7 @@ def load_spacy_model():
            shutil.rmtree(repo_folder)

            # Print completion message
-            print("[LOG] ✅ Spacy Model downloaded successfully")
+            # print("[LOG] ✅ Spacy Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
        except Exception as e:
@@ -254,8 +255,8 @@ def download_all_models(remove_existing=False):
    # Load each model to trigger download
    # print("[LOG] Downloading BERT Base Uncased...")
    # load_bert_base_uncased()
-    print("[LOG] Downloading BGE Small EN v1.5...")
-    load_bge_small_en_v1_5()
+    # print("[LOG] Downloading BGE Small EN v1.5...")
+    # load_bge_small_en_v1_5()
    # print("[LOG] Downloading ONNX model...")
    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+def using_crawler_hooks(crawler):
+    """Demonstrate the crawler hooks: register one callback per hook, then crawl a page.
+
+    Args:
+        crawler: Object on which ``set_hook(hook_type, hook)`` and
+            ``run(url=...)`` are called.
+    """
+    # NOTE(review): set_hook is only shown being defined on the crawler strategy
+    # classes — confirm the object passed in here actually exposes it.
+
+    # Import the Selenium helpers used by the hooks explicitly, so this example
+    # does not depend on By / EC happening to be in scope from another import.
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support import expected_conditions as EC
+    from selenium.webdriver.support.ui import WebDriverWait
+
+    # Example usage of the hooks for authentication and setting a cookie
+    def on_driver_created(driver):
+        """Maximize the window, log in to a hypothetical site, add a cookie; return the driver."""
+        print("[HOOK] on_driver_created")
+        # Example customization: maximize the window
+        driver.maximize_window()
+
+        # Example customization: logging in to a hypothetical website
+        driver.get('https://example.com/login')
+
+        # Wait (up to 10 s) for the login form before typing into it
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.NAME, 'username'))
+        )
+        driver.find_element(By.NAME, 'username').send_keys('testuser')
+        driver.find_element(By.NAME, 'password').send_keys('password123')
+        driver.find_element(By.NAME, 'login').click()
+        # Wait (up to 10 s) for the post-login element before continuing
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, 'welcome'))
+        )
+        # Add a custom cookie
+        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
+        return driver
+
+    def before_get_url(driver):
+        """Attach a custom HTTP header through CDP; return the driver."""
+        print("[HOOK] before_get_url")
+        # Example customization: add a custom header
+        # Enable Network domain for sending headers
+        driver.execute_cdp_cmd('Network.enable', {})
+        # Add a custom header
+        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+        return driver
+
+    def after_get_url(driver):
+        """Print the driver's current URL; return the driver."""
+        print("[HOOK] after_get_url")
+        # Example customization: log the URL
+        print(driver.current_url)
+        return driver
+
+    def before_return_html(driver, html):
+        """Print the length of ``html``; return the driver."""
+        print("[HOOK] before_return_html")
+        # Example customization: log the HTML
+        print(len(html))
+        return driver
+
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
+
+    crawler.set_hook('on_driver_created', on_driver_created)
+    crawler.set_hook('before_get_url', before_get_url)
+    crawler.set_hook('after_get_url', after_get_url)
+    crawler.set_hook('before_return_html', before_return_html)
+
+    result = crawler.run(url="https://example.com")
+
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result=result)
+
+
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,8 @@
 from setuptools import setup, find_packages
-import os, sys
-from pathlib import Path
+import os
 import subprocess
 from setuptools.command.install import install

-def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
-    os.makedirs(home_folder, exist_ok=True)
-    os.makedirs(f"{home_folder}/cache", exist_ok=True)
-    os.makedirs(f"{home_folder}/models", exist_ok=True)
-    return home_folder 
-
-home_folder = get_home_folder()
-
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()
Author	SHA1	Message	Date
unclecode	77da48050d	chore: Add custom headers to LocalSeleniumCrawlerStrategy	2024-06-17 15:50:03 +08:00
unclecode	9a97aacd85	chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy	2024-06-17 15:37:18 +08:00