Compare commits

..

2 Commits

Author SHA1 Message Date
unclecode
77da48050d chore: Add custom headers to LocalSeleniumCrawlerStrategy 2024-06-17 15:50:03 +08:00
unclecode
9a97aacd85 chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy 2024-06-17 15:37:18 +08:00
10 changed files with 107 additions and 191 deletions

BIN
.files/screenshot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

3
.gitignore vendored
View File

@@ -179,6 +179,3 @@ docs/examples/.chainlit/
docs/examples/.chainlit/* docs/examples/.chainlit/*
.chainlit/config.toml .chainlit/config.toml
.chainlit/translations/en-US.json .chainlit/translations/en-US.json
local/
.files/

View File

@@ -1,37 +0,0 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm as builder
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
curl \
unzip
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /usr/local/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Print "Hello, World!" when the container launches
CMD ["echo", "Hello, World!"]

View File

@@ -1,73 +0,0 @@
# First stage: Build and install dependencies
FROM pytorch/pytorch:latest as builder
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy onnxruntime && \
python -m spacy download en_core_web_sm
# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable && \
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
# Second stage: Create the final image
FROM pytorch/pytorch:latest
# Set the working directory in the container
WORKDIR /usr/src/app
# Copy Chromedriver and Chrome from the builder stage
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
# Copy installed Python packages from builder stage
COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
COPY --from=builder /opt/conda/bin /opt/conda/bin
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# pip install -e .[all]
RUN pip install --no-cache-dir -e .[all]
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Download models by running the CLI command "crawl4ai-download-models"
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

View File

@@ -1,61 +0,0 @@
# First stage: Build and install dependencies
FROM pytorch/pytorch:latest
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy onnxruntime && \
python -m spacy download en_core_web_sm
# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable && \
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# pip install -e .[all]
RUN pip install --no-cache-dir -e .[all]
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Download models by running the CLI command "crawl4ai-download-models"
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

View File

@@ -188,7 +188,7 @@ pip install -e .[all]
# docker build --platform linux/amd64 -t crawl4ai . # docker build --platform linux/amd64 -t crawl4ai .
# For other users # For other users
# docker build -t crawl4ai . # docker build -t crawl4ai .
docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai docker run -d -p 8000:80 crawl4ai
``` ```

View File

@@ -10,7 +10,7 @@ import logging
import base64 import base64
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from io import BytesIO from io import BytesIO
from typing import List from typing import List, Callable
import requests import requests
import os import os
from pathlib import Path from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
@abstractmethod @abstractmethod
def update_user_agent(self, user_agent: str): def update_user_agent(self, user_agent: str):
pass pass
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
pass
class CloudCrawlerStrategy(CrawlerStrategy): class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False): def __init__(self, use_cached_html = False):
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.use_cached_html = use_cached_html self.use_cached_html = use_cached_html
self.js_code = js_code self.js_code = js_code
self.verbose = kwargs.get("verbose", False) self.verbose = kwargs.get("verbose", False)
# Hooks
self.hooks = {
'on_driver_created': None,
'before_get_url': None,
'after_get_url': None,
'before_return_html': None
}
# chromedriver_autoinstaller.install() # chromedriver_autoinstaller.install()
import chromedriver_autoinstaller import chromedriver_autoinstaller
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.service.log_path = "NUL" self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = webdriver.Chrome(service=self.service, options=self.options)
def set_hook(self, hook_type: str, hook: Callable):
if hook_type in self.hooks:
self.hooks[hook_type] = hook
else:
raise ValueError(f"Invalid hook type: {hook_type}")
def execute_hook(self, hook_type: str, *args):
hook = self.hooks.get(hook_type)
if hook:
result = hook(*args)
if result is not None:
if isinstance(result, webdriver.Chrome):
return result
else:
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
# If the hook returns None or there is no hook, return self.driver
return self.driver
def update_user_agent(self, user_agent: str): def update_user_agent(self, user_agent: str):
self.options.add_argument(f"user-agent={user_agent}") self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit() self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = webdriver.Chrome(service=self.service, options=self.options)
self.driver = self.execute_hook('on_driver_created', self.driver)
def set_custom_headers(self, headers: dict):
# Enable Network domain for sending headers
self.driver.execute_cdp_cmd('Network.enable', {})
# Set extra HTTP headers
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
def crawl(self, url: str) -> str: def crawl(self, url: str) -> str:
# Create md5 hash of the URL # Create md5 hash of the URL
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return f.read() return f.read()
try: try:
self.driver = self.execute_hook('before_get_url', self.driver)
if self.verbose: if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...") print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
self.driver.get(url) self.driver.get(url)
WebDriverWait(self.driver, 10).until( WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html")) EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
) )
self.driver = self.execute_hook('after_get_url', self.driver)
# Execute JS code if provided # Execute JS code if provided
if self.js_code and type(self.js_code) == str: if self.js_code and type(self.js_code) == str:
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
) )
html = self.driver.page_source html = self.driver.page_source
self.driver = self.execute_hook('before_return_html', self.driver, html)
# Store in cache # Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)

View File

@@ -53,6 +53,7 @@ def set_model_device(model):
model.to(device) model.to(device)
return model, device return model, device
@lru_cache()
def get_home_folder(): def get_home_folder():
home_folder = os.path.join(Path.home(), ".crawl4ai") home_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(home_folder, exist_ok=True) os.makedirs(home_folder, exist_ok=True)
@@ -201,7 +202,7 @@ def load_spacy_model():
repo_folder = os.path.join(home_folder, "crawl4ai") repo_folder = os.path.join(home_folder, "crawl4ai")
model_folder = os.path.join(home_folder, name) model_folder = os.path.join(home_folder, name)
print("[LOG] ⏬ Downloading Spacy model for the first time...") # print("[LOG] ⏬ Downloading Spacy model for the first time...")
# Remove existing repo folder if it exists # Remove existing repo folder if it exists
if Path(repo_folder).exists(): if Path(repo_folder).exists():
@@ -229,7 +230,7 @@ def load_spacy_model():
shutil.rmtree(repo_folder) shutil.rmtree(repo_folder)
# Print completion message # Print completion message
print("[LOG] ✅ Spacy Model downloaded successfully") # print("[LOG] ✅ Spacy Model downloaded successfully")
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"An error occurred while cloning the repository: {e}") print(f"An error occurred while cloning the repository: {e}")
except Exception as e: except Exception as e:
@@ -254,8 +255,8 @@ def download_all_models(remove_existing=False):
# Load each model to trigger download # Load each model to trigger download
# print("[LOG] Downloading BERT Base Uncased...") # print("[LOG] Downloading BERT Base Uncased...")
# load_bert_base_uncased() # load_bert_base_uncased()
print("[LOG] Downloading BGE Small EN v1.5...") # print("[LOG] Downloading BGE Small EN v1.5...")
load_bge_small_en_v1_5() # load_bge_small_en_v1_5()
# print("[LOG] Downloading ONNX model...") # print("[LOG] Downloading ONNX model...")
# load_onnx_all_MiniLM_l6_v2() # load_onnx_all_MiniLM_l6_v2()
print("[LOG] Downloading text classifier...") print("[LOG] Downloading text classifier...")

View File

@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result) print_result(result)
def using_crawler_hooks(crawler):
# Example usage of the hooks for authentication and setting a cookie
def on_driver_created(driver):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
driver.maximize_window()
# Example customization: logging in to a hypothetical website
driver.get('https://example.com/login')
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, 'username'))
)
driver.find_element(By.NAME, 'username').send_keys('testuser')
driver.find_element(By.NAME, 'password').send_keys('password123')
driver.find_element(By.NAME, 'login').click()
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, 'welcome'))
)
# Add a custom cookie
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
return driver
def before_get_url(driver):
print("[HOOK] before_get_url")
# Example customization: add a custom header
# Enable Network domain for sending headers
driver.execute_cdp_cmd('Network.enable', {})
# Add a custom header
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
return driver
def after_get_url(driver):
print("[HOOK] after_get_url")
# Example customization: log the URL
print(driver.current_url)
return driver
def before_return_html(driver, html):
print("[HOOK] before_return_html")
# Example customization: log the HTML
print(len(html))
return driver
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
crawler.set_hook('on_driver_created', on_driver_created)
crawler.set_hook('before_get_url', before_get_url)
crawler.set_hook('after_get_url', after_get_url)
crawler.set_hook('before_return_html', before_return_html)
result = crawler.run(url="https://example.com")
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result= result)
def main(): def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")

View File

@@ -1,18 +1,8 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
import os, sys import os
from pathlib import Path
import subprocess import subprocess
from setuptools.command.install import install from setuptools.command.install import install
def get_home_folder():
home_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(home_folder, exist_ok=True)
os.makedirs(f"{home_folder}/cache", exist_ok=True)
os.makedirs(f"{home_folder}/models", exist_ok=True)
return home_folder
home_folder = get_home_folder()
# Read the requirements from requirements.txt # Read the requirements from requirements.txt
with open("requirements.txt") as f: with open("requirements.txt") as f:
requirements = f.read().splitlines() requirements = f.read().splitlines()