Compare commits
2 Commits
docker-tes
...
hooks
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77da48050d | ||
|
|
9a97aacd85 |
BIN
.files/screenshot.png
Normal file
BIN
.files/screenshot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.5 MiB |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -179,6 +179,3 @@ docs/examples/.chainlit/
|
||||
docs/examples/.chainlit/*
|
||||
.chainlit/config.toml
|
||||
.chainlit/translations/en-US.json
|
||||
|
||||
local/
|
||||
.files/
|
||||
@@ -1,37 +0,0 @@
|
||||
|
||||
# First stage: Build and install dependencies
|
||||
FROM python:3.10-slim-bookworm as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
curl \
|
||||
unzip
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /usr/local/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Print helloworld when the container launches
|
||||
CMD ["echo", "Hello, World!"]
|
||||
@@ -1,73 +0,0 @@
|
||||
# First stage: Build and install dependencies
|
||||
FROM pytorch/pytorch:latest as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
git \
|
||||
curl \
|
||||
unzip \
|
||||
gnupg \
|
||||
xvfb \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy onnxruntime && \
|
||||
python -m spacy download en_core_web_sm
|
||||
|
||||
# Install Google Chrome and ChromeDriver
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
|
||||
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
|
||||
|
||||
# Second stage: Create the final image
|
||||
FROM pytorch/pytorch:latest
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy Chromedriver and Chrome from the builder stage
|
||||
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
|
||||
COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
|
||||
|
||||
# Copy installed Python packages from builder stage
|
||||
COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=builder /opt/conda/bin /opt/conda/bin
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# pip install -e .[all]
|
||||
RUN pip install --no-cache-dir -e .[all]
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Download models call cli "crawl4ai-download-models"
|
||||
RUN crawl4ai-download-models
|
||||
# RUN python crawl4ai/model_loader.py
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
@@ -1,61 +0,0 @@
|
||||
# First stage: Build and install dependencies
|
||||
FROM pytorch/pytorch:latest
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
git \
|
||||
curl \
|
||||
unzip \
|
||||
gnupg \
|
||||
xvfb \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy onnxruntime && \
|
||||
python -m spacy download en_core_web_sm
|
||||
|
||||
# Install Google Chrome and ChromeDriver
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
|
||||
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# pip install -e .[all]
|
||||
RUN pip install --no-cache-dir -e .[all]
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Download models call cli "crawl4ai-download-models"
|
||||
RUN crawl4ai-download-models
|
||||
# RUN python crawl4ai/model_loader.py
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
|
||||
|
||||
@@ -188,7 +188,7 @@ pip install -e .[all]
|
||||
# docker build --platform linux/amd64 -t crawl4ai .
|
||||
# For other users
|
||||
# docker build -t crawl4ai .
|
||||
docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ import logging
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
from typing import List
|
||||
from typing import List, Callable
|
||||
import requests
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
def update_user_agent(self, user_agent: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
pass
|
||||
|
||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html = False):
|
||||
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
# Hooks
|
||||
self.hooks = {
|
||||
'on_driver_created': None,
|
||||
'before_get_url': None,
|
||||
'after_get_url': None,
|
||||
'before_return_html': None
|
||||
}
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
if hook_type in self.hooks:
|
||||
self.hooks[hook_type] = hook
|
||||
else:
|
||||
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||
|
||||
def execute_hook(self, hook_type: str, *args):
|
||||
hook = self.hooks.get(hook_type)
|
||||
if hook:
|
||||
result = hook(*args)
|
||||
if result is not None:
|
||||
if isinstance(result, webdriver.Chrome):
|
||||
return result
|
||||
else:
|
||||
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
|
||||
# If the hook returns None or there is no hook, return self.driver
|
||||
return self.driver
|
||||
|
||||
def update_user_agent(self, user_agent: str):
|
||||
self.options.add_argument(f"user-agent={user_agent}")
|
||||
self.driver.quit()
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||
|
||||
def set_custom_headers(self, headers: dict):
|
||||
# Enable Network domain for sending headers
|
||||
self.driver.execute_cdp_cmd('Network.enable', {})
|
||||
# Set extra HTTP headers
|
||||
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
|
||||
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
# Create md5 hash of the URL
|
||||
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
return f.read()
|
||||
|
||||
try:
|
||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||
|
||||
# Execute JS code if provided
|
||||
if self.js_code and type(self.js_code) == str:
|
||||
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
)
|
||||
|
||||
html = self.driver.page_source
|
||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||
|
||||
# Store in cache
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
|
||||
@@ -53,6 +53,7 @@ def set_model_device(model):
|
||||
model.to(device)
|
||||
return model, device
|
||||
|
||||
@lru_cache()
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
@@ -201,7 +202,7 @@ def load_spacy_model():
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
@@ -229,7 +230,7 @@ def load_spacy_model():
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
# Print completion message
|
||||
print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
# print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
@@ -254,8 +255,8 @@ def download_all_models(remove_existing=False):
|
||||
# Load each model to trigger download
|
||||
# print("[LOG] Downloading BERT Base Uncased...")
|
||||
# load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
# load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading ONNX model...")
|
||||
# load_onnx_all_MiniLM_l6_v2()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
|
||||
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def using_crawler_hooks(crawler):
|
||||
# Example usage of the hooks for authentication and setting a cookie
|
||||
def on_driver_created(driver):
|
||||
print("[HOOK] on_driver_created")
|
||||
# Example customization: maximize the window
|
||||
driver.maximize_window()
|
||||
|
||||
# Example customization: logging in to a hypothetical website
|
||||
driver.get('https://example.com/login')
|
||||
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.NAME, 'username'))
|
||||
)
|
||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
||||
driver.find_element(By.NAME, 'login').click()
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.ID, 'welcome'))
|
||||
)
|
||||
# Add a custom cookie
|
||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
||||
return driver
|
||||
|
||||
|
||||
def before_get_url(driver):
|
||||
print("[HOOK] before_get_url")
|
||||
# Example customization: add a custom header
|
||||
# Enable Network domain for sending headers
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
# Add a custom header
|
||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
||||
return driver
|
||||
|
||||
def after_get_url(driver):
|
||||
print("[HOOK] after_get_url")
|
||||
# Example customization: log the URL
|
||||
print(driver.current_url)
|
||||
return driver
|
||||
|
||||
def before_return_html(driver, html):
|
||||
print("[HOOK] before_return_html")
|
||||
# Example customization: log the HTML
|
||||
print(len(html))
|
||||
return driver
|
||||
|
||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||
|
||||
crawler.set_hook('on_driver_created', on_driver_created)
|
||||
crawler.set_hook('before_get_url', before_get_url)
|
||||
crawler.set_hook('after_get_url', after_get_url)
|
||||
crawler.set_hook('before_return_html', before_return_html)
|
||||
|
||||
result = crawler.run(url="https://example.com")
|
||||
|
||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||
print_result(result= result)
|
||||
|
||||
def main():
|
||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||
|
||||
12
setup.py
12
setup.py
@@ -1,18 +1,8 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
import os
|
||||
import subprocess
|
||||
from setuptools.command.install import install
|
||||
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
return home_folder
|
||||
|
||||
home_folder = get_home_folder()
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.txt") as f:
|
||||
requirements = f.read().splitlines()
|
||||
|
||||
Reference in New Issue
Block a user