This commit is contained in:
Unclecode
2024-06-02 07:56:00 +00:00
12 changed files with 214 additions and 71 deletions

1
.gitignore vendored
View File

@@ -174,3 +174,4 @@ requirements0.txt
a.txt a.txt
*.sh *.sh
.idea

View File

@@ -1,43 +1,77 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim # First stage: Build and install dependencies
FROM python:3.10-slim-bookworm as builder
# Set the working directory in the container # Set the working directory in the container
WORKDIR /usr/src/app WORKDIR /usr/src/app
# Copy the current directory contents into the container at /usr/src/app # Install build dependencies
COPY . . RUN apt-get update && \
apt-get install -y --no-install-recommends \
# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \ wget \
xvfb \
unzip \
curl \ curl \
unzip
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
python -m spacy download en_core_web_sm
# Download and install ChromeDriver
RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
unzip /tmp/chromedriver_linux64.zip -d /tmp && \
mv /tmp/chromedriver /usr/local/bin/chromedriver && \
chmod +x /usr/local/bin/chromedriver && \
rm /tmp/chromedriver_linux64.zip
# Second stage: Create final runtime image
FROM python:3.10-slim-bookworm
# Set the working directory in the container
WORKDIR /usr/src/app
# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
xvfb \
gnupg2 \ gnupg2 \
ca-certificates \ ca-certificates \
apt-transport-https \ apt-transport-https \
software-properties-common \ software-properties-common && \
&& mkdir -p /etc/apt/keyrings \ wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \ echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \ apt-get update && \
&& apt-get update \ apt-get install -y --no-install-recommends google-chrome-stable && \
&& apt-get install -y google-chrome-stable \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
&& rm -rf /var/lib/apt/lists/* \
&& apt-get install -y chromium-chromedriver
# Install Python dependencies # Copy Chromedriver from the builder stage
RUN pip install --no-cache-dir -r requirements.txt COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
RUN pip install spacy torch torchvision torchaudio
# Set display port and dbus env to avoid hanging # Copy installed Python packages from builder stage
ENV DISPLAY=:99 COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null COPY --from=builder /usr/local/bin /usr/local/bin
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /usr/local/bin:$PATH
# Make port 80 available to the world outside this container # Make port 80 available to the world outside this container
EXPOSE 80 EXPOSE 80
# Define environment variable
ENV PYTHONUNBUFFERED 1
# Run uvicorn # Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

45
Dockerfile-version-0 Normal file
View File

@@ -0,0 +1,45 @@
# Single-stage image: Python 3.10 + Google Chrome + ChromeDriver, serving the
# FastAPI app (main:app) with uvicorn on port 80.

# Use an official Python runtime as a parent image
FROM python:3.10-slim
# If you run into build issues, try this image instead:
# FROM python:3.10-slim-bookworm as builder

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy the current directory contents into the container at /usr/src/app
COPY . .

# Install dependencies for Chrome and ChromeDriver.
# The Google signing key is stored in a dedicated keyring and referenced via
# signed-by, so it is only trusted for the Chrome repository.
# The apt list cleanup must be the LAST step of this RUN: once
# /var/lib/apt/lists is emptied, any later `apt-get install` has no package
# index to resolve against.
# NOTE(review): `chromium-chromedriver` is kept as in the original; confirm the
# package name exists for the base image's distribution release.
RUN apt-get update && apt-get install -y --no-install-recommends \
        wget \
        xvfb \
        unzip \
        curl \
        gnupg2 \
        ca-certificates \
        apt-transport-https \
        software-properties-common \
    && mkdir -p /etc/apt/keyrings \
    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && apt-get install -y chromium-chromedriver \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (no pip cache, to keep the layers small)
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir spacy torch torchvision torchaudio

# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null

# Make port 80 available to the world outside this container
EXPOSE 80

# Define environment variable
ENV PYTHONUNBUFFERED=1

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.0 🕷️🤖 # Crawl4AI v0.2.2 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
## Recent Changes v0.2.0 ## Recent Changes
### v0.2.2
- Support multiple JS scripts
- Fixed several bugs
- Resolved a few issues related to Colab installation
### v0.2.0
- 🚀 10x faster!! - 🚀 10x faster!!
- 📜 Execute custom JavaScript before crawling! - 📜 Execute custom JavaScript before crawling!
- 🤝 Colab friendly! - 🤝 Colab friendly!
@@ -30,8 +36,6 @@ from crawl4ai import WebCrawler
# Create the WebCrawler instance # Create the WebCrawler instance
crawler = WebCrawler() crawler = WebCrawler()
# Run the crawler with keyword filtering and CSS selector # Run the crawler with keyword filtering and CSS selector
result = crawler.run(url="https://www.nbcnews.com/business") result = crawler.run(url="https://www.nbcnews.com/business")
print(result) # {url, html, markdown, extracted_content, metadata} print(result) # {url, html, markdown, extracted_content, metadata}

View File

@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
) )
# Execute JS code if provided # Execute JS code if provided
if self.js_code: if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code) self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code # Optionally, wait for some condition after executing the JS code
WebDriverWait(self.driver, 10).until( WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete" lambda driver: driver.execute_script("return document.readyState") == "complete"
) )
elif self.js_code and type(self.js_code) == list:
for js in self.js_code:
self.driver.execute_script(js)
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
html = self.driver.page_source html = self.driver.page_source

View File

@@ -188,11 +188,12 @@ class CosineStrategy(ExtractionStrategy):
if self.verbose: if self.verbose:
print(f"[LOG] Loading Extraction Model for {self.device.type} device.") print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
if False and self.device.type == "cpu": # if False and self.device.type == "cpu":
self.model = load_onnx_all_MiniLM_l6_v2() # self.model = load_onnx_all_MiniLM_l6_v2()
self.tokenizer = self.model.tokenizer # self.tokenizer = self.model.tokenizer
self.get_embedding_method = "direct" # self.get_embedding_method = "direct"
else: # else:
self.tokenizer, self.model = load_bge_small_en_v1_5() self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval() self.model.eval()
self.get_embedding_method = "batch" self.get_embedding_method = "batch"

View File

@@ -2,6 +2,7 @@ from functools import lru_cache
from pathlib import Path from pathlib import Path
import subprocess, os import subprocess, os
import shutil import shutil
import tarfile
from crawl4ai.config import MODEL_REPO_BRANCH from crawl4ai.config import MODEL_REPO_BRANCH
import argparse import argparse
import urllib.request import urllib.request
@@ -36,7 +37,6 @@ def calculate_batch_size(device):
else: else:
return 16 # Default batch size return 16 # Default batch size
@lru_cache() @lru_cache()
def get_device(): def get_device():
import torch import torch
@@ -82,12 +82,19 @@ def load_bge_small_en_v1_5():
@lru_cache() @lru_cache()
def load_onnx_all_MiniLM_l6_v2(): def load_onnx_all_MiniLM_l6_v2():
from crawl4ai.onnx_embedding import DefaultEmbeddingModel from crawl4ai.onnx_embedding import DefaultEmbeddingModel
model_path = "models/onnx/model.onnx"
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
download_path = os.path.join(__location__, model_path)
model_path = "models/onnx.tar.gz"
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
__location__ = os.path.realpath(
os.path.join(os.getcwd(), os.path.dirname(__file__)))
download_path = os.path.join(__location__, model_path)
onnx_dir = os.path.join(__location__, "models/onnx")
# Create the models directory if it does not exist
os.makedirs(os.path.dirname(download_path), exist_ok=True)
# Download the tar.gz file if it does not exist
if not os.path.exists(download_path): if not os.path.exists(download_path):
# Define a download function with a simple progress display
def download_with_progress(url, filename): def download_with_progress(url, filename):
def reporthook(block_num, block_size, total_size): def reporthook(block_num, block_size, total_size):
downloaded = block_num * block_size downloaded = block_num * block_size
@@ -101,6 +108,16 @@ def load_onnx_all_MiniLM_l6_v2():
download_with_progress(model_url, download_path) download_with_progress(model_url, download_path)
# Extract the tar.gz file if the onnx directory does not exist
if not os.path.exists(onnx_dir):
with tarfile.open(download_path, "r:gz") as tar:
tar.extractall(path=os.path.join(__location__, "models"))
# remove the tar.gz file
os.remove(download_path)
model = DefaultEmbeddingModel() model = DefaultEmbeddingModel()
return model return model
@@ -240,8 +257,8 @@ def download_all_models(remove_existing=False):
# load_bert_base_uncased() # load_bert_base_uncased()
# print("[LOG] Downloading BGE Small EN v1.5...") # print("[LOG] Downloading BGE Small EN v1.5...")
# load_bge_small_en_v1_5() # load_bge_small_en_v1_5()
print("[LOG] Downloading ONNX model...") # print("[LOG] Downloading ONNX model...")
load_onnx_all_MiniLM_l6_v2() # load_onnx_all_MiniLM_l6_v2()
print("[LOG] Downloading text classifier...") print("[LOG] Downloading text classifier...")
_, device = load_text_multilabel_classifier() _, device = load_text_multilabel_classifier()
print(f"[LOG] Text classifier loaded on {device}") print(f"[LOG] Text classifier loaded on {device}")

View File

@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result) print_result(result)
def multiple_scrip(crawler):
    """Demo: pass a list of JavaScript snippets to run on the page.

    The same "click the Load More button" snippet is supplied twice, as a
    list, to ``LocalSeleniumCrawlerStrategy``. A new ``WebCrawler`` is built
    around that strategy with the cache bypassed, so the ``crawler`` argument
    passed in is not the one used for this crawl.
    """
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")

    # One snippet, repeated, to exercise the list-of-scripts code path.
    click_load_more = (
        "\n"
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));\n"
        "loadMoreButton && loadMoreButton.click();\n"
    )
    scripts = [click_load_more, click_load_more]

    strategy = LocalSeleniumCrawlerStrategy(js_code=scripts)
    js_crawler = WebCrawler(crawler_strategy=strategy, always_by_pass_cache=True)
    result = js_crawler.run(url="https://www.nbcnews.com/business")

    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
def main(): def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
add_llm_extraction_strategy(crawler) add_llm_extraction_strategy(crawler)
targeted_extraction(crawler) targeted_extraction(crawler)
interactive_extraction(crawler) interactive_extraction(crawler)
multiple_scrip(crawler)
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]") cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

View File

@@ -66,7 +66,7 @@ async def read_index(request: Request):
for filename in os.listdir(partials_dir): for filename in os.listdir(partials_dir):
if filename.endswith(".html"): if filename.endswith(".html"):
with open(os.path.join(partials_dir, filename), "r") as file: with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
partials[filename[:-5]] = file.read() partials[filename[:-5]] = file.read()
return templates.TemplateResponse("index.html", {"request": request, **partials}) return templates.TemplateResponse("index.html", {"request": request, **partials})

13
requirements.crawl.txt Normal file
View File

@@ -0,0 +1,13 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
pydantic
python-dotenv
requests
rich
selenium
uvicorn
chromedriver-autoinstaller

View File

@@ -1,20 +1,20 @@
aiohttp==3.9.5 aiohttp
aiosqlite==0.20.0 aiosqlite
bs4==0.0.2 bs4
fastapi==0.111.0 fastapi
html2text==2024.2.26 html2text
httpx==0.27.0 httpx
litellm==1.37.11 litellm
nltk==3.8.1 nltk
pydantic==2.7.1 pydantic
python-dotenv==1.0.1 python-dotenv
requests==2.31.0 requests
rich==13.7.1 rich
scikit-learn==1.4.2 scikit-learn
selenium==4.20.0 selenium
uvicorn==0.29.0 uvicorn
transformers==4.40.2 transformers
chromedriver-autoinstaller==0.6.4 chromedriver-autoinstaller
torch==2.3.0 torch
onnxruntime==1.14.1 onnxruntime
tokenizers==0.13.2 tokenizers

View File

@@ -7,11 +7,16 @@ from setuptools.command.install import install
with open("requirements.txt") as f: with open("requirements.txt") as f:
requirements = f.read().splitlines() requirements = f.read().splitlines()
# Read the crawl-only requirements from requirements.crawl.txt
with open("requirements.crawl.txt") as f:
requirements_crawl_only = f.read().splitlines()
# Define the requirements for different environments # Define the requirements for different environments
requirements_without_torch = [req for req in requirements if not req.startswith("torch")] requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")] requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")] requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")] requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
# NOTE: requirements_crawl_only is the list read from requirements.crawl.txt
# earlier in this script. It is intentionally NOT rebuilt from `requirements`
# here: filtering the full list only drops torch/transformers/nltk, which would
# discard the file's contents and pull packages such as litellm, scikit-learn,
# onnxruntime and tokenizers into the "crawl" extra.
class CustomInstallCommand(install): class CustomInstallCommand(install):
"""Customized setuptools install command to install spacy without dependencies.""" """Customized setuptools install command to install spacy without dependencies."""
@@ -34,7 +39,7 @@ setup(
extras_require={ extras_require={
"all": requirements, # Include all requirements "all": requirements, # Include all requirements
"colab": requirements_without_torch, # Exclude torch for Colab "colab": requirements_without_torch, # Exclude torch for Colab
"crawl": requirements_without_torch_transformers_nlkt "crawl": requirements_crawl_only, # Include only crawl requirements
}, },
cmdclass={ cmdclass={
'install': CustomInstallCommand, 'install': CustomInstallCommand,