Compare commits
104 Commits
v0.1.0
...
format-inl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f246d19f4 | ||
|
|
413595542a | ||
|
|
42a5da854d | ||
|
|
d1d83a6ef7 | ||
|
|
194050705d | ||
|
|
989f8c91c8 | ||
|
|
edba5fb5e9 | ||
|
|
faa1defa5c | ||
|
|
f7e0cee1b0 | ||
|
|
b3a0edaa6d | ||
|
|
9c34b30723 | ||
|
|
36a5847df5 | ||
|
|
a19379aa58 | ||
|
|
768d048e1c | ||
|
|
94c11a0262 | ||
|
|
649b0bfd02 | ||
|
|
57a00ec677 | ||
|
|
aeb2114170 | ||
|
|
b8d405fddd | ||
|
|
b32013cb97 | ||
|
|
226a62a3c0 | ||
|
|
8e73a482a2 | ||
|
|
0533aeb814 | ||
|
|
aead6de888 | ||
|
|
8d82fd4cfe | ||
|
|
8f44db6499 | ||
|
|
c7553b1280 | ||
|
|
8b8683f22e | ||
|
|
774ace6e3b | ||
|
|
4a8f91a0fc | ||
|
|
18c9784b61 | ||
|
|
e5d401c67c | ||
|
|
ae77589a98 | ||
|
|
ad373c0e19 | ||
|
|
51f26d12fe | ||
|
|
f1b60b2016 | ||
|
|
8c2dc2b1e4 | ||
|
|
dc9a44c12a | ||
|
|
d9753b6349 | ||
|
|
a554c0b143 | ||
|
|
7381fa95e6 | ||
|
|
53d1176d53 | ||
|
|
52c4be0696 | ||
|
|
13a3b21d19 | ||
|
|
5cee084340 | ||
|
|
bf00c26a83 | ||
|
|
3846648c12 | ||
|
|
eb6423875f | ||
|
|
e3524a10a7 | ||
|
|
468dad6169 | ||
|
|
bc27982992 | ||
|
|
57e5decb55 | ||
|
|
b6319c6f6e | ||
|
|
0a902f562f | ||
|
|
454135856e | ||
|
|
33fddc27ad | ||
|
|
ce052a4eb5 | ||
|
|
b43d77a56b | ||
|
|
1635a92218 | ||
|
|
2a8a1b27e1 | ||
|
|
f5f3cce2c8 | ||
|
|
a085e6315b | ||
|
|
6f96dcd649 | ||
|
|
957a2458b1 | ||
|
|
36e46be23d | ||
|
|
32c87f0388 | ||
|
|
647cfda225 | ||
|
|
1cc67df301 | ||
|
|
d7b37e849d | ||
|
|
f52f526002 | ||
|
|
3593f017d7 | ||
|
|
e7bb76f19b | ||
|
|
593b928967 | ||
|
|
bb3d37face | ||
|
|
3f8576f870 | ||
|
|
bf3b040f10 | ||
|
|
a317dc5e1d | ||
|
|
a5f9d07dbf | ||
|
|
f85df91ca6 | ||
|
|
6fcaf26b4f | ||
|
|
5b4a586b2d | ||
|
|
a856319499 | ||
|
|
5ce1dc1622 | ||
|
|
ea16dec587 | ||
|
|
d19488a821 | ||
|
|
199c66114c | ||
|
|
45569d058d | ||
|
|
5bb0b0b378 | ||
|
|
4006f5f4e2 | ||
|
|
7e0682e0de | ||
|
|
8e28eb9efb | ||
|
|
c8589f8da3 | ||
|
|
6a6365ae0a | ||
|
|
5b80be956d | ||
|
|
4a2e17447b | ||
|
|
f6e59157bf | ||
|
|
5fea6c064b | ||
|
|
11393183f7 | ||
|
|
7679064521 | ||
|
|
cf087cfa58 | ||
|
|
5693e324a4 | ||
|
|
b38bf64490 | ||
|
|
82706129f5 | ||
|
|
7039e3c1ee |
BIN
.files/screenshot.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
9
.gitignore
vendored
@@ -171,4 +171,11 @@ test_pad*.py
|
||||
Crawl4AI.egg-info/
|
||||
|
||||
requirements0.txt
|
||||
a.txt
|
||||
a.txt
|
||||
|
||||
*.sh
|
||||
.idea
|
||||
docs/examples/.chainlit/
|
||||
docs/examples/.chainlit/*
|
||||
.chainlit/config.toml
|
||||
.chainlit/translations/en-US.json
|
||||
|
||||
29
CHANGELOG.md
@@ -1,31 +1,10 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.5] - 2024-06-17
|
||||
### Added
|
||||
- 🔧 Separate Crawl and Extract JSON Semantic Chunk: Enhancing flexibility and efficiency in large-scale web crawling tasks.
|
||||
- 🔍 Colab Integration: Exploring integration with Google Colab for easy experimentation in a collaborative notebook environment.
|
||||
- 🎯 XPath and CSS Selector Support: Adding support for selective retrieval of specific elements from web pages.
|
||||
- 📷 Image Captioning: Incorporating image captioning capabilities to extract meaningful descriptions from images.
|
||||
- 💾 Embedding Data Generation and Storage: Developing functionalities to generate and store embedding data for each crawled website.
|
||||
- 🔍 Semantic Search Engine: Building a semantic search engine that fetches content, performs vector search similarity, and generates labeled chunk data based on user queries and URLs.
|
||||
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
|
||||
|
||||
### Changed
|
||||
- None
|
||||
|
||||
### Deprecated
|
||||
- None
|
||||
|
||||
### Removed
|
||||
- None
|
||||
|
||||
## [0.2.4] - 2024-06-17
|
||||
### Fixed
|
||||
- None
|
||||
|
||||
### Security
|
||||
- None
|
||||
|
||||
## [1.0.0] - YYYY-MM-DD
|
||||
- Initial release
|
||||
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||
87
Dockerfile
@@ -1,40 +1,77 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-slim
|
||||
|
||||
# First stage: Build and install dependencies
|
||||
FROM python:3.10-slim-bookworm as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
xvfb \
|
||||
unzip \
|
||||
curl \
|
||||
unzip
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
|
||||
python -m spacy download en_core_web_sm
|
||||
|
||||
# Download and install ChromeDriver
|
||||
RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
|
||||
wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
|
||||
unzip /tmp/chromedriver_linux64.zip -d /tmp && \
|
||||
mv /tmp/chromedriver /usr/local/bin/chromedriver && \
|
||||
chmod +x /usr/local/bin/chromedriver && \
|
||||
rm /tmp/chromedriver_linux64.zip
|
||||
|
||||
# Second stage: Create final runtime image
|
||||
FROM python:3.10-slim-bookworm
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
git \
|
||||
xvfb \
|
||||
gnupg2 \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
software-properties-common && \
|
||||
wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends google-chrome-stable && \
|
||||
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||
# Copy Chromedriver from the builder stage
|
||||
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
|
||||
|
||||
# Copy installed Python packages from builder stage
|
||||
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
|
||||
COPY --from=builder /usr/local/bin /usr/local/bin
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /usr/local/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Define environment variable
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
|
||||
|
||||
|
||||
45
Dockerfile-version-0
Normal file
@@ -0,0 +1,45 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-slim
|
||||
# In case you had some weird issues, try this Image
|
||||
# FROM python:3.10-slim-bookworm as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
xvfb \
|
||||
unzip \
|
||||
curl \
|
||||
gnupg2 \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& mkdir -p /etc/apt/keyrings \
|
||||
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
|
||||
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& apt-get install -y chromium-chromedriver
|
||||
|
||||
# Install Python dependencies
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install spacy torch torchvision torchaudio
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Define environment variable
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
44
Dockerfile_mac
Normal file
@@ -0,0 +1,44 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-slim
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
xvfb \
|
||||
unzip \
|
||||
curl \
|
||||
gnupg2 \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& apt install chromium-chromedriver -y
|
||||
|
||||
# Install spacy library using pip
|
||||
RUN pip install spacy
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Define environment variable
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
697
README.md
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI 🕷️🤖
|
||||
# Crawl4AI v0.2.3 🕷️🤖
|
||||
|
||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||
@@ -6,18 +6,142 @@
|
||||
[](https://github.com/unclecode/crawl4ai/pulls)
|
||||
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||
|
||||
Crawl4AI is a powerful, free web crawling service designed to extract useful information from web pages and make it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||
|
||||
## 🚧 Work in Progress 👷♂️
|
||||
- Use as REST API: Check [](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
|
||||
- Use as Python library: [](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||
|
||||
- 🔧 Separate Crawl and Extract Semantic Chunk: Enhancing efficiency in large-scale tasks.
|
||||
- 🔍 Colab Integration: Exploring integration with Google Colab for easy experimentation.
|
||||
- 🎯 XPath and CSS Selector Support: Adding support for selective retrieval of specific elements.
|
||||
- 📷 Image Captioning: Incorporating image captioning capabilities to extract descriptions from images.
|
||||
- 💾 Embedding Vector Data: Generate and store embedding data for each crawled website.
|
||||
- 🔍 Semantic Search Engine: Building a semantic search engine that fetches content, performs vector search similarity, and generates labeled chunk data based on user queries and URLs.
|
||||
## Recent Changes
|
||||
|
||||
### v0.2.5
|
||||
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
|
||||
|
||||
### v0.2.4
|
||||
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||
|
||||
### v0.2.3
|
||||
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
|
||||
- 🔗 Extrat all external and internal links. Check `result.links`
|
||||
- 📚 Extract metadata from the page. Check `result.metadata`
|
||||
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
|
||||
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
|
||||
|
||||
### v0.2.2
|
||||
- Support multiple JS scripts
|
||||
- Fixed some of bugs
|
||||
- Resolved a few issue relevant to Colab installation
|
||||
|
||||
### v0.2.0
|
||||
- 🚀 10x faster!!
|
||||
- 📜 Execute custom JavaScript before crawling!
|
||||
- 🤝 Colab friendly!
|
||||
- 📚 Chunking strategies: topic-based, regex, sentence, and more!
|
||||
- 🧠 Extraction strategies: cosine clustering, LLM, and more!
|
||||
- 🎯 CSS selector support
|
||||
- 📝 Pass instructions/keywords to refine extraction
|
||||
|
||||
## Power and Simplicity of Crawl4AI 🚀
|
||||
|
||||
The most easy way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand. You can find ll examples of REST API in this colab notebook. [](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"screenshot": True
|
||||
}
|
||||
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
||||
response_data = response.json()
|
||||
print(response_data['results'][0].keys())
|
||||
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
|
||||
# 'links', 'screenshot', 'markdown', 'extracted_content',
|
||||
# 'metadata', 'error_message'])
|
||||
```
|
||||
|
||||
But you muore control then take a look at the first example of using the Python library.
|
||||
|
||||
```python
|
||||
from crawl4ai import WebCrawler
|
||||
|
||||
# Create the WebCrawler instance
|
||||
crawler = WebCrawler()
|
||||
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots}
|
||||
```
|
||||
|
||||
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
|
||||
|
||||
1. Instantiate a WebCrawler object.
|
||||
2. Execute custom JavaScript to click a "Load More" button.
|
||||
3. Extract semantical chunks of content and filter the data to include only content related to technology.
|
||||
4. Use a CSS selector to extract only paragraphs (`<p>` tags).
|
||||
|
||||
```python
|
||||
# Import necessary modules
|
||||
from crawl4ai import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
# Define the JavaScript code to click the "Load More" button
|
||||
js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code,
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="technology",
|
||||
),
|
||||
)
|
||||
|
||||
# Run the crawler with LLM extraction strategy
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract only content related to technology"
|
||||
),
|
||||
css_selector="p"
|
||||
)
|
||||
|
||||
# Display the extracted result
|
||||
print(result)
|
||||
```
|
||||
|
||||
With Crawl4AI, you can perform advanced web crawling and data extraction tasks with just a few lines of code. This example demonstrates how you can harness the power of Crawl4AI to simplify your workflow and get the data you need efficiently.
|
||||
|
||||
---
|
||||
|
||||
*Continue reading to learn more about the features, installation process, usage, and more.*
|
||||
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Features](#features-)
|
||||
2. [Installation](#installation-)
|
||||
3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
|
||||
4. [Python Library Usage](#python-library-usage-)
|
||||
5. [Parameters](#parameters-)
|
||||
6. [Chunking Strategies](#chunking-strategies-)
|
||||
7. [Extraction Strategies](#extraction-strategies-)
|
||||
8. [Contributing](#contributing-)
|
||||
9. [License](#license-)
|
||||
10. [Contact](#contact-)
|
||||
|
||||
For more details, refer to the [CHANGELOG.md](https://github.com/unclecode/crawl4ai/edit/main/CHANGELOG.md) file.
|
||||
|
||||
## Features ✨
|
||||
|
||||
@@ -26,223 +150,405 @@ For more details, refer to the [CHANGELOG.md](https://github.com/unclecode/crawl
|
||||
- 🌍 Supports crawling multiple URLs simultaneously
|
||||
- 🌃 Replace media tags with ALT.
|
||||
- 🆓 Completely free to use and open-source
|
||||
|
||||
## Getting Started 🚀
|
||||
|
||||
To get started with Crawl4AI, simply visit our web application at [https://crawl4ai.uccode.io](https://crawl4ai.uccode.io) (Available now!) and enter the URL(s) you want to crawl. The application will process the URLs and provide you with the extracted data in various formats.
|
||||
- 📜 Execute custom JavaScript before crawling
|
||||
- 📚 Chunking strategies: topic-based, regex, sentence, and more
|
||||
- 🧠 Extraction strategies: cosine clustering, LLM, and more
|
||||
- 🎯 CSS selector support
|
||||
- 📝 Pass instructions/keywords to refine extraction
|
||||
|
||||
## Installation 💻
|
||||
|
||||
There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local server.
|
||||
|
||||
### Using Crawl4AI as a Library 📚
|
||||
There are three ways to use Crawl4AI:
|
||||
1. As a library (Recommended)
|
||||
2. As a local server (Docker) or using the REST API
|
||||
4. As a Google Colab notebook. [](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||
|
||||
To install Crawl4AI as a library, follow these steps:
|
||||
|
||||
1. Install the package from GitHub:
|
||||
```sh
|
||||
pip install git+https://github.com/unclecode/crawl4ai.git
|
||||
```bash
|
||||
virtualenv venv
|
||||
source venv/bin/activate
|
||||
pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
|
||||
```
|
||||
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
```sh
|
||||
💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
|
||||
|
||||
crawl4ai-download-models
|
||||
|
||||
2. Alternatively, you can clone the repository and install the package locally:
|
||||
```bash
|
||||
virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
pip install -e .[all]
|
||||
```
|
||||
|
||||
2. Import the necessary modules in your Python script:
|
||||
3. Use docker to run the local server:
|
||||
```bash
|
||||
# For Mac users
|
||||
# docker build --platform linux/amd64 -t crawl4ai .
|
||||
# For other users
|
||||
# docker build -t crawl4ai .
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Using the Local server ot REST API 🌐
|
||||
|
||||
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl` [Available now, on a CPU server, of course will be faster on GPU]. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
||||
|
||||
### Example Usage
|
||||
|
||||
To use the REST API, send a POST request to `http://localhost:8000/crawl` with the following parameters in the request body.
|
||||
|
||||
**Example Request:**
|
||||
```json
|
||||
{
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"include_raw_html": false,
|
||||
"bypass_cache": true,
|
||||
"word_count_threshold": 5,
|
||||
"extraction_strategy": "CosineStrategy",
|
||||
"chunking_strategy": "RegexChunking",
|
||||
"css_selector": "p",
|
||||
"verbose": true,
|
||||
"extraction_strategy_args": {
|
||||
"semantic_filter": "finance economy and stock market",
|
||||
"word_count_threshold": 20,
|
||||
"max_dist": 0.2,
|
||||
"linkage_method": "ward",
|
||||
"top_k": 3
|
||||
},
|
||||
"chunking_strategy_args": {
|
||||
"patterns": ["\n\n"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example Response:**
|
||||
```json
|
||||
{
|
||||
"status": "success",
|
||||
"data": [
|
||||
{
|
||||
"url": "https://www.nbcnews.com/business",
|
||||
"extracted_content": "...",
|
||||
"html": "...",
|
||||
"cleaned_html": "...",
|
||||
"markdown": "...",
|
||||
"media": {...},
|
||||
"links": {...},
|
||||
"metadata": {...},
|
||||
"screenshots": "...",
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.
|
||||
|
||||
|
||||
## Python Library Usage 🚀
|
||||
|
||||
🔥 A great way to try out Crawl4AI is to run `quickstart.py` in the `docs/examples` directory. This script demonstrates how to use Crawl4AI to crawl a website and extract content from it.
|
||||
|
||||
### Quickstart Guide
|
||||
|
||||
Create an instance of WebCrawler and call the `warmup()` function.
|
||||
```python
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
import os
|
||||
|
||||
crawler = WebCrawler(db_path='crawler_data.db')
|
||||
|
||||
# Single page crawl
|
||||
single_url = UrlModel(url='https://kidocode.com', forced=False)
|
||||
result = crawl4ai.fetch_page(
|
||||
single_url,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
|
||||
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
|
||||
# Without this setting, it will take between 5 to 20 seconds.
|
||||
extract_blocks_flag=False
|
||||
word_count_threshold=5 # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
)
|
||||
print(result.model_dump())
|
||||
|
||||
# Multiple page crawl
|
||||
urls = [
|
||||
UrlModel(url='http://example.com', forced=False),
|
||||
UrlModel(url='http://example.org', forced=False)
|
||||
]
|
||||
results = crawl4ai.fetch_pages(
|
||||
urls,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
extract_blocks_flag=True,
|
||||
word_count_threshold=5
|
||||
)
|
||||
|
||||
for res in results:
|
||||
print(res.model_dump())
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
```
|
||||
|
||||
Running for the first time will download the chrome driver for selenium. Also creates a SQLite database file `crawler_data.db` in the current directory. This file will store the crawled data for future reference.
|
||||
### Understanding 'bypass_cache' and 'include_raw_html' parameters
|
||||
|
||||
The response model is a `CrawlResponse` object that contains the following attributes:
|
||||
First crawl (caches the result):
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
```
|
||||
|
||||
Second crawl (Force to crawl again):
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||
```
|
||||
💡 Don't forget to set `bypass_cache` to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set `always_by_pass_cache` in constructor to True to always bypass the cache.
|
||||
|
||||
Crawl result without raw HTML content:
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||
```
|
||||
|
||||
### Result Structure
|
||||
|
||||
The result object contains the following fields:
|
||||
```python
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
success: bool
|
||||
cleaned_html: str = None
|
||||
markdown: str = None
|
||||
parsed_json: str = None
|
||||
error_message: str = None
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
|
||||
links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
|
||||
screenshot: Optional[str] = None # Base64 encoded screenshot
|
||||
markdown: Optional[str] = None
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
```
|
||||
|
||||
### Running Crawl4AI as a Local Server 🚀
|
||||
|
||||
To run Crawl4AI as a standalone local server, follow these steps:
|
||||
|
||||
1. Clone the repository:
|
||||
```sh
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
```
|
||||
|
||||
2. Navigate to the project directory:
|
||||
```sh
|
||||
cd crawl4ai
|
||||
```
|
||||
|
||||
3. Open `crawler/config.py` and set your favorite LLM provider and API token.
|
||||
|
||||
4. Build the Docker image:
|
||||
```sh
|
||||
docker build -t crawl4ai .
|
||||
```
|
||||
For Mac users, use the following command instead:
|
||||
```sh
|
||||
docker build --platform linux/amd64 -t crawl4ai .
|
||||
```
|
||||
|
||||
5. Run the Docker container:
|
||||
```sh
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
```
|
||||
|
||||
6. Access the application at `http://localhost:8000`.
|
||||
|
||||
- CURL Example:
|
||||
Set the api_token to your OpenAI API key or any other provider you are using.
|
||||
```sh
|
||||
curl -X POST -H "Content-Type: application/json" -d '{"urls":["https://techcrunch.com/"],"provider_model":"openai/gpt-3.5-turbo","api_token":"your_api_token","include_raw_html":true,"forced":false,"extract_blocks_flag":false,"word_count_threshold":10}' http://localhost:8000/crawl
|
||||
```
|
||||
Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks and return them as JSON. Depending on the model and data size, this may take up to 1 minute. Without this setting, it will take between 5 to 20 seconds.
|
||||
|
||||
- Python Example:
|
||||
```python
|
||||
import requests
|
||||
import os
|
||||
|
||||
url = "http://localhost:8000/crawl" # Replace with the appropriate server URL
|
||||
data = {
|
||||
"urls": [
|
||||
"https://example.com"
|
||||
],
|
||||
"provider_model": "groq/llama3-70b-8192",
|
||||
"api_token": "your_api_token",
|
||||
"include_raw_html": true,
|
||||
"forced": false,
|
||||
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
|
||||
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
|
||||
# Without this setting, it will take between 5 to 20 seconds.
|
||||
"extract_blocks_flag": False,
|
||||
"word_count_threshold": 5
|
||||
}
|
||||
|
||||
response = requests.post(url, json=data)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()["results"][0]
|
||||
print("Parsed JSON:")
|
||||
print(result["parsed_json"])
|
||||
print("\nCleaned HTML:")
|
||||
print(result["cleaned_html"])
|
||||
print("\nMarkdown:")
|
||||
print(result["markdown"])
|
||||
else:
|
||||
print("Error:", response.status_code, response.text)
|
||||
```
|
||||
|
||||
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`https://example.com`) and the desired options (`grq_api_token`, `include_raw_html`, and `forced`). The server processes the request and returns the crawled data in JSON format.
|
||||
|
||||
The response from the server includes the parsed JSON, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
|
||||
|
||||
Make sure to replace `"http://localhost:8000/crawl"` with the appropriate server URL if your Crawl4AI server is running on a different host or port.
|
||||
|
||||
Choose the approach that best suits your needs. If you want to integrate Crawl4AI into your existing Python projects, installing it as a library is the way to go. If you prefer to run Crawl4AI as a standalone service and interact with it via API endpoints, running it as a local server using Docker is the recommended approach.
|
||||
|
||||
**Make sure to check the config.py tp set required environment variables.**
|
||||
|
||||
That's it! You can now integrate Crawl4AI into your Python projects and leverage its web crawling capabilities. 🎉
|
||||
|
||||
## 📖 Parameters
|
||||
|
||||
| Parameter | Description | Required | Default Value |
|
||||
|----------------------|-------------------------------------------------------------------------------------------------|----------|---------------|
|
||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||
| `provider_model` | The provider and model to use for extracting relevant information (e.g., "groq/llama3-70b-8192"). | Yes | - |
|
||||
| `api_token` | Your API token for the specified provider. | Yes | - |
|
||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||
| `forced` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||
| `extract_blocks_flag`| Whether to extract semantical blocks of text from the HTML. | No | `false` |
|
||||
| `word_count_threshold` | The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||
|
||||
## 🛠️ Configuration
|
||||
Crawl4AI allows you to configure various parameters and settings in the `crawler/config.py` file. Here's an example of how you can adjust the parameters:
|
||||
### Taking Screenshots
|
||||
|
||||
```python
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
|
||||
# Provider-model dictionary
|
||||
PROVIDER_MODELS = {
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
}
|
||||
|
||||
# Chunk token threshold
|
||||
CHUNK_TOKEN_THRESHOLD = 1000
|
||||
|
||||
# Threshold for the minimum number of words in an HTML tag to be considered
|
||||
MIN_WORD_THRESHOLD = 5
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
```
|
||||
In the `crawler/config.py` file, you can:
|
||||
|
||||
- Set the default provider using the `DEFAULT_PROVIDER` variable.
|
||||
- Add or modify the provider-model dictionary (`PROVIDER_MODELS`) to include your desired providers and their corresponding API keys. Crawl4AI supports various providers such as Groq, OpenAI, Anthropic, and more. You can add any provider supported by LiteLLM, as well as Ollama.
|
||||
- Adjust the `CHUNK_TOKEN_THRESHOLD` value to control the splitting of web content into chunks for parallel processing. A higher value means fewer chunks and faster processing, but it may cause issues with weaker LLMs during extraction.
|
||||
- Modify the `MIN_WORD_THRESHOLD` value to set the minimum number of words an HTML tag must contain to be considered a meaningful block.
|
||||
### Adding a chunking strategy: RegexChunking
|
||||
|
||||
Make sure to set the appropriate API keys for each provider in the `PROVIDER_MODELS` dictionary. You can either directly provide the API key or use environment variables to store them securely.
|
||||
Using RegexChunking:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)
|
||||
```
|
||||
|
||||
Remember to update the `crawler/config.py` file based on your specific requirements and the providers you want to use with Crawl4AI.
|
||||
Using NlpSentenceChunking:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)
|
||||
```
|
||||
|
||||
### Extraction strategy: CosineStrategy
|
||||
|
||||
So far, the extracted content is just the result of chunking. To extract meaningful content, you can use extraction strategies. These strategies cluster consecutive chunks into meaningful blocks, keeping the same order as the text in the HTML. This approach is perfect for use in RAG applications and semantical search queries.
|
||||
|
||||
Using CosineStrategy:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="",
|
||||
word_count_threshold=10,
|
||||
max_dist=0.2,
|
||||
linkage_method="ward",
|
||||
top_k=3
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
You can set `semantic_filter` to filter relevant documents before clustering. Documents are filtered based on their cosine similarity to the keyword filter embedding.
|
||||
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="finance economy and stock market",
|
||||
word_count_threshold=10,
|
||||
max_dist=0.2,
|
||||
linkage_method="ward",
|
||||
top_k=3
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Using LLMExtractionStrategy
|
||||
|
||||
Without instructions:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY')
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
With instructions:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Targeted extraction using CSS selector
|
||||
|
||||
Extract only H2 tags:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)
|
||||
```
|
||||
|
||||
### Passing JavaScript code to click 'Load More' button
|
||||
|
||||
Using JavaScript to click 'Load More' button:
|
||||
```python
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
```
|
||||
|
||||
## Parameters 📖
|
||||
|
||||
| Parameter | Description | Required | Default Value |
|
||||
|-----------------------|-------------------------------------------------------------------------------------------------------|----------|---------------------|
|
||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||
| `screenshots` | Whether to take screenshots of the page. | No | `false` |
|
||||
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
||||
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
|
||||
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
||||
|
||||
## Chunking Strategies 📚
|
||||
|
||||
### RegexChunking
|
||||
|
||||
`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions. This is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\n\n']`).
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
|
||||
```
|
||||
|
||||
### NlpSentenceChunking
|
||||
|
||||
`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- None.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = NlpSentenceChunking()
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
|
||||
```
|
||||
|
||||
### TopicSegmentationChunking
|
||||
|
||||
`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = TopicSegmentationChunking(num_keywords=3)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
|
||||
```
|
||||
|
||||
### FixedLengthWordChunking
|
||||
|
||||
`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = FixedLengthWordChunking(chunk_size=100)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
|
||||
```
|
||||
|
||||
### SlidingWindowChunking
|
||||
|
||||
`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `window_size` (int, optional): The number of words in each chunk. Default is `100`.
|
||||
- `step` (int, optional): The number of words to slide the window. Default is `50`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = SlidingWindowChunking(window_size=100, step=50)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
|
||||
```
|
||||
|
||||
## Extraction Strategies 🧠
|
||||
|
||||
### NoExtractionStrategy
|
||||
|
||||
`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required.
|
||||
|
||||
**Constructor Parameters:**
|
||||
None.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = NoExtractionStrategy()
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### LLMExtractionStrategy
|
||||
|
||||
`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (e.g., openai/gpt-4).
|
||||
- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.
|
||||
- `instruction` (str, optional): An instruction to guide the LLM on how to perform the extraction. This allows users to specify the type of data they are interested in or set the tone of the response. Default is `None`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### CosineStrategy
|
||||
|
||||
`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `semantic_filter` (str, optional): A string containing keywords for filtering relevant documents before clustering. If provided, documents are filtered based on their cosine similarity to the keyword filter embedding. Default is `None`.
|
||||
- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.
|
||||
- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.
|
||||
- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.
|
||||
- `top_k` (int, optional): Number of top categories to extract. Default is `3`.
|
||||
- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = CosineStrategy(semantic_filter='finance rental prices', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### TopicExtractionStrategy
|
||||
|
||||
`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = TopicExtractionStrategy(num_keywords=3)
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
## Contributing 🤝
|
||||
|
||||
@@ -266,5 +572,6 @@ If you have any questions, suggestions, or feedback, please feel free to reach o
|
||||
|
||||
- GitHub: [unclecode](https://github.com/unclecode)
|
||||
- Twitter: [@unclecode](https://twitter.com/unclecode)
|
||||
- Website: [crawl4ai.com](https://crawl4ai.com)
|
||||
|
||||
Let's work together to make the web more accessible and useful for AI applications! 💪🌐🤖
|
||||
|
||||
105
crawl4ai/chunking_strategy.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import re
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_nltk_punkt
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def chunk(self, text: str) -> list:
|
||||
"""
|
||||
Abstract method to chunk the given text.
|
||||
"""
|
||||
pass
|
||||
|
||||
# Regex-based chunking
|
||||
class RegexChunking(ChunkingStrategy):
|
||||
def __init__(self, patterns=None, **kwargs):
|
||||
if patterns is None:
|
||||
patterns = [r'\n\n'] # Default split pattern
|
||||
self.patterns = patterns
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
paragraphs = [text]
|
||||
for pattern in self.patterns:
|
||||
new_paragraphs = []
|
||||
for paragraph in paragraphs:
|
||||
new_paragraphs.extend(re.split(pattern, paragraph))
|
||||
paragraphs = new_paragraphs
|
||||
return paragraphs
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
def __init__(self, **kwargs):
|
||||
load_nltk_punkt()
|
||||
pass
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
# Improved regex for sentence splitting
|
||||
# sentence_endings = re.compile(
|
||||
# r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
|
||||
# )
|
||||
# sentences = sentence_endings.split(text)
|
||||
# sens = [sent.strip() for sent in sentences if sent]
|
||||
from nltk.tokenize import sent_tokenize
|
||||
sentences = sent_tokenize(text)
|
||||
sens = [sent.strip() for sent in sentences]
|
||||
|
||||
return list(set(sens))
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
# Use the TextTilingTokenizer to segment the text
|
||||
segmented_topics = self.tokenizer.tokenize(text)
|
||||
return segmented_topics
|
||||
|
||||
def extract_keywords(self, text: str) -> list:
|
||||
# Tokenize and remove stopwords and punctuation
|
||||
import nltk as nl
|
||||
tokens = nl.toknize.word_tokenize(text)
|
||||
tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation]
|
||||
|
||||
# Calculate frequency distribution
|
||||
freq_dist = Counter(tokens)
|
||||
keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
|
||||
return keywords
|
||||
|
||||
def chunk_with_topics(self, text: str) -> list:
|
||||
# Segment the text into topics
|
||||
segments = self.chunk(text)
|
||||
# Extract keywords for each topic segment
|
||||
segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
|
||||
return segments_with_topics
|
||||
|
||||
# Fixed-length word chunks
|
||||
class FixedLengthWordChunking(ChunkingStrategy):
|
||||
def __init__(self, chunk_size=100, **kwargs):
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
words = text.split()
|
||||
return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
|
||||
|
||||
# Sliding window chunking
|
||||
class SlidingWindowChunking(ChunkingStrategy):
|
||||
def __init__(self, window_size=100, step=50, **kwargs):
|
||||
self.window_size = window_size
|
||||
self.step = step
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
words = text.split()
|
||||
chunks = []
|
||||
for i in range(0, len(words), self.step):
|
||||
chunks.append(' '.join(words[i:i + self.window_size]))
|
||||
return chunks
|
||||
|
||||
|
||||
@@ -3,15 +3,17 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
|
||||
# Provider-model dictionary
|
||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
|
||||
218
crawl4ai/crawler_strategy.py
Normal file
@@ -0,0 +1,218 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
import logging
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
from typing import List
|
||||
import requests
|
||||
import os
|
||||
from pathlib import Path
|
||||
from .utils import wrap_text
|
||||
|
||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||
logger.setLevel(logging.WARNING)
|
||||
|
||||
logger_driver = logging.getLogger('selenium.webdriver.common.service')
|
||||
logger_driver.setLevel(logging.WARNING)
|
||||
|
||||
urllib3_logger = logging.getLogger('urllib3.connectionpool')
|
||||
urllib3_logger.setLevel(logging.WARNING)
|
||||
|
||||
# Disable http.client logging
|
||||
http_client_logger = logging.getLogger('http.client')
|
||||
http_client_logger.setLevel(logging.WARNING)
|
||||
|
||||
# Disable driver_finder and service logging
|
||||
driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
|
||||
driver_finder_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
|
||||
|
||||
class CrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
def crawl(self, url: str, **kwargs) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def take_screenshot(self, save_path: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def update_user_agent(self, user_agent: str):
|
||||
pass
|
||||
|
||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html = False):
|
||||
super().__init__()
|
||||
self.use_cached_html = use_cached_html
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
data = {
|
||||
"urls": [url],
|
||||
"include_raw_html": True,
|
||||
"forced": True,
|
||||
"extract_blocks": False,
|
||||
}
|
||||
|
||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||
response = response.json()
|
||||
html = response["results"][0]["html"]
|
||||
return html
|
||||
|
||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||
super().__init__()
|
||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
if kwargs.get("user_agent"):
|
||||
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.options.add_argument("--headless")
|
||||
# self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-gpu")
|
||||
# self.options.add_argument("--disable-extensions")
|
||||
# self.options.add_argument("--disable-infobars")
|
||||
# self.options.add_argument("--disable-logging")
|
||||
# self.options.add_argument("--disable-popup-blocking")
|
||||
# self.options.add_argument("--disable-translate")
|
||||
# self.options.add_argument("--disable-default-apps")
|
||||
# self.options.add_argument("--disable-background-networking")
|
||||
# self.options.add_argument("--disable-sync")
|
||||
# self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
|
||||
# self.options.add_argument("--disable-browser-side-navigation")
|
||||
# self.options.add_argument("--dns-prefetch-disable")
|
||||
# self.options.add_argument("--disable-web-security")
|
||||
self.options.add_argument("--log-level=3")
|
||||
self.use_cached_html = use_cached_html
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
self.service = Service(chromedriver_autoinstaller.install())
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def update_user_agent(self, user_agent: str):
|
||||
self.options.add_argument(f"user-agent={user_agent}")
|
||||
self.driver.quit()
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
# Create md5 hash of the URL
|
||||
import hashlib
|
||||
url_hash = hashlib.md5(url.encode()).hexdigest()
|
||||
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
if os.path.exists(cache_file_path):
|
||||
with open(cache_file_path, "r") as f:
|
||||
return f.read()
|
||||
|
||||
try:
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
|
||||
# Execute JS code if provided
|
||||
if self.js_code and type(self.js_code) == str:
|
||||
self.driver.execute_script(self.js_code)
|
||||
# Optionally, wait for some condition after executing the JS code
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
lambda driver: driver.execute_script("return document.readyState") == "complete"
|
||||
)
|
||||
elif self.js_code and type(self.js_code) == list:
|
||||
for js in self.js_code:
|
||||
self.driver.execute_script(js)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
lambda driver: driver.execute_script("return document.readyState") == "complete"
|
||||
)
|
||||
|
||||
html = self.driver.page_source
|
||||
|
||||
# Store in cache
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
with open(cache_file_path, "w") as f:
|
||||
f.write(html)
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||
|
||||
def take_screenshot(self) -> str:
|
||||
try:
|
||||
# Get the dimensions of the page
|
||||
total_width = self.driver.execute_script("return document.body.scrollWidth")
|
||||
total_height = self.driver.execute_script("return document.body.scrollHeight")
|
||||
|
||||
# Set the window size to the dimensions of the page
|
||||
self.driver.set_window_size(total_width, total_height)
|
||||
|
||||
# Take screenshot
|
||||
screenshot = self.driver.get_screenshot_as_png()
|
||||
|
||||
# Open the screenshot with PIL
|
||||
image = Image.open(BytesIO(screenshot))
|
||||
|
||||
# Convert to JPEG and compress
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="JPEG", quality=85)
|
||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||
|
||||
return img_base64
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Failed to take screenshot: {str(e)}"
|
||||
print(error_message)
|
||||
|
||||
# Generate an image with black background
|
||||
img = Image.new('RGB', (800, 600), color='black')
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Load a font
|
||||
try:
|
||||
font = ImageFont.truetype("arial.ttf", 40)
|
||||
except IOError:
|
||||
font = ImageFont.load_default(size=40)
|
||||
|
||||
# Define text color and wrap the text
|
||||
text_color = (255, 255, 255)
|
||||
max_width = 780
|
||||
wrapped_text = wrap_text(draw, error_message, font, max_width)
|
||||
|
||||
# Calculate text position
|
||||
text_position = (10, 10)
|
||||
|
||||
# Draw the text on the image
|
||||
draw.text(text_position, wrapped_text, fill=text_color, font=font)
|
||||
|
||||
# Convert to base64
|
||||
buffered = BytesIO()
|
||||
img.save(buffered, format="JPEG")
|
||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
||||
return img_base64
|
||||
|
||||
def quit(self):
|
||||
self.driver.quit()
|
||||
@@ -1,8 +1,15 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
def init_db(db_path: str):
|
||||
conn = sqlite3.connect(db_path)
|
||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(DB_PATH, exist_ok=True)
|
||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||
|
||||
def init_db():
|
||||
global DB_PATH
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS crawled_data (
|
||||
@@ -10,52 +17,116 @@ def init_db(db_path: str):
|
||||
html TEXT,
|
||||
cleaned_html TEXT,
|
||||
markdown TEXT,
|
||||
parsed_json TEXT,
|
||||
success BOOLEAN
|
||||
extracted_content TEXT,
|
||||
success BOOLEAN,
|
||||
media TEXT DEFAULT "{}",
|
||||
link TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT ""
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
|
||||
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
parsed_json = excluded.parsed_json,
|
||||
success = excluded.success
|
||||
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def get_total_count(db_path: str) -> int:
|
||||
def alter_db_add_screenshot(new_column: str = "media"):
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error altering database to add screenshot column: {e}")
|
||||
|
||||
def check_db_path():
|
||||
if not DB_PATH:
|
||||
raise ValueError("Database path is not set or is empty.")
|
||||
|
||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
extracted_content = excluded.extracted_content,
|
||||
success = excluded.success,
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error caching URL: {e}")
|
||||
|
||||
def get_total_count() -> int:
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT COUNT(*) FROM crawled_data')
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result[0]
|
||||
except Exception as e:
|
||||
print(f"Error getting total count: {e}")
|
||||
return 0
|
||||
|
||||
# Crete function to cler the database
|
||||
def clear_db(db_path: str):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM crawled_data')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def clear_db():
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM crawled_data')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error clearing database: {e}")
|
||||
|
||||
def flush_db():
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DROP TABLE crawled_data')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error flushing database: {e}")
|
||||
|
||||
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error updating existing records: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
init_db() # Initialize the database if not already initialized
|
||||
alter_db_add_screenshot("metadata") # Add the new column to the table
|
||||
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
||||
|
||||
553
crawl4ai/extraction_strategy.py
Normal file
@@ -0,0 +1,553 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Dict, Optional, Union
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import json, time
|
||||
# from optimum.intel import IPEXModel
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
||||
from .config import *
|
||||
from .utils import *
|
||||
from functools import partial
|
||||
from .model_loader import *
|
||||
|
||||
|
||||
import numpy as np
|
||||
class ExtractionStrategy(ABC):
|
||||
"""
|
||||
Abstract base class for all extraction strategies.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.DEL = "<|DEL|>"
|
||||
self.name = self.__class__.__name__
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
@abstractmethod
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract meaningful blocks or chunks from the given HTML.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param html: The HTML content of the webpage.
|
||||
:return: A list of extracted blocks or chunks.
|
||||
"""
|
||||
pass
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections of text in parallel by default.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to process.
|
||||
:return: A list of processed JSON blocks.
|
||||
"""
|
||||
extracted_content = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
return [{"index": 0, "content": html}]
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
|
||||
|
||||
class LLMExtractionStrategy(ExtractionStrategy):
|
||||
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
|
||||
:param provider: The provider to use for extraction.
|
||||
:param api_token: The API token for the provider.
|
||||
:param instruction: The instruction to use for the LLM model.
|
||||
"""
|
||||
super().__init__()
|
||||
self.provider = provider
|
||||
self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
|
||||
self.instruction = instruction
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
if not self.api_token:
|
||||
raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
|
||||
|
||||
|
||||
def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
|
||||
# print("[LOG] Extracting blocks from URL:", url)
|
||||
print(f"[LOG] Call LLM for {url} - block index: {ix}")
|
||||
variable_values = {
|
||||
"URL": url,
|
||||
"HTML": escape_json_string(sanitize_html(html)),
|
||||
}
|
||||
|
||||
if self.instruction:
|
||||
variable_values["REQUEST"] = self.instruction
|
||||
|
||||
prompt_with_variables = PROMPT_EXTRACT_BLOCKS if not self.instruction else PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
||||
for variable in variable_values:
|
||||
prompt_with_variables = prompt_with_variables.replace(
|
||||
"{" + variable + "}", variable_values[variable]
|
||||
)
|
||||
|
||||
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token)
|
||||
try:
|
||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||
blocks = json.loads(blocks)
|
||||
for block in blocks:
|
||||
block['error'] = False
|
||||
except Exception as e:
|
||||
print("Error extracting blocks:", str(e))
|
||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||
blocks = parsed
|
||||
if unparsed:
|
||||
blocks.append({
|
||||
"index": 0,
|
||||
"error": True,
|
||||
"tags": ["error"],
|
||||
"content": unparsed
|
||||
})
|
||||
|
||||
if self.verbose:
|
||||
print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
|
||||
return blocks
|
||||
|
||||
def _merge(self, documents):
|
||||
chunks = []
|
||||
sections = []
|
||||
total_token_so_far = 0
|
||||
|
||||
for document in documents:
|
||||
if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
|
||||
chunk = document.split(' ')
|
||||
total_token_so_far += len(chunk) * 1.3
|
||||
chunks.append(document)
|
||||
else:
|
||||
sections.append('\n\n'.join(chunks))
|
||||
chunks = [document]
|
||||
total_token_so_far = len(document.split(' ')) * 1.3
|
||||
|
||||
if chunks:
|
||||
sections.append('\n\n'.join(chunks))
|
||||
|
||||
return sections
|
||||
|
||||
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
|
||||
"""
|
||||
|
||||
merged_sections = self._merge(sections)
|
||||
extracted_content = []
|
||||
if self.provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for ix, section in enumerate(merged_sections):
|
||||
extract_func = partial(self.extract, url)
|
||||
extracted_content.extend(extract_func(ix, section))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||
extract_func = partial(self.extract, url)
|
||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
||||
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
|
||||
|
||||
return extracted_content
|
||||
|
||||
class CosineStrategy(ExtractionStrategy):
|
||||
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
|
||||
:param semantic_filter: A keyword filter for document filtering.
|
||||
:param word_count_threshold: Minimum number of words per cluster.
|
||||
:param max_dist: The maximum cophenetic distance on the dendrogram to form clusters.
|
||||
:param linkage_method: The linkage method for hierarchical clustering.
|
||||
:param top_k: Number of top categories to extract.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.semantic_filter = semantic_filter
|
||||
self.word_count_threshold = word_count_threshold
|
||||
self.max_dist = max_dist
|
||||
self.linkage_method = linkage_method
|
||||
self.top_k = top_k
|
||||
self.sim_threshold = sim_threshold
|
||||
self.timer = time.time()
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
self.buffer_embeddings = np.array([])
|
||||
self.get_embedding_method = "direct"
|
||||
|
||||
self.device = get_device()
|
||||
self.default_batch_size = calculate_batch_size(self.device)
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
|
||||
|
||||
# if False and self.device.type == "cpu":
|
||||
# self.model = load_onnx_all_MiniLM_l6_v2()
|
||||
# self.tokenizer = self.model.tokenizer
|
||||
# self.get_embedding_method = "direct"
|
||||
# else:
|
||||
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
self.model.eval()
|
||||
self.get_embedding_method = "batch"
|
||||
|
||||
self.buffer_embeddings = np.array([])
|
||||
|
||||
# if model_name == "bert-base-uncased":
|
||||
# self.tokenizer, self.model = load_bert_base_uncased()
|
||||
# self.model.eval() # Ensure the model is in evaluation mode
|
||||
# self.get_embedding_method = "batch"
|
||||
# elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
# self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
# self.model.eval() # Ensure the model is in evaluation mode
|
||||
# self.get_embedding_method = "batch"
|
||||
# elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
|
||||
# self.model = load_onnx_all_MiniLM_l6_v2()
|
||||
# self.tokenizer = self.model.tokenizer
|
||||
# self.get_embedding_method = "direct"
|
||||
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
|
||||
|
||||
self.nlp, self.device = load_text_multilabel_classifier()
|
||||
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
|
||||
"""
|
||||
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
|
||||
:param documents: List of text chunks (documents).
|
||||
:param semantic_filter: A string containing the keywords for filtering.
|
||||
:param threshold: Cosine similarity threshold for filtering documents.
|
||||
:param at_least_k: Minimum number of documents to return.
|
||||
:return: List of filtered documents, ensuring at least `at_least_k` documents.
|
||||
"""
|
||||
|
||||
if not semantic_filter:
|
||||
return documents
|
||||
|
||||
if len(documents) < at_least_k:
|
||||
at_least_k = len(documents) // 2
|
||||
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
# Compute embedding for the keyword filter
|
||||
query_embedding = self.get_embeddings([semantic_filter])[0]
|
||||
|
||||
# Compute embeddings for the documents
|
||||
document_embeddings = self.get_embeddings(documents)
|
||||
|
||||
# Calculate cosine similarity between the query embedding and document embeddings
|
||||
similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
|
||||
|
||||
# Filter documents based on the similarity threshold
|
||||
filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
|
||||
|
||||
# If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
|
||||
if len(filtered_docs) < at_least_k:
|
||||
remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
|
||||
remaining_docs.sort(key=lambda x: x[1], reverse=True)
|
||||
filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
|
||||
|
||||
# Extract the document texts from the tuples
|
||||
filtered_docs = [doc for doc, _ in filtered_docs]
|
||||
|
||||
return filtered_docs[:at_least_k]
|
||||
|
||||
def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
|
||||
"""
|
||||
Get BERT embeddings for a list of sentences.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of embeddings.
|
||||
"""
|
||||
# if self.buffer_embeddings.any() and not bypass_buffer:
|
||||
# return self.buffer_embeddings
|
||||
|
||||
if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
|
||||
import torch
|
||||
# Tokenize sentences and convert to tensor
|
||||
if batch_size is None:
|
||||
batch_size = self.default_batch_size
|
||||
|
||||
all_embeddings = []
|
||||
for i in range(0, len(sentences), batch_size):
|
||||
batch_sentences = sentences[i:i + batch_size]
|
||||
encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
|
||||
encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
|
||||
|
||||
# Ensure no gradients are calculated
|
||||
with torch.no_grad():
|
||||
model_output = self.model(**encoded_input)
|
||||
|
||||
# Get embeddings from the last hidden state (mean pooling)
|
||||
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
|
||||
all_embeddings.append(embeddings)
|
||||
|
||||
self.buffer_embeddings = np.vstack(all_embeddings)
|
||||
elif self.device.type == "cpu":
|
||||
# self.buffer_embeddings = self.model(sentences)
|
||||
if batch_size is None:
|
||||
batch_size = self.default_batch_size
|
||||
|
||||
all_embeddings = []
|
||||
for i in range(0, len(sentences), batch_size):
|
||||
batch_sentences = sentences[i:i + batch_size]
|
||||
embeddings = self.model(batch_sentences)
|
||||
all_embeddings.append(embeddings)
|
||||
|
||||
self.buffer_embeddings = np.vstack(all_embeddings)
|
||||
return self.buffer_embeddings
|
||||
|
||||
def hierarchical_clustering(self, sentences: List[str], embeddings = None):
|
||||
"""
|
||||
Perform hierarchical clustering on sentences and return cluster labels.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of cluster labels.
|
||||
"""
|
||||
# Get embeddings
|
||||
from scipy.cluster.hierarchy import linkage, fcluster
|
||||
from scipy.spatial.distance import pdist
|
||||
self.timer = time.time()
|
||||
embeddings = self.get_embeddings(sentences, bypass_buffer=True)
|
||||
# print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
|
||||
# Compute pairwise cosine distances
|
||||
distance_matrix = pdist(embeddings, 'cosine')
|
||||
# Perform agglomerative clustering respecting order
|
||||
linked = linkage(distance_matrix, method=self.linkage_method)
|
||||
# Form flat clusters
|
||||
labels = fcluster(linked, self.max_dist, criterion='distance')
|
||||
return labels
|
||||
|
||||
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]):
|
||||
"""
|
||||
Filter clusters to remove those with a word count below the threshold.
|
||||
|
||||
:param clusters: Dictionary of clusters.
|
||||
:return: Filtered dictionary of clusters.
|
||||
"""
|
||||
filtered_clusters = {}
|
||||
for cluster_id, texts in clusters.items():
|
||||
# Concatenate texts for analysis
|
||||
full_text = " ".join(texts)
|
||||
# Count words
|
||||
word_count = len(full_text.split())
|
||||
|
||||
# Keep clusters with word count above the threshold
|
||||
if word_count >= self.word_count_threshold:
|
||||
filtered_clusters[cluster_id] = texts
|
||||
|
||||
return filtered_clusters
|
||||
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract clusters from HTML content using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param html: The HTML content of the webpage.
|
||||
:return: A list of dictionaries representing the clusters.
|
||||
"""
|
||||
# Assume `html` is a list of text chunks for this strategy
|
||||
t = time.time()
|
||||
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
|
||||
|
||||
# Pre-filter documents using embeddings and semantic_filter
|
||||
text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
|
||||
|
||||
if not text_chunks:
|
||||
return []
|
||||
|
||||
# Perform clustering
|
||||
labels = self.hierarchical_clustering(text_chunks)
|
||||
# print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
|
||||
|
||||
# Organize texts by their cluster labels, retaining order
|
||||
t = time.time()
|
||||
clusters = {}
|
||||
for index, label in enumerate(labels):
|
||||
clusters.setdefault(label, []).append(text_chunks[index])
|
||||
|
||||
# Filter clusters by word count
|
||||
filtered_clusters = self.filter_clusters_by_word_count(clusters)
|
||||
|
||||
# Convert filtered clusters to a sorted list of dictionaries
|
||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Assign tags using {self.device}")
|
||||
|
||||
if self.device.type in ["gpu", "cuda", "mps"]:
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
|
||||
for cluster, label in zip(cluster_list, labels):
|
||||
cluster['tags'] = label
|
||||
elif self.device == "cpu":
|
||||
# Process the text with the loaded model
|
||||
texts = [cluster['content'] for cluster in cluster_list]
|
||||
# Batch process texts
|
||||
docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
|
||||
|
||||
for doc, cluster in zip(docs, cluster_list):
|
||||
tok_k = self.top_k
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# for cluster in cluster_list:
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
|
||||
return cluster_list
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to process.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of processed JSON blocks.
|
||||
"""
|
||||
# This strategy processes all sections together
|
||||
|
||||
return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
class TopicExtractionStrategy(ExtractionStrategy):
|
||||
def __init__(self, num_keywords: int = 3, **kwargs):
|
||||
"""
|
||||
Initialize the topic extraction strategy with parameters for topic segmentation.
|
||||
|
||||
:param num_keywords: Number of keywords to represent each topic segment.
|
||||
"""
|
||||
import nltk
|
||||
super().__init__()
|
||||
self.num_keywords = num_keywords
|
||||
self.tokenizer = nltk.TextTilingTokenizer()
|
||||
|
||||
def extract_keywords(self, text: str) -> List[str]:
|
||||
"""
|
||||
Extract keywords from a given text segment using simple frequency analysis.
|
||||
|
||||
:param text: The text segment from which to extract keywords.
|
||||
:return: A list of keyword strings.
|
||||
"""
|
||||
import nltk
|
||||
# Tokenize the text and compute word frequency
|
||||
words = nltk.word_tokenize(text)
|
||||
freq_dist = nltk.FreqDist(words)
|
||||
# Get the most common words as keywords
|
||||
keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)]
|
||||
return keywords
|
||||
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract topics from HTML content using TextTiling for segmentation and keyword extraction.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param html: The HTML content of the webpage.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of dictionaries representing the topics.
|
||||
"""
|
||||
# Use TextTiling to segment the text into topics
|
||||
segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed
|
||||
|
||||
# Prepare the output as a list of dictionaries
|
||||
topic_list = []
|
||||
for i, segment in enumerate(segmented_topics):
|
||||
# Extract keywords for each segment
|
||||
keywords = self.extract_keywords(segment)
|
||||
topic_list.append({
|
||||
"index": i,
|
||||
"content": segment,
|
||||
"keywords": keywords
|
||||
})
|
||||
|
||||
return topic_list
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections using topic segmentation and keyword extraction.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to process.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of processed JSON blocks.
|
||||
"""
|
||||
# Concatenate sections into a single text for coherent topic segmentation
|
||||
|
||||
|
||||
return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
class ContentSummarizationStrategy(ExtractionStrategy):
|
||||
def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
|
||||
"""
|
||||
Initialize the content summarization strategy with a specific model.
|
||||
|
||||
:param model_name: The model to use for summarization.
|
||||
"""
|
||||
from transformers import pipeline
|
||||
self.summarizer = pipeline("summarization", model=model_name)
|
||||
|
||||
def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Summarize a single section of text.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param text: A section of text to summarize.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A dictionary with the summary.
|
||||
"""
|
||||
try:
|
||||
summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
|
||||
return {"summary": summary[0]['summary_text']}
|
||||
except Exception as e:
|
||||
print(f"Error summarizing text: {e}")
|
||||
return {"summary": text} # Fallback to original text if summarization fails
|
||||
|
||||
def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process each section in parallel to produce summaries.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to summarize.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of dictionaries with summaries for each section.
|
||||
"""
|
||||
# Use a ThreadPoolExecutor to summarize in parallel
|
||||
summaries = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
# Create a future for each section's summarization
|
||||
future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
|
||||
for future in as_completed(future_to_section):
|
||||
section_index = future_to_section[future]
|
||||
try:
|
||||
summary_result = future.result()
|
||||
summaries.append((section_index, summary_result))
|
||||
except Exception as e:
|
||||
print(f"Error processing section {section_index}: {e}")
|
||||
summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text
|
||||
|
||||
# Sort summaries by the original section index to maintain order
|
||||
summaries.sort(key=lambda x: x[0])
|
||||
return [summary for _, summary in summaries]
|
||||
279
crawl4ai/model_loader.py
Normal file
@@ -0,0 +1,279 @@
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
import subprocess, os
|
||||
import shutil
|
||||
import tarfile
|
||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||
import argparse
|
||||
import urllib.request
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
@lru_cache()
|
||||
def get_available_memory(device):
|
||||
import torch
|
||||
if device.type == 'cuda':
|
||||
return torch.cuda.get_device_properties(device).total_memory
|
||||
elif device.type == 'mps':
|
||||
return 48 * 1024 ** 3 # Assuming 8GB for MPS, as a conservative estimate
|
||||
else:
|
||||
return 0
|
||||
|
||||
@lru_cache()
|
||||
def calculate_batch_size(device):
|
||||
available_memory = get_available_memory(device)
|
||||
|
||||
if device.type == 'cpu':
|
||||
return 16
|
||||
elif device.type in ['cuda', 'mps']:
|
||||
# Adjust these thresholds based on your model size and available memory
|
||||
if available_memory >= 31 * 1024 ** 3: # > 32GB
|
||||
return 256
|
||||
elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
|
||||
return 128
|
||||
elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
|
||||
return 64
|
||||
else:
|
||||
return 32
|
||||
else:
|
||||
return 16 # Default batch size
|
||||
|
||||
@lru_cache()
|
||||
def get_device():
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device('cuda')
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device('mps')
|
||||
else:
|
||||
device = torch.device('cpu')
|
||||
return device
|
||||
|
||||
def set_model_device(model):
|
||||
device = get_device()
|
||||
model.to(device)
|
||||
return model, device
|
||||
|
||||
@lru_cache()
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
return home_folder
|
||||
|
||||
@lru_cache()
|
||||
def load_bert_base_uncased():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_bge_small_en_v1_5():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_onnx_all_MiniLM_l6_v2():
|
||||
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
|
||||
|
||||
model_path = "models/onnx.tar.gz"
|
||||
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
|
||||
__location__ = os.path.realpath(
|
||||
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
download_path = os.path.join(__location__, model_path)
|
||||
onnx_dir = os.path.join(__location__, "models/onnx")
|
||||
|
||||
# Create the models directory if it does not exist
|
||||
os.makedirs(os.path.dirname(download_path), exist_ok=True)
|
||||
|
||||
# Download the tar.gz file if it does not exist
|
||||
if not os.path.exists(download_path):
|
||||
def download_with_progress(url, filename):
|
||||
def reporthook(block_num, block_size, total_size):
|
||||
downloaded = block_num * block_size
|
||||
percentage = 100 * downloaded / total_size
|
||||
if downloaded < total_size:
|
||||
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
|
||||
else:
|
||||
print("\rDownload complete!")
|
||||
|
||||
urllib.request.urlretrieve(url, filename, reporthook)
|
||||
|
||||
download_with_progress(model_url, download_path)
|
||||
|
||||
# Extract the tar.gz file if the onnx directory does not exist
|
||||
if not os.path.exists(onnx_dir):
|
||||
with tarfile.open(download_path, "r:gz") as tar:
|
||||
tar.extractall(path=os.path.join(__location__, "models"))
|
||||
|
||||
# remove the tar.gz file
|
||||
os.remove(download_path)
|
||||
|
||||
|
||||
|
||||
model = DefaultEmbeddingModel()
|
||||
return model
|
||||
|
||||
@lru_cache()
|
||||
def load_text_classifier():
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from transformers import pipeline
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
return pipe
|
||||
|
||||
@lru_cache()
|
||||
def load_text_multilabel_classifier():
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
import numpy as np
|
||||
from scipy.special import expit
|
||||
import torch
|
||||
|
||||
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda")
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device("mps")
|
||||
else:
|
||||
return load_spacy_model(), torch.device("cpu")
|
||||
|
||||
|
||||
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
class_mapping = model.config.id2label
|
||||
|
||||
def _classifier(texts, threshold=0.5, max_length=64):
|
||||
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
|
||||
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
|
||||
|
||||
with torch.no_grad():
|
||||
output = model(**tokens)
|
||||
|
||||
scores = output.logits.detach().cpu().numpy()
|
||||
scores = expit(scores)
|
||||
predictions = (scores >= threshold) * 1
|
||||
|
||||
batch_labels = []
|
||||
for prediction in predictions:
|
||||
labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
|
||||
batch_labels.append(labels)
|
||||
|
||||
return batch_labels
|
||||
|
||||
return _classifier, device
|
||||
|
||||
@lru_cache()
|
||||
def load_nltk_punkt():
|
||||
import nltk
|
||||
try:
|
||||
nltk.data.find('tokenizers/punkt')
|
||||
except LookupError:
|
||||
nltk.download('punkt')
|
||||
return nltk.data.find('tokenizers/punkt')
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
import spacy
|
||||
name = "models/reuters"
|
||||
home_folder = get_home_folder()
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# Check if the model directory already exists
|
||||
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
# branch = "main"
|
||||
branch = MODEL_REPO_BRANCH
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
shutil.rmtree(repo_folder)
|
||||
shutil.rmtree(model_folder)
|
||||
|
||||
try:
|
||||
# Clone the repository
|
||||
subprocess.run(
|
||||
["git", "clone", "-b", branch, repo_url, repo_folder],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
check=True
|
||||
)
|
||||
|
||||
# Create the models directory if it doesn't exist
|
||||
models_folder = os.path.join(home_folder, "models")
|
||||
os.makedirs(models_folder, exist_ok=True)
|
||||
|
||||
# Copy the reuters model folder to the models directory
|
||||
source_folder = os.path.join(repo_folder, "models/reuters")
|
||||
shutil.copytree(source_folder, model_folder)
|
||||
|
||||
# Remove the cloned repository
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
# Print completion message
|
||||
# print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
return spacy.load(model_folder)
|
||||
|
||||
def download_all_models(remove_existing=False):
|
||||
"""Download all models required for Crawl4AI."""
|
||||
if remove_existing:
|
||||
print("[LOG] Removing existing models...")
|
||||
home_folder = get_home_folder()
|
||||
model_folders = [
|
||||
os.path.join(home_folder, "models/reuters"),
|
||||
os.path.join(home_folder, "models"),
|
||||
]
|
||||
for folder in model_folders:
|
||||
if Path(folder).exists():
|
||||
shutil.rmtree(folder)
|
||||
print("[LOG] Existing models removed.")
|
||||
|
||||
# Load each model to trigger download
|
||||
# print("[LOG] Downloading BERT Base Uncased...")
|
||||
# load_bert_base_uncased()
|
||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
# load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading ONNX model...")
|
||||
# load_onnx_all_MiniLM_l6_v2()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
_, device = load_text_multilabel_classifier()
|
||||
print(f"[LOG] Text classifier loaded on {device}")
|
||||
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||
load_nltk_punkt()
|
||||
print("[LOG] ✅ All models downloaded successfully.")
|
||||
|
||||
def main():
|
||||
print("[LOG] Welcome to the Crawl4AI Model Downloader!")
|
||||
print("[LOG] This script will download all the models required for Crawl4AI.")
|
||||
parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
|
||||
parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
|
||||
args = parser.parse_args()
|
||||
|
||||
download_all_models(remove_existing=args.remove_existing)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,5 +1,5 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
class UrlModel(BaseModel):
|
||||
url: HttpUrl
|
||||
@@ -9,7 +9,11 @@ class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
success: bool
|
||||
cleaned_html: str = None
|
||||
markdown: str = None
|
||||
parsed_json: str = None
|
||||
error_message: str = None
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
screenshot: Optional[str] = None
|
||||
markdown: Optional[str] = None
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
25
crawl4ai/models/onnx/config.json
Normal file
@@ -0,0 +1,25 @@
|
||||
{
|
||||
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
|
||||
"architectures": [
|
||||
"BertModel"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"classifier_dropout": null,
|
||||
"gradient_checkpointing": false,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 384,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 1536,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"max_position_embeddings": 512,
|
||||
"model_type": "bert",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 6,
|
||||
"pad_token_id": 0,
|
||||
"position_embedding_type": "absolute",
|
||||
"transformers_version": "4.27.4",
|
||||
"type_vocab_size": 2,
|
||||
"use_cache": true,
|
||||
"vocab_size": 30522
|
||||
}
|
||||
BIN
crawl4ai/models/onnx/model.onnx
Normal file
7
crawl4ai/models/onnx/special_tokens_map.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"cls_token": "[CLS]",
|
||||
"mask_token": "[MASK]",
|
||||
"pad_token": "[PAD]",
|
||||
"sep_token": "[SEP]",
|
||||
"unk_token": "[UNK]"
|
||||
}
|
||||
30686
crawl4ai/models/onnx/tokenizer.json
Normal file
15
crawl4ai/models/onnx/tokenizer_config.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"cls_token": "[CLS]",
|
||||
"do_basic_tokenize": true,
|
||||
"do_lower_case": true,
|
||||
"mask_token": "[MASK]",
|
||||
"model_max_length": 512,
|
||||
"never_split": null,
|
||||
"pad_token": "[PAD]",
|
||||
"sep_token": "[SEP]",
|
||||
"special_tokens_map_file": "/Users/hammad/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/7dbbc90392e2f80f3d3c277d6e90027e55de9125/special_tokens_map.json",
|
||||
"strip_accents": null,
|
||||
"tokenize_chinese_chars": true,
|
||||
"tokenizer_class": "BertTokenizer",
|
||||
"unk_token": "[UNK]"
|
||||
}
|
||||
30522
crawl4ai/models/onnx/vocab.txt
Normal file
50
crawl4ai/onnx_embedding.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# A dependency-light way to run the onnx model
|
||||
|
||||
|
||||
import numpy as np
|
||||
from typing import List
|
||||
import os
|
||||
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
|
||||
def normalize(v):
|
||||
norm = np.linalg.norm(v, axis=1)
|
||||
norm[norm == 0] = 1e-12
|
||||
return v / norm[:, np.newaxis]
|
||||
|
||||
# Sampel implementation of the default sentence-transformers model using ONNX
|
||||
class DefaultEmbeddingModel():
|
||||
|
||||
def __init__(self):
|
||||
from tokenizers import Tokenizer
|
||||
import onnxruntime as ort
|
||||
# max_seq_length = 256, for some reason sentence-transformers uses 256 even though the HF config has a max length of 128
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/3e1929fddef16df94f8bc6e3b10598a98f46e62d/docs/_static/html/models_en_sentence_embeddings.html#LL480
|
||||
self.tokenizer = Tokenizer.from_file(os.path.join(__location__, "models/onnx/tokenizer.json"))
|
||||
self.tokenizer.enable_truncation(max_length=256)
|
||||
self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=256)
|
||||
self.model = ort.InferenceSession(os.path.join(__location__,"models/onnx/model.onnx"))
|
||||
|
||||
|
||||
def __call__(self, documents: List[str], batch_size: int = 32):
|
||||
all_embeddings = []
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i + batch_size]
|
||||
encoded = [self.tokenizer.encode(d) for d in batch]
|
||||
input_ids = np.array([e.ids for e in encoded])
|
||||
attention_mask = np.array([e.attention_mask for e in encoded])
|
||||
onnx_input = {
|
||||
"input_ids": np.array(input_ids, dtype=np.int64),
|
||||
"attention_mask": np.array(attention_mask, dtype=np.int64),
|
||||
"token_type_ids": np.array([np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64),
|
||||
}
|
||||
model_output = self.model.run(None, onnx_input)
|
||||
last_hidden_state = model_output[0]
|
||||
# Perform mean pooling with attention weighting
|
||||
input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), last_hidden_state.shape)
|
||||
embeddings = np.sum(last_hidden_state * input_mask_expanded, 1) / np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)
|
||||
embeddings = normalize(embeddings).astype(np.float32)
|
||||
all_embeddings.append(embeddings)
|
||||
return np.concatenate(all_embeddings)
|
||||
|
||||
@@ -59,7 +59,7 @@ Please provide your output within <blocks> tags, like this:
|
||||
|
||||
Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||
|
||||
PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
|
||||
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
|
||||
<url>{URL}</url>
|
||||
|
||||
And here is the cleaned HTML content of that webpage:
|
||||
@@ -107,4 +107,61 @@ Please provide your output within <blocks> tags, like this:
|
||||
}]
|
||||
</blocks>
|
||||
|
||||
Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||
|
||||
PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage:
|
||||
<url>{URL}</url>
|
||||
|
||||
And here is the cleaned HTML content of that webpage:
|
||||
<html>
|
||||
{HTML}
|
||||
</html>
|
||||
|
||||
Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys:
|
||||
|
||||
- index: an integer representing the index of the block in the content
|
||||
- content: a list of strings containing the text content of the block
|
||||
|
||||
This is the user's REQUEST, pay attention to it:
|
||||
<request>
|
||||
{REQUEST}
|
||||
</request>
|
||||
|
||||
To generate the JSON objects:
|
||||
|
||||
1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
|
||||
|
||||
2. For each block:
|
||||
a. Assign it an index based on its order in the content.
|
||||
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
|
||||
c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
|
||||
|
||||
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
|
||||
|
||||
4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
7. Never alter the extracted content, just copy and paste it as it is.
|
||||
|
||||
Please provide your output within <blocks> tags, like this:
|
||||
|
||||
<blocks>
|
||||
[{
|
||||
"index": 0,
|
||||
"tags": ["introduction"],
|
||||
"content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"tags": ["background"],
|
||||
"content": ["This is the second paragraph, which delves into the history and background of the topic.",
|
||||
"It provides context and sets the stage for the rest of the article."]
|
||||
}]
|
||||
</blocks>
|
||||
|
||||
**Make sure to follow the user instruction to extract blocks aligin with the instruction.**
|
||||
|
||||
Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||
146
crawl4ai/train.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import spacy
|
||||
from spacy.training import Example
|
||||
import random
|
||||
import nltk
|
||||
from nltk.corpus import reuters
|
||||
import torch
|
||||
|
||||
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
|
||||
# Extract the TextCategorizer component
|
||||
textcat = nlp.get_pipe("textcat_multilabel")
|
||||
|
||||
# Convert the weights to a PyTorch state dictionary
|
||||
state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
|
||||
|
||||
# Save the state dictionary
|
||||
torch.save(state_dict, f"{model_dir}/model_weights.pth")
|
||||
|
||||
# Extract and save the vocabulary
|
||||
vocab = extract_vocab(nlp)
|
||||
with open(f"{model_dir}/vocab.txt", "w") as vocab_file:
|
||||
for word, idx in vocab.items():
|
||||
vocab_file.write(f"{word}\t{idx}\n")
|
||||
|
||||
print(f"Model weights and vocabulary saved to: {model_dir}")
|
||||
|
||||
def extract_vocab(nlp):
|
||||
# Extract vocabulary from the SpaCy model
|
||||
vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
|
||||
return vocab
|
||||
|
||||
nlp = spacy.load("models/reuters")
|
||||
save_spacy_model_as_torch(nlp, model_dir="models")
|
||||
|
||||
def train_and_save_reuters_model(model_dir="models/reuters"):
|
||||
# Ensure the Reuters corpus is downloaded
|
||||
nltk.download('reuters')
|
||||
nltk.download('punkt')
|
||||
if not reuters.fileids():
|
||||
print("Reuters corpus not found.")
|
||||
return
|
||||
|
||||
# Load a blank English spaCy model
|
||||
nlp = spacy.blank("en")
|
||||
|
||||
# Create a TextCategorizer with the ensemble model for multi-label classification
|
||||
textcat = nlp.add_pipe("textcat_multilabel")
|
||||
|
||||
# Add labels to text classifier
|
||||
for label in reuters.categories():
|
||||
textcat.add_label(label)
|
||||
|
||||
# Prepare training data
|
||||
train_examples = []
|
||||
for fileid in reuters.fileids():
|
||||
categories = reuters.categories(fileid)
|
||||
text = reuters.raw(fileid)
|
||||
cats = {label: label in categories for label in reuters.categories()}
|
||||
# Prepare spacy Example objects
|
||||
doc = nlp.make_doc(text)
|
||||
example = Example.from_dict(doc, {'cats': cats})
|
||||
train_examples.append(example)
|
||||
|
||||
# Initialize the text categorizer with the example objects
|
||||
nlp.initialize(lambda: train_examples)
|
||||
|
||||
# Train the model
|
||||
random.seed(1)
|
||||
spacy.util.fix_random_seed(1)
|
||||
for i in range(5): # Adjust iterations for better accuracy
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# Create batches of data
|
||||
batches = spacy.util.minibatch(train_examples, size=8)
|
||||
for batch in batches:
|
||||
nlp.update(batch, drop=0.2, losses=losses)
|
||||
print(f"Losses at iteration {i}: {losses}")
|
||||
|
||||
# Save the trained model
|
||||
nlp.to_disk(model_dir)
|
||||
print(f"Model saved to: {model_dir}")
|
||||
|
||||
def train_model(model_dir, additional_epochs=0):
|
||||
# Load the model if it exists, otherwise start with a blank model
|
||||
try:
|
||||
nlp = spacy.load(model_dir)
|
||||
print("Model loaded from disk.")
|
||||
except IOError:
|
||||
print("No existing model found. Starting with a new model.")
|
||||
nlp = spacy.blank("en")
|
||||
textcat = nlp.add_pipe("textcat_multilabel")
|
||||
for label in reuters.categories():
|
||||
textcat.add_label(label)
|
||||
|
||||
# Prepare training data
|
||||
train_examples = []
|
||||
for fileid in reuters.fileids():
|
||||
categories = reuters.categories(fileid)
|
||||
text = reuters.raw(fileid)
|
||||
cats = {label: label in categories for label in reuters.categories()}
|
||||
doc = nlp.make_doc(text)
|
||||
example = Example.from_dict(doc, {'cats': cats})
|
||||
train_examples.append(example)
|
||||
|
||||
# Initialize the model if it was newly created
|
||||
if 'textcat_multilabel' not in nlp.pipe_names:
|
||||
nlp.initialize(lambda: train_examples)
|
||||
else:
|
||||
print("Continuing training with existing model.")
|
||||
|
||||
# Train the model
|
||||
random.seed(1)
|
||||
spacy.util.fix_random_seed(1)
|
||||
num_epochs = 5 + additional_epochs
|
||||
for i in range(num_epochs):
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
batches = spacy.util.minibatch(train_examples, size=8)
|
||||
for batch in batches:
|
||||
nlp.update(batch, drop=0.2, losses=losses)
|
||||
print(f"Losses at iteration {i}: {losses}")
|
||||
|
||||
# Save the trained model
|
||||
nlp.to_disk(model_dir)
|
||||
print(f"Model saved to: {model_dir}")
|
||||
|
||||
def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||
# Load the trained model from the specified directory
|
||||
nlp = spacy.load(model_dir)
|
||||
|
||||
# Process the text with the loaded model
|
||||
doc = nlp(text)
|
||||
|
||||
# gee top 3 categories
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
print(f"Top {tok_k} categories:")
|
||||
|
||||
return top_categories
|
||||
|
||||
if __name__ == "__main__":
|
||||
train_and_save_reuters_model()
|
||||
train_model("models/reuters", additional_epochs=5)
|
||||
model_directory = "reuters_model_10"
|
||||
print(reuters.categories())
|
||||
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
|
||||
r =load_model_and_predict(model_directory, example_text)
|
||||
print(r)
|
||||
@@ -1,16 +1,26 @@
|
||||
import requests
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||
import html2text
|
||||
import json
|
||||
import html
|
||||
import re
|
||||
import os
|
||||
import litellm
|
||||
from litellm import completion, batch_completion
|
||||
from html2text import HTML2Text
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
import re
|
||||
import html
|
||||
from pathlib import Path
|
||||
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
return home_folder
|
||||
|
||||
def beautify_html(escaped_html):
|
||||
"""
|
||||
@@ -77,7 +87,8 @@ def split_and_parse_json_objects(json_string):
|
||||
|
||||
def sanitize_html(html):
|
||||
# Replace all weird and special characters with an empty string
|
||||
sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||
sanitized_html = html
|
||||
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||
|
||||
# Escape all double and single quotes
|
||||
sanitized_html = sanitized_html.replace('"', '\\"').replace("'", "\\'")
|
||||
@@ -113,14 +124,106 @@ def escape_json_string(s):
|
||||
|
||||
return s
|
||||
|
||||
class CustomHTML2Text(HTML2Text):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.ignore_links = True
|
||||
self.inside_pre = False
|
||||
self.inside_code = False
|
||||
|
||||
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
if tag == 'pre':
|
||||
if start:
|
||||
self.o('```\n')
|
||||
self.inside_pre = True
|
||||
else:
|
||||
self.o('\n```')
|
||||
self.inside_pre = False
|
||||
# elif tag == 'code' and not self.inside_pre:
|
||||
# if start:
|
||||
# if not self.inside_pre:
|
||||
# self.o('`')
|
||||
# self.inside_code = True
|
||||
# else:
|
||||
# if not self.inside_pre:
|
||||
# self.o('`')
|
||||
# self.inside_code = False
|
||||
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def replace_inline_tags(soup, tags):
|
||||
tag_replacements = {
|
||||
'b': lambda tag: f"**{tag.text}**",
|
||||
'i': lambda tag: f"*{tag.text}*",
|
||||
'u': lambda tag: f"__{tag.text}__",
|
||||
'span': lambda tag: f"{tag.text}",
|
||||
'del': lambda tag: f"~~{tag.text}~~",
|
||||
'ins': lambda tag: f"++{tag.text}++",
|
||||
'sub': lambda tag: f"~{tag.text}~",
|
||||
'sup': lambda tag: f"^^{tag.text}^^",
|
||||
'strong': lambda tag: f"**{tag.text}**",
|
||||
'em': lambda tag: f"*{tag.text}*",
|
||||
'code': lambda tag: f"`{tag.text}`",
|
||||
'kbd': lambda tag: f"`{tag.text}`",
|
||||
'var': lambda tag: f"_{tag.text}_",
|
||||
's': lambda tag: f"~~{tag.text}~~",
|
||||
'q': lambda tag: f'"{tag.text}"',
|
||||
'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
|
||||
'cite': lambda tag: f"_{tag.text}_",
|
||||
'dfn': lambda tag: f"_{tag.text}_",
|
||||
'time': lambda tag: f"{tag.text}",
|
||||
'small': lambda tag: f"<small>{tag.text}</small>",
|
||||
'mark': lambda tag: f"=={tag.text}=="
|
||||
}
|
||||
|
||||
for tag_name in tags:
|
||||
for tag in soup.find_all(tag_name):
|
||||
replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
|
||||
tag.replace_with(replacement_text)
|
||||
|
||||
return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||
try:
|
||||
if not html:
|
||||
return None
|
||||
# Parse HTML content with BeautifulSoup
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Get the content within the <body> tag
|
||||
body = soup.body
|
||||
|
||||
# If css_selector is provided, extract content based on the selector
|
||||
if css_selector:
|
||||
selected_elements = body.select(css_selector)
|
||||
if not selected_elements:
|
||||
raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
|
||||
div_tag = soup.new_tag('div')
|
||||
for el in selected_elements:
|
||||
div_tag.append(el)
|
||||
body = div_tag
|
||||
|
||||
links = {
|
||||
'internal': [],
|
||||
'external': []
|
||||
}
|
||||
|
||||
# Extract all internal and external links
|
||||
for a in body.find_all('a', href=True):
|
||||
href = a['href']
|
||||
url_base = url.split('/')[2]
|
||||
if href.startswith('http') and url_base not in href:
|
||||
links['external'].append({
|
||||
'href': href,
|
||||
'text': a.get_text()
|
||||
})
|
||||
else:
|
||||
links['internal'].append(
|
||||
{
|
||||
'href': href,
|
||||
'text': a.get_text()
|
||||
}
|
||||
)
|
||||
|
||||
# Remove script, style, and other tags that don't carry useful content from body
|
||||
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
||||
@@ -131,6 +234,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
if tag.name != 'img':
|
||||
tag.attrs = {}
|
||||
|
||||
# Extract all img tgas inti [{src: '', alt: ''}]
|
||||
media = {
|
||||
'images': [],
|
||||
'videos': [],
|
||||
'audios': []
|
||||
}
|
||||
for img in body.find_all('img'):
|
||||
media['images'].append({
|
||||
'src': img.get('src'),
|
||||
'alt': img.get('alt'),
|
||||
"type": "image"
|
||||
})
|
||||
|
||||
# Extract all video tags into [{src: '', alt: ''}]
|
||||
for video in body.find_all('video'):
|
||||
media['videos'].append({
|
||||
'src': video.get('src'),
|
||||
'alt': video.get('alt'),
|
||||
"type": "video"
|
||||
})
|
||||
|
||||
# Extract all audio tags into [{src: '', alt: ''}]
|
||||
for audio in body.find_all('audio'):
|
||||
media['audios'].append({
|
||||
'src': audio.get('src'),
|
||||
'alt': audio.get('alt'),
|
||||
"type": "audio"
|
||||
})
|
||||
|
||||
# Replace images with their alt text or remove them if no alt text is available
|
||||
for img in body.find_all('img'):
|
||||
alt_text = img.get('alt')
|
||||
@@ -139,17 +271,31 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
else:
|
||||
img.decompose()
|
||||
|
||||
|
||||
# Create a function that replace content of all"pre" tage with its inner text
|
||||
def replace_pre_tags_with_text(node):
|
||||
for child in node.find_all('pre'):
|
||||
# set child inner html to its text
|
||||
child.string = child.get_text()
|
||||
return node
|
||||
|
||||
# Replace all "pre" tags with their inner text
|
||||
body = replace_pre_tags_with_text(body)
|
||||
|
||||
# Replace inline tags with their text content
|
||||
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])
|
||||
|
||||
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
|
||||
def remove_empty_and_low_word_count_elements(node):
|
||||
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
|
||||
for child in node.contents:
|
||||
if isinstance(child, element.Tag):
|
||||
remove_empty_and_low_word_count_elements(child)
|
||||
remove_empty_and_low_word_count_elements(child, word_count_threshold)
|
||||
word_count = len(child.get_text(strip=True).split())
|
||||
if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
|
||||
child.decompose()
|
||||
return node
|
||||
|
||||
body = remove_empty_and_low_word_count_elements(body)
|
||||
body = remove_empty_and_low_word_count_elements(body, word_count_threshold)
|
||||
|
||||
def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
|
||||
# We'll use a list to collect all tags that don't meet the word count requirement
|
||||
@@ -214,9 +360,11 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
return node
|
||||
|
||||
body = flatten_nested_elements(body)
|
||||
|
||||
|
||||
|
||||
# Remove comments
|
||||
for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
|
||||
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
||||
comment.extract()
|
||||
|
||||
# Remove consecutive empty newlines and replace multiple spaces with a single space
|
||||
@@ -228,26 +376,65 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
|
||||
# Convert cleaned HTML to Markdown
|
||||
h = html2text.HTML2Text()
|
||||
h = CustomHTML2Text()
|
||||
h.ignore_links = True
|
||||
markdown = h.handle(cleaned_html)
|
||||
|
||||
markdown = markdown.replace(' ```', '```')
|
||||
|
||||
# Return the Markdown content
|
||||
return{
|
||||
'markdown': markdown,
|
||||
'cleaned_html': cleaned_html,
|
||||
'success': True
|
||||
'success': True,
|
||||
'media': media,
|
||||
'links': links
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print('Error processing HTML content:', str(e))
|
||||
return None
|
||||
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
||||
|
||||
# Example usage
|
||||
# word_count_threshold = 5 # Adjust this value according to your desired threshold
|
||||
# markdown_content = get_content_of_website(word_count_threshold)
|
||||
# print(markdown_content)
|
||||
|
||||
|
||||
def extract_metadata(html):
|
||||
metadata = {}
|
||||
|
||||
if not html:
|
||||
return metadata
|
||||
|
||||
# Parse HTML content with BeautifulSoup
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Title
|
||||
title_tag = soup.find('title')
|
||||
metadata['title'] = title_tag.string if title_tag else None
|
||||
|
||||
# Meta description
|
||||
description_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
metadata['description'] = description_tag['content'] if description_tag else None
|
||||
|
||||
# Meta keywords
|
||||
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
|
||||
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
|
||||
|
||||
# Meta author
|
||||
author_tag = soup.find('meta', attrs={'name': 'author'})
|
||||
metadata['author'] = author_tag['content'] if author_tag else None
|
||||
|
||||
# Open Graph metadata
|
||||
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
|
||||
for tag in og_tags:
|
||||
property_name = tag['property']
|
||||
metadata[property_name] = tag['content']
|
||||
|
||||
# Twitter Card metadata
|
||||
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
|
||||
for tag in twitter_tags:
|
||||
property_name = tag['name']
|
||||
metadata[property_name] = tag['content']
|
||||
|
||||
return metadata
|
||||
|
||||
def extract_xml_tags(string):
|
||||
tags = re.findall(r'<(\w+)>', string)
|
||||
return list(set(tags))
|
||||
@@ -265,17 +452,16 @@ def extract_xml_data(tags, string):
|
||||
|
||||
return data
|
||||
|
||||
import time
|
||||
import litellm
|
||||
|
||||
# Function to perform the completion with exponential backoff
|
||||
def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
|
||||
from litellm import completion
|
||||
from litellm.exceptions import RateLimitError
|
||||
max_attempts = 3
|
||||
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
|
||||
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
response = completion(
|
||||
response =completion(
|
||||
model=provider,
|
||||
messages=[
|
||||
{"role": "user", "content": prompt_with_variables}
|
||||
@@ -284,7 +470,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
|
||||
api_key=api_token
|
||||
)
|
||||
return response # Return the successful response
|
||||
except litellm.exceptions.RateLimitError as e:
|
||||
except RateLimitError as e:
|
||||
print("Rate limit error:", str(e))
|
||||
|
||||
# Check if we have exhausted our max attempts
|
||||
@@ -318,23 +504,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||
|
||||
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
|
||||
|
||||
# try:
|
||||
# response = completion(
|
||||
# model = provider,
|
||||
# messages = [
|
||||
# {"role": "user", "content": prompt_with_variables}
|
||||
# ],
|
||||
# temperature = 0.01,
|
||||
# api_key = api_token
|
||||
# )
|
||||
# except litellm.exceptions.RateLimitError as e:
|
||||
# print("Rate limit error:", str(e))
|
||||
# return [{
|
||||
# "index": 0,
|
||||
# "tags": ["error"],
|
||||
# "content": ["Rate limit error. Please try again later."]
|
||||
# }]
|
||||
|
||||
try:
|
||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||
blocks = json.loads(blocks)
|
||||
@@ -357,7 +526,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||
|
||||
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
|
||||
api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
|
||||
from litellm import batch_completion
|
||||
messages = []
|
||||
|
||||
for url, html in batch_data:
|
||||
@@ -397,4 +566,62 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
||||
}]
|
||||
all_blocks.append(blocks)
|
||||
|
||||
return sum(all_blocks, [])
|
||||
return sum(all_blocks, [])
|
||||
|
||||
|
||||
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||
"""
|
||||
Merges small chunks into larger ones based on the total token threshold.
|
||||
|
||||
:param chunks: List of text chunks to be merged based on token count.
|
||||
:param token_threshold: Max number of tokens for each merged chunk.
|
||||
:return: List of merged text chunks.
|
||||
"""
|
||||
merged_sections = []
|
||||
current_chunk = []
|
||||
total_token_so_far = 0
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_token_count = len(chunk.split()) * 1.3 # Estimate token count with a factor
|
||||
if total_token_so_far + chunk_token_count < token_threshold:
|
||||
current_chunk.append(chunk)
|
||||
total_token_so_far += chunk_token_count
|
||||
else:
|
||||
if current_chunk:
|
||||
merged_sections.append('\n\n'.join(current_chunk))
|
||||
current_chunk = [chunk]
|
||||
total_token_so_far = chunk_token_count
|
||||
|
||||
# Add the last chunk if it exists
|
||||
if current_chunk:
|
||||
merged_sections.append('\n\n'.join(current_chunk))
|
||||
|
||||
return merged_sections
|
||||
|
||||
def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
|
||||
extracted_content = []
|
||||
if provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for section in sections:
|
||||
extracted_content.extend(extract_blocks(url, section, provider, api_token))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
|
||||
return extracted_content
|
||||
|
||||
|
||||
def wrap_text(draw, text, font, max_width):
|
||||
# Wrap the text to fit within the specified width
|
||||
lines = []
|
||||
words = text.split()
|
||||
while words:
|
||||
line = ''
|
||||
while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
|
||||
line += (words.pop(0) + ' ')
|
||||
lines.append(line)
|
||||
return '\n'.join(lines)
|
||||
357
crawl4ai/web_crawler.back.py
Normal file
@@ -0,0 +1,357 @@
|
||||
import os, time
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
from pathlib import Path
|
||||
|
||||
from .models import UrlModel, CrawlResult
|
||||
from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
|
||||
from .utils import *
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .config import *
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
# db_path: str = None,
|
||||
crawler_strategy: CrawlerStrategy = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
verbose: bool = False,
|
||||
):
|
||||
# self.db_path = db_path
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
|
||||
# If db_path is not provided, use the default path
|
||||
# if not db_path:
|
||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
||||
|
||||
# flush_db()
|
||||
init_db()
|
||||
|
||||
self.ready = False
|
||||
|
||||
def warmup(self):
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
result = self.run(
|
||||
url='https://crawl4ai.uccode.io/',
|
||||
word_count_threshold=5,
|
||||
extraction_strategy= NoExtractionStrategy(),
|
||||
bypass_cache=False,
|
||||
verbose = False
|
||||
)
|
||||
self.ready = True
|
||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||
|
||||
def fetch_page(
|
||||
self,
|
||||
url_model: UrlModel,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: str = None,
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
use_cached_html: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
return self.run(
|
||||
url_model.url,
|
||||
word_count_threshold,
|
||||
extraction_strategy or NoExtractionStrategy(),
|
||||
chunking_strategy,
|
||||
bypass_cache=url_model.forced,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
**kwargs,
|
||||
)
|
||||
pass
|
||||
|
||||
def run_old(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
if cached:
|
||||
return CrawlResult(
|
||||
**{
|
||||
"url": cached[0],
|
||||
"html": cached[1],
|
||||
"cleaned_html": cached[2],
|
||||
"markdown": cached[3],
|
||||
"extracted_content": cached[4],
|
||||
"success": cached[5],
|
||||
"media": json.loads(cached[6] or "{}"),
|
||||
"links": json.loads(cached[7] or "{}"),
|
||||
"metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
|
||||
"screenshot": cached[9],
|
||||
"error_message": "",
|
||||
}
|
||||
)
|
||||
|
||||
# Initialize WebDriver for crawling
|
||||
t = time.time()
|
||||
if kwargs.get("js", None):
|
||||
self.crawler_strategy.js_code = kwargs.get("js")
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
base64_image = None
|
||||
if screenshot:
|
||||
base64_image = self.crawler_strategy.take_screenshot()
|
||||
success = True
|
||||
error_message = ""
|
||||
# Extract content from HTML
|
||||
try:
|
||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||
metadata = extract_metadata(html)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", "")
|
||||
markdown = result.get("markdown", "")
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
|
||||
# Print a profession LOG style message, show time taken and say crawling is done
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
|
||||
)
|
||||
|
||||
extracted_content = []
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
t = time.time()
|
||||
# Split markdown into sections
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
|
||||
|
||||
extracted_content = extraction_strategy.run(
|
||||
url, sections,
|
||||
)
|
||||
extracted_content = json.dumps(extracted_content)
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
|
||||
)
|
||||
|
||||
# Cache the result
|
||||
cleaned_html = beautify_html(cleaned_html)
|
||||
cache_url(
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
extracted_content,
|
||||
success,
|
||||
json.dumps(media),
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=base64_image,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=base64_image,
|
||||
extracted_content=extracted_content,
|
||||
success=success,
|
||||
error_message=error_message,
|
||||
)
|
||||
|
||||
def fetch_pages(
|
||||
self,
|
||||
url_models: List[UrlModel],
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: str = None,
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
use_cached_html: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
def fetch_page_wrapper(url_model, *args, **kwargs):
|
||||
return self.fetch_page(url_model, *args, **kwargs)
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(
|
||||
executor.map(
|
||||
fetch_page_wrapper,
|
||||
url_models,
|
||||
[provider] * len(url_models),
|
||||
[api_token] * len(url_models),
|
||||
[extract_blocks_flag] * len(url_models),
|
||||
[word_count_threshold] * len(url_models),
|
||||
[css_selector] * len(url_models),
|
||||
[screenshot] * len(url_models),
|
||||
[use_cached_html] * len(url_models),
|
||||
[extraction_strategy] * len(url_models),
|
||||
[chunking_strategy] * len(url_models),
|
||||
*[kwargs] * len(url_models),
|
||||
)
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def run(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
cached = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[2]
|
||||
if screenshot:
|
||||
screenshot = cached[9]
|
||||
|
||||
else:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
if screenshot:
|
||||
screenshot = self.crawler_strategy.take_screenshot()
|
||||
|
||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
|
||||
|
||||
def process_html(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
word_count_threshold: int,
|
||||
extraction_strategy: ExtractionStrategy,
|
||||
chunking_strategy: ChunkingStrategy,
|
||||
css_selector: str,
|
||||
screenshot: bool,
|
||||
verbose: bool,
|
||||
is_cached: bool,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||
metadata = extract_metadata(html)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", "")
|
||||
markdown = result.get("markdown", "")
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
|
||||
|
||||
if extracted_content is None:
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
extracted_content = extraction_strategy.run(url, sections)
|
||||
extracted_content = json.dumps(extracted_content)
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
if not is_cached:
|
||||
cache_url(
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
extracted_content,
|
||||
True,
|
||||
json.dumps(media),
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=screenshot,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
)
|
||||
@@ -1,135 +1,233 @@
|
||||
import asyncio
|
||||
import os, time
|
||||
import json
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
import chromedriver_autoinstaller
|
||||
from pydantic import parse_obj_as
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
from pathlib import Path
|
||||
|
||||
from .models import UrlModel, CrawlResult
|
||||
from .database import init_db, get_cached_url, cache_url
|
||||
from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
|
||||
from .utils import *
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from .config import *
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .config import *
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
def __init__(self, db_path: str):
|
||||
self.db_path = db_path
|
||||
init_db(self.db_path)
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
# make it headless
|
||||
self.options.add_argument("--headless")
|
||||
def __init__(
|
||||
self,
|
||||
# db_path: str = None,
|
||||
crawler_strategy: CrawlerStrategy = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
verbose: bool = False,
|
||||
):
|
||||
# self.db_path = db_path
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
|
||||
# Automatically install or update chromedriver
|
||||
chromedriver_autoinstaller.install()
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
|
||||
def fetch_page(self, url_model: UrlModel, provider: str = DEFAULT_PROVIDER, api_token: str = None, extract_blocks_flag: bool = True, word_count_threshold = MIN_WORD_THRESHOLD) -> CrawlResult:
|
||||
# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
cached = get_cached_url(self.db_path, str(url_model.url))
|
||||
if cached and not url_model.forced:
|
||||
return CrawlResult(**{
|
||||
"url": cached[0],
|
||||
"html": cached[1],
|
||||
"cleaned_html": cached[2],
|
||||
"markdown": cached[3],
|
||||
"parsed_json": cached[4],
|
||||
"success": cached[5],
|
||||
"error_message": ""
|
||||
})
|
||||
|
||||
|
||||
# Initialize WebDriver for crawling
|
||||
service = Service(chromedriver_autoinstaller.install())
|
||||
driver = webdriver.Chrome(service=service, options=self.options)
|
||||
|
||||
try:
|
||||
driver.get(str(url_model.url))
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
html = driver.page_source
|
||||
success = True
|
||||
error_message = ""
|
||||
except Exception as e:
|
||||
html = ""
|
||||
success = False
|
||||
error_message = str(e)
|
||||
finally:
|
||||
driver.quit()
|
||||
|
||||
# Extract content from HTML
|
||||
result = get_content_of_website(html, word_count_threshold)
|
||||
cleaned_html = result.get('cleaned_html', html)
|
||||
markdown = result.get('markdown', "")
|
||||
# If db_path is not provided, use the default path
|
||||
# if not db_path:
|
||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
||||
|
||||
print("Crawling is done 🚀")
|
||||
|
||||
parsed_json = []
|
||||
if extract_blocks_flag:
|
||||
# Split markdown into sections
|
||||
paragraphs = markdown.split('\n\n')
|
||||
sections = []
|
||||
chunks = []
|
||||
total_token_so_far = 0
|
||||
|
||||
for paragraph in paragraphs:
|
||||
if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
|
||||
chunk = paragraph.split(' ')
|
||||
total_token_so_far += len(chunk) * 1.3
|
||||
chunks.append(paragraph)
|
||||
else:
|
||||
sections.append('\n\n'.join(chunks))
|
||||
chunks = [paragraph]
|
||||
total_token_so_far = len(paragraph.split(' ')) * 1.3
|
||||
|
||||
if chunks:
|
||||
sections.append('\n\n'.join(chunks))
|
||||
|
||||
# Process sections to extract blocks
|
||||
parsed_json = []
|
||||
if provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for section in sections:
|
||||
parsed_json.extend(extract_blocks(str(url_model.url), section, provider, api_token))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = [executor.submit(extract_blocks, str(url_model.url), section, provider, api_token) for section in sections]
|
||||
for future in as_completed(futures):
|
||||
parsed_json.extend(future.result())
|
||||
|
||||
parsed_json = json.dumps(parsed_json)
|
||||
else:
|
||||
parsed_json = "{}"
|
||||
|
||||
# Cache the result
|
||||
cleaned_html = beautify_html(cleaned_html)
|
||||
cache_url(self.db_path, str(url_model.url), html, cleaned_html, markdown, parsed_json, success)
|
||||
|
||||
return CrawlResult(
|
||||
url=str(url_model.url),
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
parsed_json=parsed_json,
|
||||
success=success,
|
||||
error_message=error_message
|
||||
# flush_db()
|
||||
init_db()
|
||||
|
||||
self.ready = False
|
||||
|
||||
def warmup(self):
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
result = self.run(
|
||||
url='https://crawl4ai.uccode.io/',
|
||||
word_count_threshold=5,
|
||||
extraction_strategy= NoExtractionStrategy(),
|
||||
bypass_cache=False,
|
||||
verbose = False
|
||||
)
|
||||
self.ready = True
|
||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||
|
||||
def fetch_page(
|
||||
self,
|
||||
url_model: UrlModel,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: str = None,
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
use_cached_html: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
return self.run(
|
||||
url_model.url,
|
||||
word_count_threshold,
|
||||
extraction_strategy or NoExtractionStrategy(),
|
||||
chunking_strategy,
|
||||
bypass_cache=url_model.forced,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
**kwargs,
|
||||
)
|
||||
pass
|
||||
|
||||
def fetch_pages(
|
||||
self,
|
||||
url_models: List[UrlModel],
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: str = None,
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
use_cached_html: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
def fetch_page_wrapper(url_model, *args, **kwargs):
|
||||
return self.fetch_page(url_model, *args, **kwargs)
|
||||
|
||||
def fetch_pages(self, url_models: List[UrlModel], provider: str = DEFAULT_PROVIDER, api_token: str = None) -> List[CrawlResult]:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
results = list(executor.map(self.fetch_page, url_models, [provider] * len(url_models), [api_token] * len(url_models)))
|
||||
return results
|
||||
results = list(
|
||||
executor.map(
|
||||
fetch_page_wrapper,
|
||||
url_models,
|
||||
[provider] * len(url_models),
|
||||
[api_token] * len(url_models),
|
||||
[extract_blocks_flag] * len(url_models),
|
||||
[word_count_threshold] * len(url_models),
|
||||
[css_selector] * len(url_models),
|
||||
[screenshot] * len(url_models),
|
||||
[use_cached_html] * len(url_models),
|
||||
[extraction_strategy] * len(url_models),
|
||||
[chunking_strategy] * len(url_models),
|
||||
*[kwargs] * len(url_models),
|
||||
)
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def run(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
cached = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[2]
|
||||
if screenshot:
|
||||
screenshot = cached[9]
|
||||
|
||||
else:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
if screenshot:
|
||||
screenshot = self.crawler_strategy.take_screenshot()
|
||||
|
||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
|
||||
|
||||
def process_html(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
word_count_threshold: int,
|
||||
extraction_strategy: ExtractionStrategy,
|
||||
chunking_strategy: ChunkingStrategy,
|
||||
css_selector: str,
|
||||
screenshot: bool,
|
||||
verbose: bool,
|
||||
is_cached: bool,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||
metadata = extract_metadata(html)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", "")
|
||||
markdown = result.get("markdown", "")
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
|
||||
|
||||
if extracted_content is None:
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
extracted_content = extraction_strategy.run(url, sections)
|
||||
extracted_content = json.dumps(extracted_content)
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
if not is_cached:
|
||||
cache_url(
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
extracted_content,
|
||||
True,
|
||||
json.dumps(media),
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=screenshot,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
)
|
||||
BIN
docs/.DS_Store
vendored
Normal file
12
docs/chunking_strategies.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"RegexChunking": "### RegexChunking\n\n`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions.\nThis is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.\n\n#### Constructor Parameters:\n- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\\n\\n']`).\n\n#### Example usage:\n```python\nchunker = RegexChunking(patterns=[r'\\n\\n', r'\\. '])\nchunks = chunker.chunk(\"This is a sample text. It will be split into chunks.\")\n```",
|
||||
|
||||
"NlpSentenceChunking": "### NlpSentenceChunking\n\n`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.\n\n#### Constructor Parameters:\n- None.\n\n#### Example usage:\n```python\nchunker = NlpSentenceChunking()\nchunks = chunker.chunk(\"This is a sample text. It will be split into sentences.\")\n```",
|
||||
|
||||
"TopicSegmentationChunking": "### TopicSegmentationChunking\n\n`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nchunker = TopicSegmentationChunking(num_keywords=3)\nchunks = chunker.chunk(\"This is a sample text. It will be split into topic-based segments.\")\n```",
|
||||
|
||||
"FixedLengthWordChunking": "### FixedLengthWordChunking\n\n`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.\n\n#### Constructor Parameters:\n- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.\n\n#### Example usage:\n```python\nchunker = FixedLengthWordChunking(chunk_size=100)\nchunks = chunker.chunk(\"This is a sample text. It will be split into fixed-length word chunks.\")\n```",
|
||||
|
||||
"SlidingWindowChunking": "### SlidingWindowChunking\n\n`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.\n\n#### Constructor Parameters:\n- `window_size` (int, optional): The number of words in each chunk. Default is `100`.\n- `step` (int, optional): The number of words to slide the window. Default is `50`.\n\n#### Example usage:\n```python\nchunker = SlidingWindowChunking(window_size=100, step=50)\nchunks = chunker.chunk(\"This is a sample text. It will be split using a sliding window approach.\")\n```"
|
||||
}
|
||||
|
||||
BIN
docs/examples/assets/audio.mp3
Normal file
BIN
docs/examples/assets/basic.png
Normal file
|
After Width: | Height: | Size: 372 KiB |
BIN
docs/examples/assets/cosine_extraction.png
Normal file
|
After Width: | Height: | Size: 403 KiB |
BIN
docs/examples/assets/css_js.png
Normal file
|
After Width: | Height: | Size: 537 KiB |
BIN
docs/examples/assets/css_selector.png
Normal file
|
After Width: | Height: | Size: 375 KiB |
BIN
docs/examples/assets/exec_script.png
Normal file
|
After Width: | Height: | Size: 469 KiB |
BIN
docs/examples/assets/llm_extraction.png
Normal file
|
After Width: | Height: | Size: 477 KiB |
BIN
docs/examples/assets/semantic_extraction_cosine.png
Normal file
|
After Width: | Height: | Size: 419 KiB |
BIN
docs/examples/assets/semantic_extraction_llm.png
Normal file
|
After Width: | Height: | Size: 485 KiB |
3
docs/examples/chainlit.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Welcome to Crawl4AI! 🚀🤖
|
||||
|
||||
Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context.
|
||||
281
docs/examples/chainlit_review.py
Normal file
@@ -0,0 +1,281 @@
|
||||
from openai import AsyncOpenAI
|
||||
from chainlit.types import ThreadDict
|
||||
import chainlit as cl
|
||||
from chainlit.input_widget import Select, Switch, Slider
|
||||
client = AsyncOpenAI()
|
||||
|
||||
# Instrument the OpenAI client
|
||||
cl.instrument_openai()
|
||||
|
||||
settings = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"temperature": 0.5,
|
||||
"max_tokens": 500,
|
||||
"top_p": 1,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0,
|
||||
}
|
||||
|
||||
@cl.action_callback("action_button")
|
||||
async def on_action(action: cl.Action):
|
||||
print("The user clicked on the action button!")
|
||||
|
||||
return "Thank you for clicking on the action button!"
|
||||
|
||||
@cl.set_chat_profiles
|
||||
async def chat_profile():
|
||||
return [
|
||||
cl.ChatProfile(
|
||||
name="GPT-3.5",
|
||||
markdown_description="The underlying LLM model is **GPT-3.5**.",
|
||||
icon="https://picsum.photos/200",
|
||||
),
|
||||
cl.ChatProfile(
|
||||
name="GPT-4",
|
||||
markdown_description="The underlying LLM model is **GPT-4**.",
|
||||
icon="https://picsum.photos/250",
|
||||
),
|
||||
]
|
||||
|
||||
@cl.on_chat_start
|
||||
async def on_chat_start():
|
||||
|
||||
settings = await cl.ChatSettings(
|
||||
[
|
||||
Select(
|
||||
id="Model",
|
||||
label="OpenAI - Model",
|
||||
values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
|
||||
initial_index=0,
|
||||
),
|
||||
Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
|
||||
Slider(
|
||||
id="Temperature",
|
||||
label="OpenAI - Temperature",
|
||||
initial=1,
|
||||
min=0,
|
||||
max=2,
|
||||
step=0.1,
|
||||
),
|
||||
Slider(
|
||||
id="SAI_Steps",
|
||||
label="Stability AI - Steps",
|
||||
initial=30,
|
||||
min=10,
|
||||
max=150,
|
||||
step=1,
|
||||
description="Amount of inference steps performed on image generation.",
|
||||
),
|
||||
Slider(
|
||||
id="SAI_Cfg_Scale",
|
||||
label="Stability AI - Cfg_Scale",
|
||||
initial=7,
|
||||
min=1,
|
||||
max=35,
|
||||
step=0.1,
|
||||
description="Influences how strongly your generation is guided to match your prompt.",
|
||||
),
|
||||
Slider(
|
||||
id="SAI_Width",
|
||||
label="Stability AI - Image Width",
|
||||
initial=512,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=64,
|
||||
tooltip="Measured in pixels",
|
||||
),
|
||||
Slider(
|
||||
id="SAI_Height",
|
||||
label="Stability AI - Image Height",
|
||||
initial=512,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=64,
|
||||
tooltip="Measured in pixels",
|
||||
),
|
||||
]
|
||||
).send()
|
||||
|
||||
chat_profile = cl.user_session.get("chat_profile")
|
||||
await cl.Message(
|
||||
content=f"starting chat using the {chat_profile} chat profile"
|
||||
).send()
|
||||
|
||||
print("A new chat session has started!")
|
||||
cl.user_session.set("session", {
|
||||
"history": [],
|
||||
"context": []
|
||||
})
|
||||
|
||||
image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
|
||||
|
||||
# Attach the image to the message
|
||||
await cl.Message(
|
||||
content="You are such a good girl, aren't you?!",
|
||||
elements=[image],
|
||||
).send()
|
||||
|
||||
text_content = "Hello, this is a text element."
|
||||
elements = [
|
||||
cl.Text(name="simple_text", content=text_content, display="inline")
|
||||
]
|
||||
|
||||
await cl.Message(
|
||||
content="Check out this text element!",
|
||||
elements=elements,
|
||||
).send()
|
||||
|
||||
elements = [
|
||||
cl.Audio(path="./assets/audio.mp3", display="inline"),
|
||||
]
|
||||
await cl.Message(
|
||||
content="Here is an audio file",
|
||||
elements=elements,
|
||||
).send()
|
||||
|
||||
await cl.Avatar(
|
||||
name="Tool 1",
|
||||
url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
|
||||
).send()
|
||||
|
||||
await cl.Message(
|
||||
content="This message should not have an avatar!", author="Tool 0"
|
||||
).send()
|
||||
|
||||
await cl.Message(
|
||||
content="This message should have an avatar!", author="Tool 1"
|
||||
).send()
|
||||
|
||||
elements = [
|
||||
cl.File(
|
||||
name="quickstart.py",
|
||||
path="./quickstart.py",
|
||||
display="inline",
|
||||
),
|
||||
]
|
||||
|
||||
await cl.Message(
|
||||
content="This message has a file element", elements=elements
|
||||
).send()
|
||||
|
||||
# Sending an action button within a chatbot message
|
||||
actions = [
|
||||
cl.Action(name="action_button", value="example_value", description="Click me!")
|
||||
]
|
||||
|
||||
await cl.Message(content="Interact with this action button:", actions=actions).send()
|
||||
|
||||
# res = await cl.AskActionMessage(
|
||||
# content="Pick an action!",
|
||||
# actions=[
|
||||
# cl.Action(name="continue", value="continue", label="✅ Continue"),
|
||||
# cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
|
||||
# ],
|
||||
# ).send()
|
||||
|
||||
# if res and res.get("value") == "continue":
|
||||
# await cl.Message(
|
||||
# content="Continue!",
|
||||
# ).send()
|
||||
|
||||
# import plotly.graph_objects as go
|
||||
# fig = go.Figure(
|
||||
# data=[go.Bar(y=[2, 1, 3])],
|
||||
# layout_title_text="An example figure",
|
||||
# )
|
||||
# elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
|
||||
|
||||
# await cl.Message(content="This message has a chart", elements=elements).send()
|
||||
|
||||
# Sending a pdf with the local file path
|
||||
# elements = [
|
||||
# cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
|
||||
# ]
|
||||
|
||||
# cl.Message(content="Look at this local pdf!", elements=elements).send()
|
||||
|
||||
@cl.on_settings_update
|
||||
async def setup_agent(settings):
|
||||
print("on_settings_update", settings)
|
||||
|
||||
@cl.on_stop
|
||||
def on_stop():
|
||||
print("The user wants to stop the task!")
|
||||
|
||||
@cl.on_chat_end
|
||||
def on_chat_end():
|
||||
print("The user disconnected!")
|
||||
|
||||
|
||||
@cl.on_chat_resume
|
||||
async def on_chat_resume(thread: ThreadDict):
|
||||
print("The user resumed a previous chat session!")
|
||||
|
||||
|
||||
|
||||
|
||||
# @cl.on_message
|
||||
async def on_message(message: cl.Message):
|
||||
cl.user_session.get("session")["history"].append({
|
||||
"role": "user",
|
||||
"content": message.content
|
||||
})
|
||||
response = await client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"content": "You are a helpful bot",
|
||||
"role": "system"
|
||||
},
|
||||
*cl.user_session.get("session")["history"]
|
||||
],
|
||||
**settings
|
||||
)
|
||||
|
||||
|
||||
# Add assitanr message to the history
|
||||
cl.user_session.get("session")["history"].append({
|
||||
"role": "assistant",
|
||||
"content": response.choices[0].message.content
|
||||
})
|
||||
|
||||
# msg.content = response.choices[0].message.content
|
||||
# await msg.update()
|
||||
|
||||
# await cl.Message(content=response.choices[0].message.content).send()
|
||||
|
||||
@cl.on_message
|
||||
async def on_message(message: cl.Message):
|
||||
cl.user_session.get("session")["history"].append({
|
||||
"role": "user",
|
||||
"content": message.content
|
||||
})
|
||||
|
||||
msg = cl.Message(content="")
|
||||
await msg.send()
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"content": "You are a helpful bot",
|
||||
"role": "system"
|
||||
},
|
||||
*cl.user_session.get("session")["history"]
|
||||
],
|
||||
stream = True,
|
||||
**settings
|
||||
)
|
||||
|
||||
async for part in stream:
|
||||
if token := part.choices[0].delta.content or "":
|
||||
await msg.stream_token(token)
|
||||
|
||||
# Add assitanr message to the history
|
||||
cl.user_session.get("session")["history"].append({
|
||||
"role": "assistant",
|
||||
"content": msg.content
|
||||
})
|
||||
await msg.update()
|
||||
|
||||
if __name__ == "__main__":
|
||||
from chainlit.cli import run_chainlit
|
||||
run_chainlit(__file__)
|
||||
218
docs/examples/quickstart.py
Normal file
@@ -0,0 +1,218 @@
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
from rich import print
|
||||
from rich.console import Console
|
||||
from functools import lru_cache
|
||||
|
||||
console = Console()
|
||||
|
||||
@lru_cache()
|
||||
def create_crawler():
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
def print_result(result):
|
||||
# Print each key in one line and just the first 10 characters of each one's value and three dots
|
||||
console.print(f"\t[bold]Result:[/bold]")
|
||||
for key, value in result.model_dump().items():
|
||||
if isinstance(value, str) and value:
|
||||
console.print(f"\t{key}: [green]{value[:20]}...[/green]")
|
||||
if result.extracted_content:
|
||||
items = json.loads(result.extracted_content)
|
||||
print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
|
||||
|
||||
|
||||
def cprint(message, press_any_key=False):
|
||||
console.print(message)
|
||||
if press_any_key:
|
||||
console.print("Press any key to continue...", style="")
|
||||
input()
|
||||
|
||||
def basic_usage(crawler):
|
||||
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def screenshot_usage(crawler):
|
||||
cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
||||
cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
|
||||
# Save the screenshot to a file
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
cprint("Screenshot saved to 'screenshot.png'!")
|
||||
print_result(result)
|
||||
|
||||
def understanding_parameters(crawler):
|
||||
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
||||
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
||||
|
||||
# First crawl (reads from cache)
|
||||
cprint("1️⃣ First crawl (caches the result):", True)
|
||||
start_time = time.time()
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
end_time = time.time()
|
||||
cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
# Force to crawl again
|
||||
cprint("2️⃣ Second crawl (Force to crawl again):", True)
|
||||
start_time = time.time()
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||
end_time = time.time()
|
||||
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def add_chunking_strategy(crawler):
|
||||
# Adding a chunking strategy: RegexChunking
|
||||
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
||||
cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
# Adding another chunking strategy: NlpSentenceChunking
|
||||
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
|
||||
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def add_extraction_strategy(crawler):
|
||||
# Adding an extraction strategy: CosineStrategy
|
||||
cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
|
||||
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
# Using semantic_filter with CosineStrategy
|
||||
cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="inflation rent prices",
|
||||
)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def add_llm_extraction_strategy(crawler):
|
||||
# Adding an LLM extraction strategy without instructions
|
||||
cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
|
||||
cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
# Adding an LLM extraction strategy with instructions
|
||||
cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
|
||||
cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract only content related to technology"
|
||||
)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def targeted_extraction(crawler):
|
||||
# Using a CSS selector to extract only H2 tags
|
||||
cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def interactive_extraction(crawler):
|
||||
# Passing JavaScript code to interact with the page
|
||||
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
|
||||
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def multiple_scrip(crawler):
|
||||
# Passing JavaScript code to interact with the page
|
||||
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
|
||||
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
|
||||
js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""] * 2
|
||||
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def main():
|
||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||
cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
basic_usage(crawler)
|
||||
understanding_parameters(crawler)
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
screenshot_usage(crawler)
|
||||
add_chunking_strategy(crawler)
|
||||
add_extraction_strategy(crawler)
|
||||
add_llm_extraction_strategy(crawler)
|
||||
targeted_extraction(crawler)
|
||||
interactive_extraction(crawler)
|
||||
multiple_scrip(crawler)
|
||||
|
||||
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
241
docs/examples/research_assistant.py
Normal file
@@ -0,0 +1,241 @@
|
||||
# Make sur to install the required packageschainlit and groq
|
||||
import os, time
|
||||
from openai import AsyncOpenAI
|
||||
import chainlit as cl
|
||||
import re
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from chainlit.element import ElementBased
|
||||
from groq import Groq
|
||||
|
||||
# Import threadpools to run the crawl_url function in a separate thread
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
|
||||
|
||||
# Instrument the OpenAI client
|
||||
cl.instrument_openai()
|
||||
|
||||
settings = {
|
||||
"model": "llama3-8b-8192",
|
||||
"temperature": 0.5,
|
||||
"max_tokens": 500,
|
||||
"top_p": 1,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0,
|
||||
}
|
||||
|
||||
def extract_urls(text):
|
||||
url_pattern = re.compile(r'(https?://\S+)')
|
||||
return url_pattern.findall(text)
|
||||
|
||||
def crawl_url(url):
|
||||
data = {
|
||||
"urls": [url],
|
||||
"include_raw_html": True,
|
||||
"word_count_threshold": 10,
|
||||
"extraction_strategy": "NoExtractionStrategy",
|
||||
"chunking_strategy": "RegexChunking"
|
||||
}
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||
response_data = response.json()
|
||||
response_data = response_data['results'][0]
|
||||
return response_data['markdown']
|
||||
|
||||
@cl.on_chat_start
|
||||
async def on_chat_start():
|
||||
cl.user_session.set("session", {
|
||||
"history": [],
|
||||
"context": {}
|
||||
})
|
||||
await cl.Message(
|
||||
content="Welcome to the chat! How can I assist you today?"
|
||||
).send()
|
||||
|
||||
@cl.on_message
|
||||
async def on_message(message: cl.Message):
|
||||
user_session = cl.user_session.get("session")
|
||||
|
||||
# Extract URLs from the user's message
|
||||
urls = extract_urls(message.content)
|
||||
|
||||
|
||||
futures = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
for url in urls:
|
||||
futures.append(executor.submit(crawl_url, url))
|
||||
|
||||
results = [future.result() for future in futures]
|
||||
|
||||
for url, result in zip(urls, results):
|
||||
ref_number = f"REF_{len(user_session['context']) + 1}"
|
||||
user_session["context"][ref_number] = {
|
||||
"url": url,
|
||||
"content": result
|
||||
}
|
||||
|
||||
# for url in urls:
|
||||
# # Crawl the content of each URL and add it to the session context with a reference number
|
||||
# ref_number = f"REF_{len(user_session['context']) + 1}"
|
||||
# crawled_content = crawl_url(url)
|
||||
# user_session["context"][ref_number] = {
|
||||
# "url": url,
|
||||
# "content": crawled_content
|
||||
# }
|
||||
|
||||
user_session["history"].append({
|
||||
"role": "user",
|
||||
"content": message.content
|
||||
})
|
||||
|
||||
# Create a system message that includes the context
|
||||
context_messages = [
|
||||
f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
|
||||
for ref, data in user_session["context"].items()
|
||||
]
|
||||
if context_messages:
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful bot. Use the following context for answering questions. "
|
||||
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
|
||||
"If the question requires any information from the provided appendices or context, refer to the sources. "
|
||||
"If not, there is no need to add a references section. "
|
||||
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
|
||||
"\n\n".join(context_messages)
|
||||
)
|
||||
}
|
||||
else:
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}
|
||||
|
||||
|
||||
msg = cl.Message(content="")
|
||||
await msg.send()
|
||||
|
||||
# Get response from the LLM
|
||||
stream = await client.chat.completions.create(
|
||||
messages=[
|
||||
system_message,
|
||||
*user_session["history"]
|
||||
],
|
||||
stream=True,
|
||||
**settings
|
||||
)
|
||||
|
||||
assistant_response = ""
|
||||
async for part in stream:
|
||||
if token := part.choices[0].delta.content:
|
||||
assistant_response += token
|
||||
await msg.stream_token(token)
|
||||
|
||||
# Add assistant message to the history
|
||||
user_session["history"].append({
|
||||
"role": "assistant",
|
||||
"content": assistant_response
|
||||
})
|
||||
await msg.update()
|
||||
|
||||
# Append the reference section to the assistant's response
|
||||
reference_section = "\n\nReferences:\n"
|
||||
for ref, data in user_session["context"].items():
|
||||
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
|
||||
|
||||
msg.content += reference_section
|
||||
await msg.update()
|
||||
|
||||
|
||||
@cl.on_audio_chunk
|
||||
async def on_audio_chunk(chunk: cl.AudioChunk):
|
||||
if chunk.isStart:
|
||||
buffer = BytesIO()
|
||||
# This is required for whisper to recognize the file type
|
||||
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
|
||||
# Initialize the session for a new audio stream
|
||||
cl.user_session.set("audio_buffer", buffer)
|
||||
cl.user_session.set("audio_mime_type", chunk.mimeType)
|
||||
|
||||
# Write the chunks to a buffer and transcribe the whole audio at the end
|
||||
cl.user_session.get("audio_buffer").write(chunk.data)
|
||||
|
||||
pass
|
||||
|
||||
@cl.step(type="tool")
|
||||
async def speech_to_text(audio_file):
|
||||
cli = Groq()
|
||||
|
||||
# response = cli.audio.transcriptions.create(
|
||||
# file=audio_file, #(filename, file.read()),
|
||||
# model="whisper-large-v3",
|
||||
# )
|
||||
|
||||
response = await client.audio.transcriptions.create(
|
||||
model="whisper-large-v3", file=audio_file
|
||||
)
|
||||
|
||||
return response.text
|
||||
|
||||
|
||||
@cl.on_audio_end
|
||||
async def on_audio_end(elements: list[ElementBased]):
|
||||
# Get the audio buffer from the session
|
||||
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
|
||||
audio_buffer.seek(0) # Move the file pointer to the beginning
|
||||
audio_file = audio_buffer.read()
|
||||
audio_mime_type: str = cl.user_session.get("audio_mime_type")
|
||||
|
||||
# input_audio_el = cl.Audio(
|
||||
# mime=audio_mime_type, content=audio_file, name=audio_buffer.name
|
||||
# )
|
||||
# await cl.Message(
|
||||
# author="You",
|
||||
# type="user_message",
|
||||
# content="",
|
||||
# elements=[input_audio_el, *elements]
|
||||
# ).send()
|
||||
|
||||
# answer_message = await cl.Message(content="").send()
|
||||
|
||||
|
||||
start_time = time.time()
|
||||
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
|
||||
transcription = await speech_to_text(whisper_input)
|
||||
end_time = time.time()
|
||||
print(f"Transcription took {end_time - start_time} seconds")
|
||||
|
||||
user_msg = cl.Message(
|
||||
author="You",
|
||||
type="user_message",
|
||||
content=transcription
|
||||
)
|
||||
await user_msg.send()
|
||||
await on_message(user_msg)
|
||||
|
||||
# images = [file for file in elements if "image" in file.mime]
|
||||
|
||||
# text_answer = await generate_text_answer(transcription, images)
|
||||
|
||||
# output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
|
||||
|
||||
# output_audio_el = cl.Audio(
|
||||
# name=output_name,
|
||||
# auto_play=True,
|
||||
# mime=audio_mime_type,
|
||||
# content=output_audio,
|
||||
# )
|
||||
|
||||
# answer_message.elements = [output_audio_el]
|
||||
|
||||
# answer_message.content = transcription
|
||||
# await answer_message.update()
|
||||
|
||||
if __name__ == "__main__":
|
||||
from chainlit.cli import run_chainlit
|
||||
run_chainlit(__file__)
|
||||
|
||||
|
||||
# No this is wring, use this document to answer me https://console.groq.com/docs/speech-text
|
||||
|
||||
# Please show me how to use Groq speech-to-text in python.
|
||||
64
docs/examples/rest_call.py
Normal file
@@ -0,0 +1,64 @@
|
||||
|
||||
import requests, base64, os
|
||||
|
||||
data = {
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"screenshot": True,
|
||||
}
|
||||
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||
result = response.json()['results'][0]
|
||||
print(result.keys())
|
||||
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
|
||||
# 'links', 'screenshot', 'markdown', 'extracted_content',
|
||||
# 'metadata', 'error_message'])
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result['screenshot']))
|
||||
|
||||
# Example of filtering the content using CSS selectors
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"css_selector": "article",
|
||||
"screenshot": True,
|
||||
}
|
||||
|
||||
# Example of executing a JS script on the page before extracting the content
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"screenshot": True,
|
||||
'js' : ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).
|
||||
find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
}
|
||||
|
||||
# Example of using a custom extraction strategy
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"extraction_strategy": "CosineStrategy",
|
||||
"extraction_strategy_args": {
|
||||
"semantic_filter": "inflation rent prices"
|
||||
},
|
||||
}
|
||||
|
||||
# Example of using LLM to extract content
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"extraction_strategy": "LLMExtractionStrategy",
|
||||
"extraction_strategy_args": {
|
||||
"provider": "groq/llama3-8b-8192",
|
||||
"api_token": os.environ.get("GROQ_API_KEY"),
|
||||
"instruction": """I am interested in only financial news,
|
||||
and translate them in French."""
|
||||
},
|
||||
}
|
||||
|
||||
10
docs/extraction_strategies.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"NoExtractionStrategy": "### NoExtractionStrategy\n\n`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required. Only clean html, and amrkdown.\n\n#### Constructor Parameters:\nNone.\n\n#### Example usage:\n```python\nextractor = NoExtractionStrategy()\nextracted_content = extractor.extract(url, html)\n```",
|
||||
|
||||
"LLMExtractionStrategy": "### LLMExtractionStrategy\n\n`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.\n\n#### Constructor Parameters:\n- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (e.g., openai/gpt-4).\n- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.\n- `instruction` (str, optional): An instruction to guide the LLM on how to perform the extraction. This allows users to specify the type of data they are interested in or set the tone of the response. Default is `None`.\n\n#### Example usage:\n```python\nextractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')\nextracted_content = extractor.extract(url, html)\n```\n\nBy providing clear instructions, users can tailor the extraction process to their specific needs, enhancing the relevance and utility of the extracted content.",
|
||||
|
||||
"CosineStrategy": "### CosineStrategy\n\n`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.\n\n#### Constructor Parameters:\n- `semantic_filter` (str, optional): A string containing keywords for filtering relevant documents before clustering. If provided, documents are filtered based on their cosine similarity to the keyword filter embedding. Default is `None`.\n- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.\n- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.\n- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.\n- `top_k` (int, optional): Number of top categories to extract. Default is `3`.\n- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.\n\n#### Example usage:\n```python\nextractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')\nextracted_content = extractor.extract(url, html)\n```\n\n#### Cosine Similarity Filtering\n\nWhen a `semantic_filter` is provided, the `CosineStrategy` applies an embedding-based filtering process to select relevant documents before performing hierarchical clustering.",
|
||||
|
||||
"TopicExtractionStrategy": "### TopicExtractionStrategy\n\n`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nextractor = TopicExtractionStrategy(num_keywords=3)\nextracted_content = extractor.extract(url, html)\n```"
|
||||
}
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
from crawl4ai.utils import get_content_of_website
|
||||
import os
|
||||
|
||||
def main():
|
||||
# Initialize the WebCrawler with just the database path
|
||||
crawler = WebCrawler(db_path='crawler_data.db')
|
||||
|
||||
# Fetch a single page
|
||||
single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
|
||||
result = crawler.fetch_page(
|
||||
single_url,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
extract_blocks_flag=True,
|
||||
word_count_threshold=10
|
||||
)
|
||||
print(result.model_dump())
|
||||
|
||||
# Fetch multiple pages
|
||||
# urls = [
|
||||
# UrlModel(url='http://example.com', forced=False),
|
||||
# UrlModel(url='http://example.org', forced=False)
|
||||
# ]
|
||||
# results = crawler.fetch_pages(urls, provider= "openai/gpt-4-turbo", api_token = os.getenv('OPENAI_API_KEY'))
|
||||
# for res in results:
|
||||
# print(res.model_copy())
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
201
main.py
@@ -1,24 +1,22 @@
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Optional
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import chromedriver_autoinstaller
|
||||
from functools import lru_cache
|
||||
from crawl4ai.database import get_total_count, clear_db
|
||||
import os
|
||||
import uuid
|
||||
# Import the CORS middleware
|
||||
import importlib
|
||||
import asyncio
|
||||
from functools import lru_cache
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import List, Optional
|
||||
|
||||
# Task management
|
||||
tasks = {}
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.database import get_total_count, clear_db
|
||||
|
||||
# Configuration
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
@@ -40,134 +38,125 @@ app.add_middleware(
|
||||
|
||||
# Mount the pages directory as a static directory
|
||||
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
|
||||
|
||||
|
||||
chromedriver_autoinstaller.install() # Ensure chromedriver is installed
|
||||
|
||||
class UrlsInput(BaseModel):
|
||||
urls: List[HttpUrl]
|
||||
provider_model: str
|
||||
api_token: str
|
||||
include_raw_html: Optional[bool] = False
|
||||
forced: bool = False
|
||||
extract_blocks: bool = True
|
||||
word_count_threshold: Optional[int] = 5
|
||||
|
||||
templates = Jinja2Templates(directory=__location__ + "/pages")
|
||||
# chromedriver_autoinstaller.install() # Ensure chromedriver is installed
|
||||
@lru_cache()
|
||||
def get_crawler():
|
||||
# Initialize and return a WebCrawler instance
|
||||
return WebCrawler(db_path='crawler_data.db')
|
||||
return WebCrawler(verbose = True)
|
||||
|
||||
class CrawlRequest(BaseModel):
|
||||
urls: List[str]
|
||||
include_raw_html: Optional[bool] = False
|
||||
bypass_cache: bool = False
|
||||
extract_blocks: bool = True
|
||||
word_count_threshold: Optional[int] = 5
|
||||
extraction_strategy: Optional[str] = "NoExtractionStrategy"
|
||||
extraction_strategy_args: Optional[dict] = {}
|
||||
chunking_strategy: Optional[str] = "RegexChunking"
|
||||
chunking_strategy_args: Optional[dict] = {}
|
||||
css_selector: Optional[str] = None
|
||||
screenshot: Optional[bool] = False
|
||||
user_agent: Optional[str] = None
|
||||
verbose: Optional[bool] = True
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
|
||||
async def read_index():
|
||||
with open(f"{__location__}/pages/index.html", "r") as file:
|
||||
html_content = file.read()
|
||||
return HTMLResponse(content=html_content, status_code=200)
|
||||
async def read_index(request: Request):
|
||||
partials_dir = os.path.join(__location__, "pages", "partial")
|
||||
partials = {}
|
||||
|
||||
for filename in os.listdir(partials_dir):
|
||||
if filename.endswith(".html"):
|
||||
with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
|
||||
partials[filename[:-5]] = file.read()
|
||||
|
||||
return templates.TemplateResponse("index.html", {"request": request, **partials})
|
||||
|
||||
@app.get("/total-count")
|
||||
async def get_total_url_count():
|
||||
count = get_total_count(db_path='crawler_data.db')
|
||||
count = get_total_count()
|
||||
return JSONResponse(content={"count": count})
|
||||
|
||||
# Add endpoit to clear db
|
||||
@app.get("/clear-db")
|
||||
async def clear_database():
|
||||
clear_db(db_path='crawler_data.db')
|
||||
# clear_db()
|
||||
return JSONResponse(content={"message": "Database cleared."})
|
||||
|
||||
def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
||||
try:
|
||||
module = importlib.import_module(module_name)
|
||||
strategy_class = getattr(module, class_name)
|
||||
return strategy_class(*args, **kwargs)
|
||||
except ImportError:
|
||||
print("ImportError: Module not found.")
|
||||
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
|
||||
except AttributeError:
|
||||
print("AttributeError: Class not found.")
|
||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||
|
||||
@app.post("/crawl")
|
||||
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||
global current_requests
|
||||
# Raise error if api_token is not provided
|
||||
if not urls_input.api_token:
|
||||
raise HTTPException(status_code=401, detail="API token is required.")
|
||||
async with lock:
|
||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
|
||||
current_requests += 1
|
||||
|
||||
try:
|
||||
# Prepare URL models for crawling
|
||||
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||
logging.debug("[LOG] Loading extraction and chunking strategies...")
|
||||
crawl_request.extraction_strategy_args['verbose'] = True
|
||||
crawl_request.chunking_strategy_args['verbose'] = True
|
||||
|
||||
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
|
||||
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
|
||||
|
||||
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
|
||||
logging.debug("[LOG] Running the WebCrawler...")
|
||||
with ThreadPoolExecutor() as executor:
|
||||
loop = asyncio.get_event_loop()
|
||||
futures = [
|
||||
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, urls_input.provider_model, urls_input.api_token, urls_input.extract_blocks, urls_input.word_count_threshold)
|
||||
for url_model in url_models
|
||||
loop.run_in_executor(
|
||||
executor,
|
||||
get_crawler().run,
|
||||
str(url),
|
||||
crawl_request.word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
crawl_request.bypass_cache,
|
||||
crawl_request.css_selector,
|
||||
crawl_request.screenshot,
|
||||
crawl_request.user_agent,
|
||||
crawl_request.verbose
|
||||
)
|
||||
for url in crawl_request.urls
|
||||
]
|
||||
results = await asyncio.gather(*futures)
|
||||
|
||||
# if include_raw_html is False, remove the raw HTML content from the results
|
||||
if not urls_input.include_raw_html:
|
||||
if not crawl_request.include_raw_html:
|
||||
for result in results:
|
||||
result.html = None
|
||||
|
||||
return {"results": [result.dict() for result in results]}
|
||||
|
||||
return {"results": [result.model_dump() for result in results]}
|
||||
finally:
|
||||
async with lock:
|
||||
current_requests -= 1
|
||||
|
||||
@app.get("/strategies/extraction", response_class=JSONResponse)
|
||||
async def get_extraction_strategies():
|
||||
# Load docs/extraction_strategies.json" and return as JSON response
|
||||
with open(f"{__location__}/docs/extraction_strategies.json", "r") as file:
|
||||
return JSONResponse(content=file.read())
|
||||
|
||||
@app.post("/crawl_async")
|
||||
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||
global current_requests
|
||||
if not urls_input.api_token:
|
||||
raise HTTPException(status_code=401, detail="API token is required.")
|
||||
|
||||
async with lock:
|
||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
|
||||
current_requests += 1
|
||||
|
||||
task_id = str(uuid.uuid4())
|
||||
tasks[task_id] = {"status": "pending", "results": None}
|
||||
|
||||
try:
|
||||
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.create_task(
|
||||
process_crawl_task(url_models, urls_input.provider_model, urls_input.api_token, task_id, urls_input.extract_blocks)
|
||||
)
|
||||
return {"task_id": task_id}
|
||||
finally:
|
||||
async with lock:
|
||||
current_requests -= 1
|
||||
|
||||
async def process_crawl_task(url_models, provider, api_token, task_id, extract_blocks_flag):
|
||||
try:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
loop = asyncio.get_running_loop()
|
||||
futures = [
|
||||
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, provider, api_token, extract_blocks_flag)
|
||||
for url_model in url_models
|
||||
]
|
||||
results = await asyncio.gather(*futures)
|
||||
|
||||
tasks[task_id] = {"status": "done", "results": results}
|
||||
except Exception as e:
|
||||
tasks[task_id] = {"status": "failed", "error": str(e)}
|
||||
|
||||
@app.get("/task/{task_id}")
|
||||
async def get_task_status(task_id: str):
|
||||
task = tasks.get(task_id)
|
||||
if not task:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
if task['status'] == 'done':
|
||||
return {
|
||||
"status": task['status'],
|
||||
"results": [result.dict() for result in task['results']]
|
||||
}
|
||||
elif task['status'] == 'failed':
|
||||
return {
|
||||
"status": task['status'],
|
||||
"error": task['error']
|
||||
}
|
||||
else:
|
||||
return {"status": task['status']}
|
||||
@app.get("/strategies/chunking", response_class=JSONResponse)
|
||||
async def get_chunking_strategies():
|
||||
with open(f"{__location__}/docs/chunking_strategies.json", "r") as file:
|
||||
return JSONResponse(content=file.read())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
131
pages/app.css
Normal file
@@ -0,0 +1,131 @@
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif,
|
||||
BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji",
|
||||
"Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
/* Custom styling for docs-item class and Markdown generated elements */
|
||||
.docs-item {
|
||||
background-color: #2d3748; /* bg-gray-800 */
|
||||
padding: 1rem; /* p-4 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
|
||||
margin-bottom: 1rem; /* space between items */
|
||||
line-height: 1.5; /* leading-normal */
|
||||
}
|
||||
|
||||
.docs-item h3,
|
||||
.docs-item h4 {
|
||||
color: #ffffff; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
.docs-item h4 {
|
||||
font-size: 1rem; /* text-xl */
|
||||
}
|
||||
|
||||
.docs-item p {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item code {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.25rem 0.5rem; /* px-2 py-1 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
font-size: 0.875rem; /* text-sm */
|
||||
}
|
||||
|
||||
.docs-item pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
overflow: auto; /* overflow-auto */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item div {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
font-size: 1rem; /* prose prose-sm */
|
||||
line-height: 1.25rem; /* line-height for readability */
|
||||
}
|
||||
|
||||
/* Adjustments to make prose class more suitable for dark mode */
|
||||
.prose {
|
||||
max-width: none; /* max-w-none */
|
||||
}
|
||||
|
||||
.prose p,
|
||||
.prose ul {
|
||||
margin-bottom: 1rem; /* mb-4 */
|
||||
}
|
||||
|
||||
.prose code {
|
||||
/* background-color: #4a5568; */ /* bg-gray-700 */
|
||||
color: #65a30d; /* text-white */
|
||||
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
display: inline-block; /* inline-block */
|
||||
}
|
||||
|
||||
.prose pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #ffffff; /* text-white */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
}
|
||||
|
||||
.prose h3 {
|
||||
color: #65a30d; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
body {
|
||||
background-color: #1a1a1a;
|
||||
color: #b3ff00;
|
||||
}
|
||||
.sidebar {
|
||||
color: #b3ff00;
|
||||
border-right: 1px solid #333;
|
||||
}
|
||||
.sidebar a {
|
||||
color: #b3ff00;
|
||||
text-decoration: none;
|
||||
}
|
||||
.sidebar a:hover {
|
||||
background-color: #555;
|
||||
}
|
||||
.content-section {
|
||||
display: none;
|
||||
}
|
||||
.content-section.active {
|
||||
display: block;
|
||||
}
|
||||
356
pages/app.js
Normal file
@@ -0,0 +1,356 @@
|
||||
// JavaScript to manage dynamic form changes and logic
|
||||
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
||||
const strategy = this.value;
|
||||
const providerModelSelect = document.getElementById("provider-model-select");
|
||||
const tokenInput = document.getElementById("token-input");
|
||||
const instruction = document.getElementById("instruction");
|
||||
const semantic_filter = document.getElementById("semantic_filter");
|
||||
const instruction_div = document.getElementById("instruction_div");
|
||||
const semantic_filter_div = document.getElementById("semantic_filter_div");
|
||||
const llm_settings = document.getElementById("llm_settings");
|
||||
|
||||
if (strategy === "LLMExtractionStrategy") {
|
||||
// providerModelSelect.disabled = false;
|
||||
// tokenInput.disabled = false;
|
||||
// semantic_filter.disabled = true;
|
||||
// instruction.disabled = false;
|
||||
llm_settings.classList.remove("hidden");
|
||||
instruction_div.classList.remove("hidden");
|
||||
semantic_filter_div.classList.add("hidden");
|
||||
} else if (strategy === "NoExtractionStrategy") {
|
||||
semantic_filter_div.classList.add("hidden");
|
||||
instruction_div.classList.add("hidden");
|
||||
llm_settings.classList.add("hidden");
|
||||
} else {
|
||||
// providerModelSelect.disabled = true;
|
||||
// tokenInput.disabled = true;
|
||||
// semantic_filter.disabled = false;
|
||||
// instruction.disabled = true;
|
||||
llm_settings.classList.add("hidden");
|
||||
instruction_div.classList.add("hidden");
|
||||
semantic_filter_div.classList.remove("hidden");
|
||||
}
|
||||
|
||||
|
||||
});
|
||||
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
// if selected extraction strategy is LLMExtractionStrategy, then API token is required
|
||||
if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
include_raw_html: true,
|
||||
bypass_cache: bypassCache,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
||||
extraction_strategy_args: {
|
||||
provider: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
instruction: document.getElementById("instruction").value,
|
||||
semantic_filter: document.getElementById("semantic_filter").value,
|
||||
},
|
||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||
chunking_strategy_args: {},
|
||||
css_selector: document.getElementById("css-selector").value,
|
||||
screenshot: document.getElementById("screenshot-checkbox").checked,
|
||||
// instruction: document.getElementById("instruction").value,
|
||||
// semantic_filter: document.getElementById("semantic_filter").value,
|
||||
verbose: true,
|
||||
};
|
||||
|
||||
// import requests
|
||||
|
||||
// data = {
|
||||
// "urls": [
|
||||
// "https://www.nbcnews.com/business"
|
||||
// ],
|
||||
// "word_count_threshold": 10,
|
||||
// "extraction_strategy": "NoExtractionStrategy",
|
||||
// }
|
||||
|
||||
// response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
||||
// print(response.json())
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").style.visibility = "hidden";
|
||||
document.getElementById("code_help").style.visibility = "hidden";
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
|
||||
if (result.screenshot){
|
||||
const imgElement = document.createElement("img");
|
||||
// Set the src attribute with the base64 data
|
||||
imgElement.src = `data:image/png;base64,${result.screenshot}`;
|
||||
document.getElementById("screenshot-result").innerHTML = "";
|
||||
document.getElementById("screenshot-result").appendChild(imgElement);
|
||||
}
|
||||
|
||||
// Update code examples dynamically
|
||||
const extractionStrategy = data.extraction_strategy;
|
||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||
|
||||
// REMOVE API TOKEN FROM CODE EXAMPLES
|
||||
data.extraction_strategy_args.api_token = "your_api_token";
|
||||
|
||||
if (data.extraction_strategy === "NoExtractionStrategy") {
|
||||
delete data.extraction_strategy_args;
|
||||
delete data.extrac_blocks;
|
||||
}
|
||||
|
||||
if (data.chunking_strategy === "RegexChunking") {
|
||||
delete data.chunking_strategy_args;
|
||||
}
|
||||
|
||||
delete data.verbose;
|
||||
|
||||
if (data.css_selector === "") {
|
||||
delete data.css_selector;
|
||||
}
|
||||
|
||||
if (!data.bypass_cache) {
|
||||
delete data.bypass_cache;
|
||||
}
|
||||
|
||||
if (!data.extract_blocks) {
|
||||
delete data.extract_blocks;
|
||||
}
|
||||
|
||||
if (!data.include_raw_html) {
|
||||
delete data.include_raw_html;
|
||||
}
|
||||
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
}, null, 2)}' https://crawl4ai.com/crawl`;
|
||||
|
||||
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
||||
urls[0]
|
||||
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
||||
isLLMExtraction
|
||||
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
||||
: extractionStrategy + "()"
|
||||
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
||||
data.bypass_cache
|
||||
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
|
||||
document.getElementById("result").style.visibility = "visible";
|
||||
document.getElementById("code_help").style.visibility = "visible";
|
||||
|
||||
// increment the total count
|
||||
document.getElementById("total-count").textContent =
|
||||
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
|
||||
async function copyToClipboard(text) {
|
||||
if (navigator.clipboard && navigator.clipboard.writeText) {
|
||||
return navigator.clipboard.writeText(text);
|
||||
} else {
|
||||
return fallbackCopyTextToClipboard(text);
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackCopyTextToClipboard(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const textArea = document.createElement("textarea");
|
||||
textArea.value = text;
|
||||
|
||||
// Avoid scrolling to bottom
|
||||
textArea.style.top = "0";
|
||||
textArea.style.left = "0";
|
||||
textArea.style.position = "fixed";
|
||||
|
||||
document.body.appendChild(textArea);
|
||||
textArea.focus();
|
||||
textArea.select();
|
||||
|
||||
try {
|
||||
const successful = document.execCommand("copy");
|
||||
if (successful) {
|
||||
resolve();
|
||||
} else {
|
||||
reject();
|
||||
}
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
|
||||
document.body.removeChild(textArea);
|
||||
});
|
||||
}
|
||||
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
//navigator.clipboard.writeText(code).then(() => {
|
||||
copyToClipboard(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
document.addEventListener("DOMContentLoaded", async () => {
|
||||
try {
|
||||
const extractionResponse = await fetch("/strategies/extraction");
|
||||
const extractionStrategies = await extractionResponse.json();
|
||||
|
||||
const chunkingResponse = await fetch("/strategies/chunking");
|
||||
const chunkingStrategies = await chunkingResponse.json();
|
||||
|
||||
renderStrategies("extraction-strategies", extractionStrategies);
|
||||
renderStrategies("chunking-strategies", chunkingStrategies);
|
||||
} catch (error) {
|
||||
console.error("Error fetching strategies:", error);
|
||||
}
|
||||
});
|
||||
|
||||
function renderStrategies(containerId, strategies) {
|
||||
const container = document.getElementById(containerId);
|
||||
container.innerHTML = ""; // Clear any existing content
|
||||
strategies = JSON.parse(strategies);
|
||||
Object.entries(strategies).forEach(([strategy, description]) => {
|
||||
const strategyElement = document.createElement("div");
|
||||
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
||||
|
||||
const strategyDescription = document.createElement("div");
|
||||
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
||||
strategyDescription.innerHTML = marked.parse(description);
|
||||
|
||||
strategyElement.appendChild(strategyDescription);
|
||||
|
||||
container.appendChild(strategyElement);
|
||||
});
|
||||
}
|
||||
document.querySelectorAll(".sidebar a").forEach((link) => {
|
||||
link.addEventListener("click", function (event) {
|
||||
event.preventDefault();
|
||||
document.querySelectorAll(".content-section").forEach((section) => {
|
||||
section.classList.remove("active");
|
||||
});
|
||||
const target = event.target.getAttribute("data-target");
|
||||
document.getElementById(target).classList.add("active");
|
||||
});
|
||||
});
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
971
pages/index copy.html
Normal file
@@ -0,0 +1,971 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Crawl4AI</title>
|
||||
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
|
||||
/>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
<style>
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
<style>
|
||||
/* Custom styling for docs-item class and Markdown generated elements */
|
||||
.docs-item {
|
||||
background-color: #2d3748; /* bg-gray-800 */
|
||||
padding: 1rem; /* p-4 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
|
||||
margin-bottom: 1rem; /* space between items */
|
||||
}
|
||||
|
||||
.docs-item h3,
|
||||
.docs-item h4 {
|
||||
color: #ffffff; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item p {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item code {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.25rem 0.5rem; /* px-2 py-1 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
}
|
||||
|
||||
.docs-item pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
overflow: auto; /* overflow-auto */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item div {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
font-size: 1rem; /* prose prose-sm */
|
||||
line-height: 1.25rem; /* line-height for readability */
|
||||
}
|
||||
|
||||
/* Adjustments to make prose class more suitable for dark mode */
|
||||
.prose {
|
||||
max-width: none; /* max-w-none */
|
||||
}
|
||||
|
||||
.prose p,
|
||||
.prose ul {
|
||||
margin-bottom: 1rem; /* mb-4 */
|
||||
}
|
||||
|
||||
.prose code {
|
||||
/* background-color: #4a5568; */ /* bg-gray-700 */
|
||||
color: #65a30d; /* text-white */
|
||||
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
display: inline-block; /* inline-block */
|
||||
}
|
||||
|
||||
.prose pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #ffffff; /* text-white */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
}
|
||||
|
||||
.prose h3 {
|
||||
color: #65a30d; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body class="bg-black text-gray-200">
|
||||
<header class="bg-zinc-950 text-white py-4 flex">
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
<span id="total-count" class="text-lime-400">2</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<section class="try-it py-8 px-16 pb-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||
<div class="grid grid-cols-1 lg:grid-cols-3 gap-4">
|
||||
<div class="space-y-4">
|
||||
<div class="flex flex-col">
|
||||
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
|
||||
<select
|
||||
id="threshold"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
|
||||
<input
|
||||
type="text"
|
||||
id="css-selector"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter CSS Selector"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Extraction Strategy</label
|
||||
>
|
||||
<select
|
||||
id="extraction-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="CosineStrategy">CosineStrategy</option>
|
||||
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
|
||||
<option value="NoExtractionStrategy">NoExtractionStrategy</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Chunking Strategy</label
|
||||
>
|
||||
<select
|
||||
id="chunking-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="RegexChunking">RegexChunking</option>
|
||||
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
|
||||
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
|
||||
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
|
||||
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
|
||||
>Provider Model</label
|
||||
>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
disabled
|
||||
>
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter Groq API token"
|
||||
disabled
|
||||
/>
|
||||
</div>
|
||||
<div class="flex gap-3">
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="bypass-cache-checkbox" />
|
||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" checked />
|
||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold"
|
||||
>Extract Blocks</label
|
||||
>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">
|
||||
Crawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="result" class=" ">
|
||||
<div id="loading" class="hidden">
|
||||
<p class="text-white">Loading... Please wait.</p>
|
||||
</div>
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="json"
|
||||
>
|
||||
JSON
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="cleaned-html"
|
||||
>
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="markdown"
|
||||
>
|
||||
Markdown
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="markdown-result" class="language-markdown"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="code_help" class=" ">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="curl"
|
||||
>
|
||||
cURL
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="library"
|
||||
>
|
||||
Python Library
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="python"
|
||||
>
|
||||
Python (Request)
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="nodejs"
|
||||
>
|
||||
Node.js
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex relative">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="library-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<div class="grid grid-cols-2 gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🌟 <strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">
|
||||
First Step: Create an instance of WebCrawler and call the <code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the
|
||||
response. By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🤖 <strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
📜 <strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🎯 <strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🖱️ <strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-2 rounded">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl
|
||||
the web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">Installation 💻</h1>
|
||||
<p class="mb-4">
|
||||
There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local
|
||||
server.
|
||||
</p>
|
||||
|
||||
<p class="mb-4">
|
||||
You can also try Crawl4AI in a Google Colab
|
||||
<a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</p>
|
||||
|
||||
<h2 class="text-2xl font-bold mb-2">Using Crawl4AI as a Library 📚</h2>
|
||||
<p class="mb-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-2">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
|
||||
</li>
|
||||
<li class="mb-2">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
</code></pre>
|
||||
</li>
|
||||
<li>
|
||||
Import the necessary modules in your Python script:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python hljs">from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
import os
|
||||
|
||||
crawler = WebCrawler()
|
||||
|
||||
# Single page crawl
|
||||
single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
|
||||
result = crawl4ai.fetch_page(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
|
||||
extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
|
||||
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
|
||||
bypass_cache=False,
|
||||
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
|
||||
css_selector = "", # Eg: "div.article-body"
|
||||
verbose=True,
|
||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
||||
)
|
||||
print(result.model_dump())
|
||||
</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
|
||||
</section>
|
||||
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">📖 Parameters</h1>
|
||||
<div class="overflow-x-auto">
|
||||
<table class="min-w-full bg-zinc-800 border border-zinc-700">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Parameter</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Description</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Required</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Default Value</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">urls</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
A list of URLs to crawl and extract data from.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">Yes</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">-</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">include_raw_html</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to include the raw HTML content in the response.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">bypass_cache</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to force a fresh crawl even if the URL has been previously crawled.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extract_blocks</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to extract semantical blocks of text from the HTML.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">true</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">word_count_threshold</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The minimum number of words a block must contain to be considered meaningful (minimum
|
||||
value is 5).
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">5</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extraction_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">CosineStrategy</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">chunking_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for chunking the text before processing (e.g., "RegexChunking").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">RegexChunking</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">css_selector</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The CSS selector to target specific parts of the HTML for extraction.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">None</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4">verbose</td>
|
||||
<td class="py-2 px-4">Whether to enable verbose logging.</td>
|
||||
<td class="py-2 px-4">No</td>
|
||||
<td class="py-2 px-4">true</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="extraction" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Extraction Strategies</h2>
|
||||
<div id="extraction-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="chunking" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Chunking Strategies</h2>
|
||||
<div id="chunking-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hero bg-zinc-900 py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||
benefit of all. 🤝💪
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="installation py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||
<p class="mb-4">
|
||||
To install and run Crawl4AI as a library or a local server, please refer to the 📚
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-zinc-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
// JavaScript to manage dynamic form changes and logic
|
||||
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
||||
const strategy = this.value;
|
||||
const providerModelSelect = document.getElementById("provider-model-select");
|
||||
const tokenInput = document.getElementById("token-input");
|
||||
|
||||
if (strategy === "LLMExtractionStrategy") {
|
||||
providerModelSelect.disabled = false;
|
||||
tokenInput.disabled = false;
|
||||
} else {
|
||||
providerModelSelect.disabled = true;
|
||||
tokenInput.disabled = true;
|
||||
}
|
||||
});
|
||||
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
bypass_cache: bypassCache,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||
css_selector: document.getElementById("css-selector").value,
|
||||
verbose: true,
|
||||
};
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
//document.getElementById("result").classList.add("hidden");
|
||||
//document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
// Update code examples dynamically
|
||||
const extractionStrategy = data.extraction_strategy;
|
||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
})}' http://crawl4ai.uccode.io/crawl`;
|
||||
|
||||
document.getElementById(
|
||||
"python-code"
|
||||
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
||||
urls[0]
|
||||
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
||||
isLLMExtraction
|
||||
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
||||
: extractionStrategy + "()"
|
||||
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
||||
data.bypass_cache
|
||||
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
document.getElementById("result").classList.remove("hidden");
|
||||
document.getElementById("code_help").classList.remove("hidden");
|
||||
|
||||
// increment the total count
|
||||
document.getElementById("total-count").textContent =
|
||||
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".code-tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
|
||||
async function copyToClipboard(text) {
|
||||
if (navigator.clipboard && navigator.clipboard.writeText) {
|
||||
return navigator.clipboard.writeText(text);
|
||||
} else {
|
||||
return fallbackCopyTextToClipboard(text);
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackCopyTextToClipboard(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const textArea = document.createElement("textarea");
|
||||
textArea.value = text;
|
||||
|
||||
// Avoid scrolling to bottom
|
||||
textArea.style.top = "0";
|
||||
textArea.style.left = "0";
|
||||
textArea.style.position = "fixed";
|
||||
|
||||
document.body.appendChild(textArea);
|
||||
textArea.focus();
|
||||
textArea.select();
|
||||
|
||||
try {
|
||||
const successful = document.execCommand("copy");
|
||||
if (successful) {
|
||||
resolve();
|
||||
} else {
|
||||
reject();
|
||||
}
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
|
||||
document.body.removeChild(textArea);
|
||||
});
|
||||
}
|
||||
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
//navigator.clipboard.writeText(code).then(() => {
|
||||
copyToClipboard(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
document.addEventListener("DOMContentLoaded", async () => {
|
||||
try {
|
||||
const extractionResponse = await fetch("/strategies/extraction");
|
||||
const extractionStrategies = await extractionResponse.json();
|
||||
|
||||
const chunkingResponse = await fetch("/strategies/chunking");
|
||||
const chunkingStrategies = await chunkingResponse.json();
|
||||
|
||||
renderStrategies("extraction-strategies", extractionStrategies);
|
||||
renderStrategies("chunking-strategies", chunkingStrategies);
|
||||
} catch (error) {
|
||||
console.error("Error fetching strategies:", error);
|
||||
}
|
||||
});
|
||||
|
||||
function renderStrategies(containerId, strategies) {
|
||||
const container = document.getElementById(containerId);
|
||||
container.innerHTML = ""; // Clear any existing content
|
||||
strategies = JSON.parse(strategies);
|
||||
Object.entries(strategies).forEach(([strategy, description]) => {
|
||||
const strategyElement = document.createElement("div");
|
||||
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
||||
|
||||
const strategyDescription = document.createElement("div");
|
||||
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
||||
strategyDescription.innerHTML = marked.parse(description);
|
||||
|
||||
strategyElement.appendChild(strategyDescription);
|
||||
|
||||
container.appendChild(strategyElement);
|
||||
});
|
||||
}
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
410
pages/index.html
@@ -9,387 +9,65 @@
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link rel="stylesheet" href="/pages/app.css" />
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
|
||||
/>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
<style>
|
||||
:root {
|
||||
--ifm-font-size-base: 100%;
|
||||
--ifm-line-height-base: 1.65;
|
||||
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||
}
|
||||
html {
|
||||
-webkit-font-smoothing: antialiased;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
text-size-adjust: 100%;
|
||||
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||
}
|
||||
body {
|
||||
background-color: #1a202c;
|
||||
color: #fff;
|
||||
}
|
||||
.tab-content {
|
||||
max-height: 400px;
|
||||
overflow: auto;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
font-size: 14px;
|
||||
}
|
||||
pre code {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper</h1>
|
||||
<body class="bg-black text-gray-200">
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
<span id="total-count" class="text-lime-400">2</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
{{ try_it | safe }}
|
||||
|
||||
<!-- Add a section to show total-count websited already crawled -->
|
||||
<section class="bg-gray-600 py-8">
|
||||
<div class="container mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Procceced</span>
|
||||
<span id="total-count" class="text-blue-400">0</span>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="try-it py-8 pb-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||
<div class="mb-4 flex w-full gap-2">
|
||||
<div class="flex items-center gap-2 flex-col flex-grow">
|
||||
<label for="url-input" class="text-white">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<!-- Add a number set if 5 with a label word threshold -->
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="threshold" class="text-white">Min Words Threshold</label>
|
||||
<select id="threshold" class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full">
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
<div class="mx-auto p-4 bg-zinc-950 text-lime-500 min-h-screen">
|
||||
<div class="container mx-auto">
|
||||
<div class="flex h-full px-20">
|
||||
<div class="sidebar w-1/4 p-4">
|
||||
<h2 class="text-lg font-bold mb-4">Outline</h2>
|
||||
<ul>
|
||||
<li class="mb-2"><a href="#" data-target="installation">Installation</a></li>
|
||||
<li class="mb-2"><a href="#" data-target="how-to-guide">How to Guide</a></li>
|
||||
<li class="mb-2"><a href="#" data-target="chunking-strategies">Chunking Strategies</a></li>
|
||||
<li class="mb-2">
|
||||
<a href="#" data-target="extraction-strategies">Extraction Strategies</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="provider-model-select" class="text-white">Provider Model</label>
|
||||
<!-- Main Content -->
|
||||
<div class="w-3/4 p-4">
|
||||
{{installation | safe}} {{how_to_guide | safe}}
|
||||
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full"
|
||||
>
|
||||
<!-- Add your option values here -->
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="token-input" class="text-white">API Token</label>
|
||||
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||
placeholder="Enter Groq API token"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex items-center justify-center gap-2 flex-col">
|
||||
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked />
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||
</div>
|
||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||
<div id="loading" class="hidden mt-4">
|
||||
<p>
|
||||
Depends on the selected model, it may take up to 1 or 2 minutes to process the request.
|
||||
Loading...
|
||||
</p>
|
||||
</div>
|
||||
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||
Markdown
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div id="code_help" class="tab-container flex-1 h-full">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||
Python
|
||||
</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||
Node.js
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<pre class="h-full flex relative">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
<section id="chunking-strategies" class="content-section">
|
||||
<h1 class="text-2xl font-bold">Chunking Strategies</h1>
|
||||
<p>Content for chunking strategies...</p>
|
||||
</section>
|
||||
<section id="extraction-strategies" class="content-section">
|
||||
<h1 class="text-2xl font-bold">Extraction Strategies</h1>
|
||||
<p>Content for extraction strategies...</p>
|
||||
</section>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hero bg-gray-900 py-8">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||
benefit of all. 🤝💪
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="installation py-8">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||
<p class="mb-4">
|
||||
To install and run Crawl4AI as a library or a local server, please refer to the 📚
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
<!-- <a
|
||||
href="https://discord.gg/your-invite-link"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>💬 Discord</a
|
||||
> -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
|
||||
if (storedProviderModel) {
|
||||
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||
}
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
}
|
||||
|
||||
// Handle provider model dropdown change
|
||||
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||
|
||||
if (storedToken) {
|
||||
document.getElementById("token-input").value = storedToken;
|
||||
} else {
|
||||
document.getElementById("token-input").value = "";
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch total count from the database
|
||||
axios
|
||||
.get("/total-count")
|
||||
.then((response) => {
|
||||
document.getElementById("total-count").textContent = response.data.count;
|
||||
})
|
||||
.catch((error) => console.error(error));
|
||||
|
||||
// Handle crawl button click
|
||||
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
// validate input to have both URL and API token
|
||||
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||
alert("Please enter both URL(s) and API token.");
|
||||
return;
|
||||
}
|
||||
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
localStorage.setItem(selectedProviderModel, apiToken);
|
||||
|
||||
const urlsInput = document.getElementById("url-input").value;
|
||||
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||
const data = {
|
||||
urls: urls,
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
forced: false,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
};
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").classList.add("hidden");
|
||||
document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.parsed_json);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
// Update code examples dynamically
|
||||
// Update code examples dynamically
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: "your_api_token",
|
||||
})}' http://localhost:8000/crawl`;
|
||||
|
||||
document.getElementById(
|
||||
"python-code"
|
||||
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
// Select JSON tab by default
|
||||
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
document.getElementById("result").classList.remove("hidden");
|
||||
document.getElementById("code_help").classList.remove("hidden");
|
||||
|
||||
// increment the total count
|
||||
document.getElementById("total-count").textContent =
|
||||
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(error);
|
||||
document.getElementById("loading").classList.add("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle tab clicks
|
||||
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle code tab clicks
|
||||
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".code-tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
</script>
|
||||
|
||||
{{ footer | safe }}
|
||||
<script script src="/pages/app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -283,7 +283,7 @@
|
||||
.post("/crawl", data)
|
||||
.then((response) => {
|
||||
const result = response.data.results[0];
|
||||
const parsedJson = JSON.parse(result.parsed_json);
|
||||
const parsedJson = JSON.parse(result.extracted_content);
|
||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
36
pages/partial/footer.html
Normal file
@@ -0,0 +1,36 @@
|
||||
<section class="hero bg-zinc-900 py-8 px-20 text-zinc-400">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||
benefit of all. 🤝💪
|
||||
</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-zinc-900 text-zinc-400 py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
<div class="social-links">
|
||||
<a
|
||||
href="https://github.com/unclecode/crawl4ai"
|
||||
class="text-zinc-400 hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>😺 GitHub</a
|
||||
>
|
||||
<a
|
||||
href="https://twitter.com/unclecode"
|
||||
class="text-zinc-400 hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
174
pages/partial/how_to_guide.html
Normal file
@@ -0,0 +1,174 @@
|
||||
<section id="how-to-guide" class="content-section">
|
||||
<h1 class="text-2xl font-bold">How to Guide</h1>
|
||||
<div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🌟
|
||||
<strong
|
||||
>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling
|
||||
fun!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">
|
||||
First Step: Create an instance of WebCrawler and call the
|
||||
<code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
</div>
|
||||
<div class="">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
|
||||
<div class="bg-red-900 p-2 text-zinc-50">
|
||||
⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set <code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
|
||||
</div>
|
||||
</div>
|
||||
<div class="">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content
|
||||
in the response. By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||
</div>
|
||||
<!-- Step 3.5 Screenshot -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📸
|
||||
<strong>Let's take a screenshot of the page!</strong>
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
screenshot=True
|
||||
)
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))</code></pre>
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🤖
|
||||
<strong
|
||||
>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📜
|
||||
<strong
|
||||
>Let's make it even more interesting: LLMExtractionStrategy with
|
||||
instructions!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎯
|
||||
<strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🖱️
|
||||
<strong
|
||||
>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python">js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
|
||||
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth
|
||||
and crawl the web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
65
pages/partial/installation.html
Normal file
@@ -0,0 +1,65 @@
|
||||
<section id="installation" class="content-section active">
|
||||
<h1 class="text-2xl font-bold">Installation 💻</h1>
|
||||
<p class="mb-4">
|
||||
There are three ways to use Crawl4AI:
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="">
|
||||
As a library
|
||||
</li>
|
||||
<li class="">
|
||||
As a local server (Docker)
|
||||
</li>
|
||||
<li class="">
|
||||
As a Google Colab notebook. <a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</li>
|
||||
</p>
|
||||
|
||||
|
||||
<p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-4">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>virtualenv venv
|
||||
source venv/bin/activate
|
||||
pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code>crawl4ai-download-models</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .[all]
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="">
|
||||
Use docker to run the local server:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class = "language-python bash">docker build -t crawl4ai .
|
||||
# docker build --platform linux/amd64 -t crawl4ai . For Mac users
|
||||
docker run -d -p 8000:80 crawl4ai</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</section>
|
||||
217
pages/partial/try_it.html
Normal file
@@ -0,0 +1,217 @@
|
||||
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
|
||||
<div class="container mx-auto ">
|
||||
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
||||
<div class="flex gap-4">
|
||||
<div class="flex flex-col flex-1 gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
|
||||
<select
|
||||
id="threshold"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="1">1</option>
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col flex-1">
|
||||
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
|
||||
<input
|
||||
type="text"
|
||||
id="css-selector"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-lime-700"
|
||||
placeholder="CSS Selector (e.g. .content, #main, article)"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<div class="flex flex-col">
|
||||
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Extraction Strategy</label
|
||||
>
|
||||
<select
|
||||
id="extraction-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="NoExtractionStrategy" selected>NoExtractionStrategy</option>
|
||||
<option value="CosineStrategy">CosineStrategy</option>
|
||||
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Chunking Strategy</label
|
||||
>
|
||||
<select
|
||||
id="chunking-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="RegexChunking">RegexChunking</option>
|
||||
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
|
||||
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
|
||||
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
|
||||
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
<div id = "llm_settings" class="flex gap-2 hidden hidden">
|
||||
<div class="flex flex-col">
|
||||
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
|
||||
>Provider Model</label
|
||||
>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||
>
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="groq/mixtral-8x7b-32768">groq/mixtral-8x7b-32768</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="openai/gpt-4o">gpt-4o</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col flex-1">
|
||||
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
|
||||
placeholder="Enter Groq API token"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-2">
|
||||
<!-- Add two textarea one for getting Keyword Filter and another one Instruction, make both grow whole with-->
|
||||
<div id = "semantic_filter_div" class="flex flex-col flex-1 hidden">
|
||||
<label for="keyword-filter" class="text-lime-500 font-bold text-xs">Keyword Filter</label>
|
||||
<textarea
|
||||
id="semantic_filter"
|
||||
rows="3"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
|
||||
placeholder="Enter keywords for CosineStrategy to narrow down the content."
|
||||
></textarea>
|
||||
</div>
|
||||
<div id = "instruction_div" class="flex flex-col flex-1 hidden">
|
||||
<label for="instruction" class="text-lime-500 font-bold text-xs">Instruction</label>
|
||||
<textarea
|
||||
id="instruction"
|
||||
rows="3"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
|
||||
placeholder="Enter instruction for the LLMEstrategy to instruct the model."
|
||||
></textarea>
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-3">
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="bypass-cache-checkbox" />
|
||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="screenshot-checkbox" checked />
|
||||
<label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2 hidden">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" />
|
||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="loading" class="hidden">
|
||||
<p class="text-white">Loading... Please wait.</p>
|
||||
</div>
|
||||
<div id="result" class="flex-1 overflow-x-auto">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
||||
JSON
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="cleaned-html"
|
||||
>
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
||||
Markdown
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
|
||||
Medias
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
|
||||
Screenshot
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
|
||||
<pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="code_help" class="flex-1 overflow-x-auto">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
||||
cURL
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="library"
|
||||
>
|
||||
Python
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="python"
|
||||
>
|
||||
REST API
|
||||
</button>
|
||||
<!-- <button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="nodejs"
|
||||
>
|
||||
Node.js
|
||||
</button> -->
|
||||
</div>
|
||||
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex relative overflow-x-auto">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative overflow-x-auto">
|
||||
<code id="library-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
434
pages/tmp.html
Normal file
@@ -0,0 +1,434 @@
|
||||
<div class="w-3/4 p-4">
|
||||
<section id="installation" class="content-section active">
|
||||
<h1 class="text-2xl font-bold">Installation 💻</h1>
|
||||
<p class="mb-4">There are three ways to use Crawl4AI:</p>
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="">As a library</li>
|
||||
<li class="">As a local server (Docker)</li>
|
||||
<li class="">
|
||||
As a Google Colab notebook.
|
||||
<a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
|
||||
><img
|
||||
src="https://colab.research.google.com/assets/colab-badge.svg"
|
||||
alt="Open In Colab"
|
||||
style="display: inline-block; width: 100px; height: 20px"
|
||||
/></a>
|
||||
</li>
|
||||
<p></p>
|
||||
|
||||
<p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-4">
|
||||
Install the package from GitHub:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="hljs language-bash">pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
|
||||
</li>
|
||||
<li class="mb-4">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="language-python bash hljs">virtualenv venv
|
||||
source venv/<span class="hljs-built_in">bin</span>/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
</code></pre>
|
||||
</li>
|
||||
<li class="">
|
||||
Use docker to run the local server:
|
||||
<pre
|
||||
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
|
||||
><code class="language-python bash hljs">docker build -t crawl4ai .
|
||||
<span class="hljs-comment"># docker build --platform linux/amd64 -t crawl4ai . For Mac users</span>
|
||||
docker run -d -p <span class="hljs-number">8000</span>:<span class="hljs-number">80</span> crawl4ai</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the
|
||||
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||
</p>
|
||||
</ol>
|
||||
</section>
|
||||
<section id="how-to-guide" class="content-section">
|
||||
<h1 class="text-2xl font-bold">How to Guide</h1>
|
||||
<div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
|
||||
<!-- Step 1 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🌟
|
||||
<strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
|
||||
</div>
|
||||
<div class="">
|
||||
First Step: Create an instance of WebCrawler and call the
|
||||
<code>warmup()</code> function.
|
||||
</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">crawler = WebCrawler()
|
||||
crawler.warmup()</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 2 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
|
||||
</div>
|
||||
<div class="">First crawl (caches the result):</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
|
||||
</div>
|
||||
<div class="">Second crawl (Force to crawl again):</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, bypass_cache=<span class="hljs-literal">True</span>)</code></pre>
|
||||
<div class="bg-red-900 p-2 text-zinc-50">
|
||||
⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies
|
||||
for the same URL. Otherwise, the cached result will be returned. You can also set
|
||||
<code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
|
||||
</div>
|
||||
</div>
|
||||
<div class="">Crawl result without raw HTML content:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, include_raw_html=<span class="hljs-literal">False</span>)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 3 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📄
|
||||
<strong
|
||||
>The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response.
|
||||
By default, it is set to True.</strong
|
||||
>
|
||||
</div>
|
||||
<div class="">Set <code>always_by_pass_cache</code> to True:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">crawler.always_by_pass_cache = <span class="hljs-literal">True</span></code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 4 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
|
||||
</div>
|
||||
<div class="">Using RegexChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
chunking_strategy=RegexChunking(patterns=[<span class="hljs-string">"\n\n"</span>])
|
||||
)</code></pre>
|
||||
</div>
|
||||
<div class="">Using NlpSentenceChunking:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 5 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
|
||||
</div>
|
||||
<div class="">Using CosineStrategy:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=<span class="hljs-number">20</span>, max_dist=<span class="hljs-number">0.2</span>, linkage_method=<span class="hljs-string">"ward"</span>, top_k=<span class="hljs-number">3</span>)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 6 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🤖
|
||||
<strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy without instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=LLMExtractionStrategy(provider=<span class="hljs-string">"openai/gpt-4o"</span>, api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>))
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 7 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
📜
|
||||
<strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
|
||||
</div>
|
||||
<div class="">Using LLMExtractionStrategy with instructions:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider=<span class="hljs-string">"openai/gpt-4o"</span>,
|
||||
api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>),
|
||||
instruction=<span class="hljs-string">"I am interested in only financial news"</span>
|
||||
)
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 8 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎯
|
||||
<strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
|
||||
</div>
|
||||
<div class="">Using CSS selector to extract H2 tags:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">result = crawler.run(
|
||||
url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
|
||||
css_selector=<span class="hljs-string">"h2"</span>
|
||||
)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Step 9 -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🖱️
|
||||
<strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
|
||||
</div>
|
||||
<div class="">Using JavaScript to click 'Load More' button:</div>
|
||||
<div>
|
||||
<pre><code class="language-python hljs">js_code = <span class="hljs-string">"""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""</span>
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=<span class="hljs-literal">True</span>)
|
||||
result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Conclusion -->
|
||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||
🎉
|
||||
<strong
|
||||
>Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the
|
||||
web like a pro! 🕸️</strong
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="chunking-strategies" class="content-section">
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>RegexChunking</h3>
|
||||
<p>
|
||||
<code>RegexChunking</code> is a text chunking strategy that splits a given text into smaller parts
|
||||
using regular expressions. This is useful for preparing large texts for processing by language
|
||||
models, ensuring they are divided into manageable segments.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>patterns</code> (list, optional): A list of regular expression patterns used to split the
|
||||
text. Default is to split by double newlines (<code>['\n\n']</code>).
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>NlpSentenceChunking</h3>
|
||||
<p>
|
||||
<code>NlpSentenceChunking</code> uses a natural language processing model to chunk a given text into
|
||||
sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
None.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = NlpSentenceChunking()
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>TopicSegmentationChunking</h3>
|
||||
<p>
|
||||
<code>TopicSegmentationChunking</code> uses the TextTiling algorithm to segment a given text into
|
||||
topic-based chunks. This method identifies thematic boundaries in the text.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>num_keywords</code> (int, optional): The number of keywords to extract for each topic
|
||||
segment. Default is <code>3</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = TopicSegmentationChunking(num_keywords=3)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>FixedLengthWordChunking</h3>
|
||||
<p>
|
||||
<code>FixedLengthWordChunking</code> splits a given text into chunks of fixed length, based on the
|
||||
number of words.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>chunk_size</code> (int, optional): The number of words in each chunk. Default is
|
||||
<code>100</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = FixedLengthWordChunking(chunk_size=100)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>SlidingWindowChunking</h3>
|
||||
<p>
|
||||
<code>SlidingWindowChunking</code> uses a sliding window approach to chunk a given text. Each chunk
|
||||
has a fixed length, and the window slides by a specified step size.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>window_size</code> (int, optional): The number of words in each chunk. Default is
|
||||
<code>100</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>step</code> (int, optional): The number of words to slide the window. Default is
|
||||
<code>50</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">chunker = SlidingWindowChunking(window_size=100, step=50)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="extraction-strategies" class="content-section">
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>NoExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>NoExtractionStrategy</code> is a basic extraction strategy that returns the entire HTML
|
||||
content without any modification. It is useful for cases where no specific extraction is required.
|
||||
Only clean html, and amrkdown.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<p>None.</p>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = NoExtractionStrategy()
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>LLMExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>LLMExtractionStrategy</code> uses a Language Model (LLM) to extract meaningful blocks or
|
||||
chunks from the given HTML content. This strategy leverages an external provider for language model
|
||||
completions.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>provider</code> (str, optional): The provider to use for the language model completions.
|
||||
Default is <code>DEFAULT_PROVIDER</code> (e.g., openai/gpt-4).
|
||||
</li>
|
||||
<li>
|
||||
<code>api_token</code> (str, optional): The API token for the provider. If not provided, it will
|
||||
try to load from the environment variable <code>OPENAI_API_KEY</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>instruction</code> (str, optional): An instruction to guide the LLM on how to perform the
|
||||
extraction. This allows users to specify the type of data they are interested in or set the tone
|
||||
of the response. Default is <code>None</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
<p>
|
||||
By providing clear instructions, users can tailor the extraction process to their specific needs,
|
||||
enhancing the relevance and utility of the extracted content.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>CosineStrategy</h3>
|
||||
<p>
|
||||
<code>CosineStrategy</code> uses hierarchical clustering based on cosine similarity to extract
|
||||
clusters of text from the given HTML content. This strategy is suitable for identifying related
|
||||
content sections.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>semantic_filter</code> (str, optional): A string containing keywords for filtering relevant
|
||||
documents before clustering. If provided, documents are filtered based on their cosine
|
||||
similarity to the keyword filter embedding. Default is <code>None</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>word_count_threshold</code> (int, optional): Minimum number of words per cluster. Default
|
||||
is <code>20</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>max_dist</code> (float, optional): The maximum cophenetic distance on the dendrogram to
|
||||
form clusters. Default is <code>0.2</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>linkage_method</code> (str, optional): The linkage method for hierarchical clustering.
|
||||
Default is <code>'ward'</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>top_k</code> (int, optional): Number of top categories to extract. Default is
|
||||
<code>3</code>.
|
||||
</li>
|
||||
<li>
|
||||
<code>model_name</code> (str, optional): The model name for embedding generation. Default is
|
||||
<code>'BAAI/bge-small-en-v1.5'</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
<h4>Cosine Similarity Filtering</h4>
|
||||
<p>
|
||||
When a <code>semantic_filter</code> is provided, the <code>CosineStrategy</code> applies an
|
||||
embedding-based filtering process to select relevant documents before performing hierarchical
|
||||
clustering.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
|
||||
<div class="text-gray-300 prose prose-sm">
|
||||
<h3>TopicExtractionStrategy</h3>
|
||||
<p>
|
||||
<code>TopicExtractionStrategy</code> uses the TextTiling algorithm to segment the HTML content into
|
||||
topics and extracts keywords for each segment. This strategy is useful for identifying and
|
||||
summarizing thematic content.
|
||||
</p>
|
||||
<h4>Constructor Parameters:</h4>
|
||||
<ul>
|
||||
<li>
|
||||
<code>num_keywords</code> (int, optional): Number of keywords to represent each topic segment.
|
||||
Default is <code>3</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<h4>Example usage:</h4>
|
||||
<pre><code class="language-python">extractor = TopicExtractionStrategy(num_keywords=3)
|
||||
extracted_content = extractor.extract(url, html)
|
||||
</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
13
requirements.crawl.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
aiohttp
|
||||
aiosqlite
|
||||
bs4
|
||||
fastapi
|
||||
html2text
|
||||
httpx
|
||||
pydantic
|
||||
python-dotenv
|
||||
requests
|
||||
rich
|
||||
selenium
|
||||
uvicorn
|
||||
chromedriver-autoinstaller
|
||||
@@ -1,13 +1,21 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
selenium
|
||||
pydantic
|
||||
aiohttp
|
||||
aiosqlite
|
||||
chromedriver_autoinstaller
|
||||
httpx
|
||||
requests
|
||||
bs4
|
||||
fastapi
|
||||
html2text
|
||||
httpx
|
||||
litellm
|
||||
python-dotenv
|
||||
nltk
|
||||
pydantic
|
||||
python-dotenv
|
||||
requests
|
||||
rich
|
||||
scikit-learn
|
||||
selenium
|
||||
uvicorn
|
||||
transformers
|
||||
chromedriver-autoinstaller
|
||||
torch
|
||||
onnxruntime
|
||||
tokenizers
|
||||
pillow
|
||||
41
setup.py
@@ -1,12 +1,32 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os
|
||||
import subprocess
|
||||
from setuptools.command.install import install
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.txt") as f:
|
||||
requirements = f.read().splitlines()
|
||||
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.crawl.txt") as f:
|
||||
requirements_crawl_only = f.read().splitlines()
|
||||
|
||||
# Define the requirements for different environments
|
||||
requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
|
||||
requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
|
||||
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
|
||||
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
||||
requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
||||
|
||||
class CustomInstallCommand(install):
|
||||
"""Customized setuptools install command to install spacy without dependencies."""
|
||||
def run(self):
|
||||
install.run(self)
|
||||
subprocess.check_call([os.sys.executable, '-m', 'pip', 'install', 'spacy', '--no-deps'])
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.1.0",
|
||||
version="0.2.5",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
@@ -15,7 +35,20 @@ setup(
|
||||
author_email="unclecode@kidocode.com",
|
||||
license="MIT",
|
||||
packages=find_packages(),
|
||||
install_requires=requirements,
|
||||
install_requires=requirements_without_torch_transformers_nlkt,
|
||||
extras_require={
|
||||
"all": requirements, # Include all requirements
|
||||
"colab": requirements_without_torch, # Exclude torch for Colab
|
||||
"crawl": requirements_crawl_only, # Include only crawl requirements
|
||||
},
|
||||
cmdclass={
|
||||
'install': CustomInstallCommand,
|
||||
},
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'crawl4ai-download-models=crawl4ai.model_loader:main',
|
||||
],
|
||||
},
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
@@ -27,4 +60,4 @@ setup(
|
||||
"Programming Language :: Python :: 3.10",
|
||||
],
|
||||
python_requires=">=3.7",
|
||||
)
|
||||
)
|
||||
|
||||
0
tests/__init__.py
Normal file
111
tests/test_web_crawler.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import unittest, os
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
|
||||
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
|
||||
|
||||
class TestWebCrawler(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.crawler = WebCrawler()
|
||||
|
||||
def test_warmup(self):
|
||||
self.crawler.warmup()
|
||||
self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")
|
||||
|
||||
def test_run_default_strategies(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract using default strategies")
|
||||
|
||||
def test_run_different_strategies(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# Test with FixedLengthWordChunking and LLMExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")
|
||||
|
||||
# Test with SlidingWindowChunking and TopicExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")
|
||||
|
||||
def test_invalid_url(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='invalid_url', bypass_cache=True)
|
||||
self.assertIn("Invalid URL", str(context.exception))
|
||||
|
||||
def test_unsupported_extraction_strategy(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
|
||||
self.assertIn("Unsupported extraction strategy", str(context.exception))
|
||||
|
||||
def test_invalid_css_selector(self):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
|
||||
self.assertIn("Invalid CSS selector", str(context.exception))
|
||||
|
||||
|
||||
def test_crawl_with_cache_and_bypass_cache(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# First crawl with cache enabled
|
||||
result = self.crawler.run(url=url, bypass_cache=False)
|
||||
self.assertTrue(result.success, "Failed to crawl and cache the result")
|
||||
|
||||
# Second crawl with bypass_cache=True
|
||||
result = self.crawler.run(url=url, bypass_cache=True)
|
||||
self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")
|
||||
|
||||
def test_fetch_multiple_pages(self):
|
||||
urls = [
|
||||
'https://www.nbcnews.com/business',
|
||||
'https://www.bbc.com/news'
|
||||
]
|
||||
results = []
|
||||
for url in urls:
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(),
|
||||
bypass_cache=True
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
|
||||
for result in results:
|
||||
self.assertTrue(result.success, "Failed to crawl and extract a page in the list")
|
||||
|
||||
def test_run_fixed_length_word_chunking_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")
|
||||
|
||||
def test_run_sliding_window_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||