Compare commits
43 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77da48050d | ||
|
|
9a97aacd85 | ||
|
|
52daf3936a | ||
|
|
42a5da854d | ||
|
|
d1d83a6ef7 | ||
|
|
194050705d | ||
|
|
989f8c91c8 | ||
|
|
edba5fb5e9 | ||
|
|
faa1defa5c | ||
|
|
f7e0cee1b0 | ||
|
|
b3a0edaa6d | ||
|
|
9c34b30723 | ||
|
|
36a5847df5 | ||
|
|
a19379aa58 | ||
|
|
768d048e1c | ||
|
|
94c11a0262 | ||
|
|
649b0bfd02 | ||
|
|
57a00ec677 | ||
|
|
aeb2114170 | ||
|
|
b8d405fddd | ||
|
|
b32013cb97 | ||
|
|
226a62a3c0 | ||
|
|
8e73a482a2 | ||
|
|
0533aeb814 | ||
|
|
aead6de888 | ||
|
|
8d82fd4cfe | ||
|
|
8f44db6499 | ||
|
|
c7553b1280 | ||
|
|
8b8683f22e | ||
|
|
774ace6e3b | ||
|
|
4a8f91a0fc | ||
|
|
18c9784b61 | ||
|
|
e5d401c67c | ||
|
|
ae77589a98 | ||
|
|
ad373c0e19 | ||
|
|
51f26d12fe | ||
|
|
f1b60b2016 | ||
|
|
8c2dc2b1e4 | ||
|
|
dc9a44c12a | ||
|
|
d9753b6349 | ||
|
|
a554c0b143 | ||
|
|
7381fa95e6 | ||
|
|
5cee084340 |
BIN
.files/screenshot.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
7
.gitignore
vendored
@@ -173,4 +173,9 @@ Crawl4AI.egg-info/
|
|||||||
requirements0.txt
|
requirements0.txt
|
||||||
a.txt
|
a.txt
|
||||||
|
|
||||||
*.sh
|
*.sh
|
||||||
|
.idea
|
||||||
|
docs/examples/.chainlit/
|
||||||
|
docs/examples/.chainlit/*
|
||||||
|
.chainlit/config.toml
|
||||||
|
.chainlit/translations/en-US.json
|
||||||
|
|||||||
@@ -1 +1,5 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [0.2.4] - 2024-06-17
|
||||||
|
### Fixed
|
||||||
|
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||||
86
Dockerfile
@@ -1,43 +1,77 @@
|
|||||||
# Use an official Python runtime as a parent image
|
|
||||||
FROM python:3.10-slim
|
# First stage: Build and install dependencies
|
||||||
|
FROM python:3.10-slim-bookworm as builder
|
||||||
|
|
||||||
# Set the working directory in the container
|
# Set the working directory in the container
|
||||||
WORKDIR /usr/src/app
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
# Copy the current directory contents into the container at /usr/src/app
|
# Install build dependencies
|
||||||
COPY . .
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
# Install dependencies for Chrome and ChromeDriver
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
wget \
|
wget \
|
||||||
xvfb \
|
|
||||||
unzip \
|
|
||||||
curl \
|
curl \
|
||||||
|
unzip
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||||
|
pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
|
||||||
|
python -m spacy download en_core_web_sm
|
||||||
|
|
||||||
|
# Download and install ChromeDriver
|
||||||
|
RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
|
||||||
|
wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
|
||||||
|
unzip /tmp/chromedriver_linux64.zip -d /tmp && \
|
||||||
|
mv /tmp/chromedriver /usr/local/bin/chromedriver && \
|
||||||
|
chmod +x /usr/local/bin/chromedriver && \
|
||||||
|
rm /tmp/chromedriver_linux64.zip
|
||||||
|
|
||||||
|
# Second stage: Create final runtime image
|
||||||
|
FROM python:3.10-slim-bookworm
|
||||||
|
|
||||||
|
# Set the working directory in the container
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
# Install runtime dependencies
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
wget \
|
||||||
|
git \
|
||||||
|
xvfb \
|
||||||
gnupg2 \
|
gnupg2 \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
apt-transport-https \
|
apt-transport-https \
|
||||||
software-properties-common \
|
software-properties-common && \
|
||||||
&& mkdir -p /etc/apt/keyrings \
|
wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||||
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
|
echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
|
||||||
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
|
apt-get update && \
|
||||||
&& apt-get update \
|
apt-get install -y --no-install-recommends google-chrome-stable && \
|
||||||
&& apt-get install -y google-chrome-stable \
|
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& apt-get install -y chromium-chromedriver
|
|
||||||
|
|
||||||
# Install Python dependencies
|
# Copy Chromedriver from the builder stage
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
|
||||||
RUN pip install spacy torch torchvision torchaudio
|
|
||||||
|
|
||||||
# Set display port and dbus env to avoid hanging
|
# Copy installed Python packages from builder stage
|
||||||
ENV DISPLAY=:99
|
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
|
||||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
COPY --from=builder /usr/local/bin /usr/local/bin
|
||||||
|
|
||||||
|
# Copy the rest of the application code
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set environment to use Chrome and ChromeDriver properly
|
||||||
|
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||||
|
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||||
|
DISPLAY=:99 \
|
||||||
|
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||||
|
PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
# Ensure the PATH environment variable includes the location of the installed packages
|
||||||
|
ENV PATH /usr/local/bin:$PATH
|
||||||
|
|
||||||
# Make port 80 available to the world outside this container
|
# Make port 80 available to the world outside this container
|
||||||
EXPOSE 80
|
EXPOSE 80
|
||||||
|
|
||||||
# Define environment variable
|
|
||||||
ENV PYTHONUNBUFFERED 1
|
|
||||||
|
|
||||||
# Run uvicorn
|
# Run uvicorn
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
45
Dockerfile-version-0
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Use an official Python runtime as a parent image
|
||||||
|
FROM python:3.10-slim
|
||||||
|
# In case you had some weird issues, try this Image
|
||||||
|
# FROM python:3.10-slim-bookworm as builder
|
||||||
|
|
||||||
|
# Set the working directory in the container
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
# Copy the current directory contents into the container at /usr/src/app
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Install dependencies for Chrome and ChromeDriver
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
wget \
|
||||||
|
xvfb \
|
||||||
|
unzip \
|
||||||
|
curl \
|
||||||
|
gnupg2 \
|
||||||
|
ca-certificates \
|
||||||
|
apt-transport-https \
|
||||||
|
software-properties-common \
|
||||||
|
&& mkdir -p /etc/apt/keyrings \
|
||||||
|
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
|
||||||
|
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y google-chrome-stable \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& apt-get install -y chromium-chromedriver
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
RUN pip install spacy torch torchvision torchaudio
|
||||||
|
|
||||||
|
# Set display port and dbus env to avoid hanging
|
||||||
|
ENV DISPLAY=:99
|
||||||
|
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||||
|
|
||||||
|
# Make port 80 available to the world outside this container
|
||||||
|
EXPOSE 80
|
||||||
|
|
||||||
|
# Define environment variable
|
||||||
|
ENV PYTHONUNBUFFERED 1
|
||||||
|
|
||||||
|
# Run uvicorn
|
||||||
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
99
README.md
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.0 🕷️🤖
|
# Crawl4AI v0.2.3 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
@@ -8,10 +8,27 @@
|
|||||||
|
|
||||||
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||||
|
|
||||||
[](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
- Use as REST API: Check [](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
|
||||||
|
- Use as Python library: [](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||||
|
|
||||||
## Recent Changes v0.2.0
|
## Recent Changes
|
||||||
|
|
||||||
|
### v0.2.4
|
||||||
|
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||||
|
|
||||||
|
### v0.2.3
|
||||||
|
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
|
||||||
|
- 🔗 Extract all external and internal links. Check `result.links`
|
||||||
|
- 📚 Extract metadata from the page. Check `result.metadata`
|
||||||
|
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
|
||||||
|
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
|
||||||
|
|
||||||
|
### v0.2.2
|
||||||
|
- Support multiple JS scripts
|
||||||
|
- Fixed some bugs
|
||||||
|
- Resolved a few issues related to Colab installation
|
||||||
|
|
||||||
|
### v0.2.0
|
||||||
- 🚀 10x faster!!
|
- 🚀 10x faster!!
|
||||||
- 📜 Execute custom JavaScript before crawling!
|
- 📜 Execute custom JavaScript before crawling!
|
||||||
- 🤝 Colab friendly!
|
- 🤝 Colab friendly!
|
||||||
@@ -22,7 +39,27 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
|||||||
|
|
||||||
## Power and Simplicity of Crawl4AI 🚀
|
## Power and Simplicity of Crawl4AI 🚀
|
||||||
|
|
||||||
To show the simplicity take a look at the first example:
|
The easiest way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand. You can find all examples of the REST API in this Colab notebook. [](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"urls": [
|
||||||
|
"https://www.nbcnews.com/business"
|
||||||
|
],
|
||||||
|
"screenshot": True
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
||||||
|
response_data = response.json()
|
||||||
|
print(response_data['results'][0].keys())
|
||||||
|
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
|
||||||
|
# 'links', 'screenshot', 'markdown', 'extracted_content',
|
||||||
|
# 'metadata', 'error_message'])
|
||||||
|
```
|
||||||
|
|
||||||
|
If you want more control, take a look at the first example of using the Python library.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import WebCrawler
|
from crawl4ai import WebCrawler
|
||||||
@@ -30,11 +67,9 @@ from crawl4ai import WebCrawler
|
|||||||
# Create the WebCrawler instance
|
# Create the WebCrawler instance
|
||||||
crawler = WebCrawler()
|
crawler = WebCrawler()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||||
print(result) # {url, html, markdown, extracted_content, metadata}
|
print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots}
|
||||||
```
|
```
|
||||||
|
|
||||||
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
|
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
|
||||||
@@ -52,20 +87,17 @@ from crawl4ai.extraction_strategy import *
|
|||||||
from crawl4ai.crawler_strategy import *
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
# Define the JavaScript code to click the "Load More" button
|
# Define the JavaScript code to click the "Load More" button
|
||||||
js_code = """
|
js_code = ["""
|
||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""
|
"""]
|
||||||
|
|
||||||
# Define the crawling strategy
|
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
|
||||||
|
|
||||||
# Create the WebCrawler instance with the defined strategy
|
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy)
|
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True)
|
||||||
|
crawler.warmup()
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
semantic_filter="technology",
|
semantic_filter="technology",
|
||||||
),
|
),
|
||||||
@@ -74,6 +106,7 @@ result = crawler.run(
|
|||||||
# Run the crawler with LLM extraction strategy
|
# Run the crawler with LLM extraction strategy
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o",
|
provider="openai/gpt-4o",
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
@@ -201,14 +234,18 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
|
|||||||
"url": "https://www.nbcnews.com/business",
|
"url": "https://www.nbcnews.com/business",
|
||||||
"extracted_content": "...",
|
"extracted_content": "...",
|
||||||
"html": "...",
|
"html": "...",
|
||||||
|
"cleaned_html": "...",
|
||||||
"markdown": "...",
|
"markdown": "...",
|
||||||
"metadata": {...}
|
"media": {...},
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...},
|
||||||
|
"screenshots": "...",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
|
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.
|
||||||
|
|
||||||
|
|
||||||
## Python Library Usage 🚀
|
## Python Library Usage 🚀
|
||||||
@@ -241,6 +278,32 @@ Crawl result without raw HTML content:
|
|||||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Result Structure
|
||||||
|
|
||||||
|
The result object contains the following fields:
|
||||||
|
```python
|
||||||
|
class CrawlResult(BaseModel):
|
||||||
|
url: str
|
||||||
|
html: str
|
||||||
|
success: bool
|
||||||
|
cleaned_html: Optional[str] = None
|
||||||
|
media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
|
||||||
|
links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
|
||||||
|
screenshot: Optional[str] = None # Base64 encoded screenshot
|
||||||
|
markdown: Optional[str] = None
|
||||||
|
extracted_content: Optional[str] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
error_message: Optional[str] = None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Taking Screenshots
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result.screenshot))
|
||||||
|
```
|
||||||
|
|
||||||
### Adding a chunking strategy: RegexChunking
|
### Adding a chunking strategy: RegexChunking
|
||||||
|
|
||||||
Using RegexChunking:
|
Using RegexChunking:
|
||||||
@@ -347,10 +410,12 @@ result = crawler.run(url="https://www.nbcnews.com/business")
|
|||||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||||
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||||
|
| `screenshot` | Whether to take a screenshot of the page. | No | `false` |
|
||||||
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
||||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||||
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
||||||
|
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
|
||||||
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
||||||
|
|
||||||
## Chunking Strategies 📚
|
## Chunking Strategies 📚
|
||||||
|
|||||||
@@ -7,6 +7,15 @@ from selenium.webdriver.support import expected_conditions as EC
|
|||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import InvalidArgumentException
|
from selenium.common.exceptions import InvalidArgumentException
|
||||||
import logging
|
import logging
|
||||||
|
import base64
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import List, Callable
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from .utils import wrap_text
|
||||||
|
|
||||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
@@ -25,15 +34,24 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde
|
|||||||
driver_finder_logger.setLevel(logging.WARNING)
|
driver_finder_logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
import requests
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
class CrawlerStrategy(ABC):
|
class CrawlerStrategy(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def crawl(self, url: str, **kwargs) -> str:
|
def crawl(self, url: str, **kwargs) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def take_screenshot(self, save_path: str):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
pass
|
||||||
|
|
||||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html = False):
|
def __init__(self, use_cached_html = False):
|
||||||
@@ -59,6 +77,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.headless = True
|
self.options.headless = True
|
||||||
|
if kwargs.get("user_agent"):
|
||||||
|
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||||
self.options.add_argument("--no-sandbox")
|
self.options.add_argument("--no-sandbox")
|
||||||
self.options.add_argument("--headless")
|
self.options.add_argument("--headless")
|
||||||
# self.options.add_argument("--disable-dev-shm-usage")
|
# self.options.add_argument("--disable-dev-shm-usage")
|
||||||
@@ -80,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.use_cached_html = use_cached_html
|
self.use_cached_html = use_cached_html
|
||||||
self.js_code = js_code
|
self.js_code = js_code
|
||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
|
|
||||||
|
# Hooks
|
||||||
|
self.hooks = {
|
||||||
|
'on_driver_created': None,
|
||||||
|
'before_get_url': None,
|
||||||
|
'after_get_url': None,
|
||||||
|
'before_return_html': None
|
||||||
|
}
|
||||||
|
|
||||||
# chromedriver_autoinstaller.install()
|
# chromedriver_autoinstaller.install()
|
||||||
import chromedriver_autoinstaller
|
import chromedriver_autoinstaller
|
||||||
@@ -87,33 +115,77 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.service.log_path = "NUL"
|
self.service.log_path = "NUL"
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
if hook_type in self.hooks:
|
||||||
|
self.hooks[hook_type] = hook
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||||
|
|
||||||
|
def execute_hook(self, hook_type: str, *args):
|
||||||
|
hook = self.hooks.get(hook_type)
|
||||||
|
if hook:
|
||||||
|
result = hook(*args)
|
||||||
|
if result is not None:
|
||||||
|
if isinstance(result, webdriver.Chrome):
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
|
||||||
|
# If the hook returns None or there is no hook, return self.driver
|
||||||
|
return self.driver
|
||||||
|
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
self.options.add_argument(f"user-agent={user_agent}")
|
||||||
|
self.driver.quit()
|
||||||
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||||
|
|
||||||
|
def set_custom_headers(self, headers: dict):
|
||||||
|
# Enable Network domain for sending headers
|
||||||
|
self.driver.execute_cdp_cmd('Network.enable', {})
|
||||||
|
# Set extra HTTP headers
|
||||||
|
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
|
||||||
|
|
||||||
|
|
||||||
def crawl(self, url: str) -> str:
|
def crawl(self, url: str) -> str:
|
||||||
|
# Create md5 hash of the URL
|
||||||
|
import hashlib
|
||||||
|
url_hash = hashlib.md5(url.encode()).hexdigest()
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||||
self.driver.get(url)
|
self.driver.get(url)
|
||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||||
)
|
)
|
||||||
|
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
if self.js_code:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
# Optionally, wait for some condition after executing the JS code
|
# Optionally, wait for some condition after executing the JS code
|
||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
lambda driver: driver.execute_script("return document.readyState") == "complete"
|
lambda driver: driver.execute_script("return document.readyState") == "complete"
|
||||||
)
|
)
|
||||||
|
elif self.js_code and type(self.js_code) == list:
|
||||||
|
for js in self.js_code:
|
||||||
|
self.driver.execute_script(js)
|
||||||
|
WebDriverWait(self.driver, 10).until(
|
||||||
|
lambda driver: driver.execute_script("return document.readyState") == "complete"
|
||||||
|
)
|
||||||
|
|
||||||
html = self.driver.page_source
|
html = self.driver.page_source
|
||||||
|
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||||
|
|
||||||
# Store in cache
|
# Store in cache
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
with open(cache_file_path, "w") as f:
|
with open(cache_file_path, "w") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
|
|
||||||
@@ -126,5 +198,62 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||||
|
|
||||||
|
def take_screenshot(self) -> str:
|
||||||
|
try:
|
||||||
|
# Get the dimensions of the page
|
||||||
|
total_width = self.driver.execute_script("return document.body.scrollWidth")
|
||||||
|
total_height = self.driver.execute_script("return document.body.scrollHeight")
|
||||||
|
|
||||||
|
# Set the window size to the dimensions of the page
|
||||||
|
self.driver.set_window_size(total_width, total_height)
|
||||||
|
|
||||||
|
# Take screenshot
|
||||||
|
screenshot = self.driver.get_screenshot_as_png()
|
||||||
|
|
||||||
|
# Open the screenshot with PIL
|
||||||
|
image = Image.open(BytesIO(screenshot))
|
||||||
|
|
||||||
|
# Convert to JPEG and compress
|
||||||
|
buffered = BytesIO()
|
||||||
|
image.save(buffered, format="JPEG", quality=85)
|
||||||
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||||
|
|
||||||
|
return img_base64
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = f"Failed to take screenshot: {str(e)}"
|
||||||
|
print(error_message)
|
||||||
|
|
||||||
|
# Generate an image with black background
|
||||||
|
img = Image.new('RGB', (800, 600), color='black')
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
|
||||||
|
# Load a font
|
||||||
|
try:
|
||||||
|
font = ImageFont.truetype("arial.ttf", 40)
|
||||||
|
except IOError:
|
||||||
|
font = ImageFont.load_default(size=40)
|
||||||
|
|
||||||
|
# Define text color and wrap the text
|
||||||
|
text_color = (255, 255, 255)
|
||||||
|
max_width = 780
|
||||||
|
wrapped_text = wrap_text(draw, error_message, font, max_width)
|
||||||
|
|
||||||
|
# Calculate text position
|
||||||
|
text_position = (10, 10)
|
||||||
|
|
||||||
|
# Draw the text on the image
|
||||||
|
draw.text(text_position, wrapped_text, fill=text_color, font=font)
|
||||||
|
|
||||||
|
# Convert to base64
|
||||||
|
buffered = BytesIO()
|
||||||
|
img.save(buffered, format="JPEG")
|
||||||
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
|
return img_base64
|
||||||
|
|
||||||
def quit(self):
|
def quit(self):
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
@@ -1,13 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from typing import Optional
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||||
os.makedirs(DB_PATH, exist_ok=True)
|
os.makedirs(DB_PATH, exist_ok=True)
|
||||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||||
|
|
||||||
def init_db():
|
def init_db():
|
||||||
global DB_PATH
|
global DB_PATH
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
@@ -19,22 +18,37 @@ def init_db():
|
|||||||
cleaned_html TEXT,
|
cleaned_html TEXT,
|
||||||
markdown TEXT,
|
markdown TEXT,
|
||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN
|
success BOOLEAN,
|
||||||
|
media TEXT DEFAULT "{}",
|
||||||
|
link TEXT DEFAULT "{}",
|
||||||
|
metadata TEXT DEFAULT "{}",
|
||||||
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
''')
|
''')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
def check_db_path():
|
def alter_db_add_screenshot(new_column: str = "media"):
|
||||||
if not DB_PATH:
|
|
||||||
raise ValueError("Database path is not set or is empty.")
|
|
||||||
|
|
||||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
|
cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error altering database to add screenshot column: {e}")
|
||||||
|
|
||||||
|
def check_db_path():
    """Guard used before opening the database.

    Raises:
        ValueError: if the module-level ``DB_PATH`` is falsy (unset or empty).
    """
    if DB_PATH:
        return
    raise ValueError("Database path is not set or is empty.")
|
||||||
|
|
||||||
|
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||||
|
check_db_path()
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
conn.close()
|
conn.close()
|
||||||
return result
|
return result
|
||||||
@@ -42,21 +56,25 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
|||||||
print(f"Error retrieving cached URL: {e}")
|
print(f"Error retrieving cached URL: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
|
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
|
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
html = excluded.html,
|
html = excluded.html,
|
||||||
cleaned_html = excluded.cleaned_html,
|
cleaned_html = excluded.cleaned_html,
|
||||||
markdown = excluded.markdown,
|
markdown = excluded.markdown,
|
||||||
extracted_content = excluded.extracted_content,
|
extracted_content = excluded.extracted_content,
|
||||||
success = excluded.success
|
success = excluded.success,
|
||||||
''', (url, html, cleaned_html, markdown, extracted_content, success))
|
media = excluded.media,
|
||||||
|
links = excluded.links,
|
||||||
|
metadata = excluded.metadata,
|
||||||
|
screenshot = excluded.screenshot
|
||||||
|
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -95,4 +113,20 @@ def flush_db():
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error flushing database: {e}")
|
print(f"Error flushing database: {e}")
|
||||||
|
|
||||||
|
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
    """Backfill ``new_column`` with ``default_value`` on rows where it is NULL.

    Args:
        new_column: Name of the ``crawled_data`` column to backfill.
        default_value: Value written into rows whose ``new_column`` is NULL.

    Errors (including an invalid column name) are reported on stdout and not
    re-raised, matching the other helpers in this module.
    """
    check_db_path()
    conn = None
    try:
        # Column names cannot be bound as SQL parameters, so make sure the name
        # is a plain identifier before interpolating it into the statement.
        if not new_column.isidentifier():
            raise ValueError(f"invalid column name: {new_column!r}")
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        # Filter on the column being backfilled (the previous version always
        # filtered on `screenshot`), and bind the value instead of splicing it
        # into the SQL inside double quotes.
        cursor.execute(
            f'UPDATE crawled_data SET {new_column} = ? WHERE {new_column} IS NULL',
            (default_value,),
        )
        conn.commit()
    except Exception as e:
        print(f"Error updating existing records: {e}")
    finally:
        # Close the connection even when the statement fails.
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # One-off migration: make sure the table exists, add the column, then
    # backfill it using update_existing_records' default value ("{}").
    migrated_column = "metadata"
    init_db()
    alter_db_add_screenshot(migrated_column)
    update_existing_records(migrated_column)
|
||||||
|
|||||||
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
|
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
|
||||||
|
|
||||||
if False and self.device.type == "cpu":
|
# if False and self.device.type == "cpu":
|
||||||
self.model = load_onnx_all_MiniLM_l6_v2()
|
# self.model = load_onnx_all_MiniLM_l6_v2()
|
||||||
self.tokenizer = self.model.tokenizer
|
# self.tokenizer = self.model.tokenizer
|
||||||
self.get_embedding_method = "direct"
|
# self.get_embedding_method = "direct"
|
||||||
else:
|
# else:
|
||||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
|
||||||
self.model.eval()
|
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||||
self.get_embedding_method = "batch"
|
self.model.eval()
|
||||||
|
self.get_embedding_method = "batch"
|
||||||
|
|
||||||
self.buffer_embeddings = np.array([])
|
self.buffer_embeddings = np.array([])
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from functools import lru_cache
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import subprocess, os
|
import subprocess, os
|
||||||
import shutil
|
import shutil
|
||||||
|
import tarfile
|
||||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||||
import argparse
|
import argparse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
@@ -34,8 +35,7 @@ def calculate_batch_size(device):
|
|||||||
else:
|
else:
|
||||||
return 32
|
return 32
|
||||||
else:
|
else:
|
||||||
return 16 # Default batch size
|
return 16 # Default batch size
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def get_device():
|
def get_device():
|
||||||
@@ -82,12 +82,19 @@ def load_bge_small_en_v1_5():
|
|||||||
@lru_cache()
|
@lru_cache()
|
||||||
def load_onnx_all_MiniLM_l6_v2():
|
def load_onnx_all_MiniLM_l6_v2():
|
||||||
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
|
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
|
||||||
model_path = "models/onnx/model.onnx"
|
|
||||||
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
|
|
||||||
download_path = os.path.join(__location__, model_path)
|
|
||||||
|
|
||||||
|
model_path = "models/onnx.tar.gz"
|
||||||
|
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
|
||||||
|
__location__ = os.path.realpath(
|
||||||
|
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
download_path = os.path.join(__location__, model_path)
|
||||||
|
onnx_dir = os.path.join(__location__, "models/onnx")
|
||||||
|
|
||||||
|
# Create the models directory if it does not exist
|
||||||
|
os.makedirs(os.path.dirname(download_path), exist_ok=True)
|
||||||
|
|
||||||
|
# Download the tar.gz file if it does not exist
|
||||||
if not os.path.exists(download_path):
|
if not os.path.exists(download_path):
|
||||||
# Define a download function with a simple progress display
|
|
||||||
def download_with_progress(url, filename):
|
def download_with_progress(url, filename):
|
||||||
def reporthook(block_num, block_size, total_size):
|
def reporthook(block_num, block_size, total_size):
|
||||||
downloaded = block_num * block_size
|
downloaded = block_num * block_size
|
||||||
@@ -95,12 +102,22 @@ def load_onnx_all_MiniLM_l6_v2():
|
|||||||
if downloaded < total_size:
|
if downloaded < total_size:
|
||||||
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
|
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
|
||||||
else:
|
else:
|
||||||
print("\rDownload complete! ")
|
print("\rDownload complete!")
|
||||||
|
|
||||||
urllib.request.urlretrieve(url, filename, reporthook)
|
urllib.request.urlretrieve(url, filename, reporthook)
|
||||||
|
|
||||||
download_with_progress(model_url, download_path)
|
download_with_progress(model_url, download_path)
|
||||||
|
|
||||||
|
# Extract the tar.gz file if the onnx directory does not exist
|
||||||
|
if not os.path.exists(onnx_dir):
|
||||||
|
with tarfile.open(download_path, "r:gz") as tar:
|
||||||
|
tar.extractall(path=os.path.join(__location__, "models"))
|
||||||
|
|
||||||
|
# remove the tar.gz file
|
||||||
|
os.remove(download_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
model = DefaultEmbeddingModel()
|
model = DefaultEmbeddingModel()
|
||||||
return model
|
return model
|
||||||
|
|
||||||
@@ -240,8 +257,8 @@ def download_all_models(remove_existing=False):
|
|||||||
# load_bert_base_uncased()
|
# load_bert_base_uncased()
|
||||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||||
# load_bge_small_en_v1_5()
|
# load_bge_small_en_v1_5()
|
||||||
print("[LOG] Downloading ONNX model...")
|
# print("[LOG] Downloading ONNX model...")
|
||||||
load_onnx_all_MiniLM_l6_v2()
|
# load_onnx_all_MiniLM_l6_v2()
|
||||||
print("[LOG] Downloading text classifier...")
|
print("[LOG] Downloading text classifier...")
|
||||||
_, device = load_text_multilabel_classifier()
|
_, device = load_text_multilabel_classifier()
|
||||||
print(f"[LOG] Text classifier loaded on {device}")
|
print(f"[LOG] Text classifier loaded on {device}")
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
url: HttpUrl
|
url: HttpUrl
|
||||||
@@ -9,8 +9,11 @@ class CrawlResult(BaseModel):
|
|||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
success: bool
|
success: bool
|
||||||
cleaned_html: str = None
|
cleaned_html: Optional[str] = None
|
||||||
markdown: str = None
|
media: Dict[str, List[Dict]] = {}
|
||||||
extracted_content: str = None
|
links: Dict[str, List[Dict]] = {}
|
||||||
metadata: dict = None
|
screenshot: Optional[str] = None
|
||||||
error_message: str = None
|
markdown: Optional[str] = None
|
||||||
|
extracted_content: Optional[str] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
error_message: Optional[str] = None
|
||||||
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
|
|
||||||
super().handle_tag(tag, attrs, start)
|
super().handle_tag(tag, attrs, start)
|
||||||
|
|
||||||
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||||
try:
|
try:
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
for el in selected_elements:
|
for el in selected_elements:
|
||||||
div_tag.append(el)
|
div_tag.append(el)
|
||||||
body = div_tag
|
body = div_tag
|
||||||
|
|
||||||
|
links = {
|
||||||
|
'internal': [],
|
||||||
|
'external': []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract all internal and external links
|
||||||
|
for a in body.find_all('a', href=True):
|
||||||
|
href = a['href']
|
||||||
|
url_base = url.split('/')[2]
|
||||||
|
if href.startswith('http') and url_base not in href:
|
||||||
|
links['external'].append({
|
||||||
|
'href': href,
|
||||||
|
'text': a.get_text()
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
links['internal'].append(
|
||||||
|
{
|
||||||
|
'href': href,
|
||||||
|
'text': a.get_text()
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Remove script, style, and other tags that don't carry useful content from body
|
# Remove script, style, and other tags that don't carry useful content from body
|
||||||
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
||||||
@@ -180,6 +202,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
if tag.name != 'img':
|
if tag.name != 'img':
|
||||||
tag.attrs = {}
|
tag.attrs = {}
|
||||||
|
|
||||||
|
# Extract all img tags into [{src: '', alt: '', type: 'image'}]
|
||||||
|
media = {
|
||||||
|
'images': [],
|
||||||
|
'videos': [],
|
||||||
|
'audios': []
|
||||||
|
}
|
||||||
|
for img in body.find_all('img'):
|
||||||
|
media['images'].append({
|
||||||
|
'src': img.get('src'),
|
||||||
|
'alt': img.get('alt'),
|
||||||
|
"type": "image"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Extract all video tags into [{src: '', alt: ''}]
|
||||||
|
for video in body.find_all('video'):
|
||||||
|
media['videos'].append({
|
||||||
|
'src': video.get('src'),
|
||||||
|
'alt': video.get('alt'),
|
||||||
|
"type": "video"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Extract all audio tags into [{src: '', alt: ''}]
|
||||||
|
for audio in body.find_all('audio'):
|
||||||
|
media['audios'].append({
|
||||||
|
'src': audio.get('src'),
|
||||||
|
'alt': audio.get('alt'),
|
||||||
|
"type": "audio"
|
||||||
|
})
|
||||||
|
|
||||||
# Replace images with their alt text or remove them if no alt text is available
|
# Replace images with their alt text or remove them if no alt text is available
|
||||||
for img in body.find_all('img'):
|
for img in body.find_all('img'):
|
||||||
alt_text = img.get('alt')
|
alt_text = img.get('alt')
|
||||||
@@ -299,13 +350,56 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
return{
|
return{
|
||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': True
|
'success': True,
|
||||||
|
'media': media,
|
||||||
|
'links': links
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('Error processing HTML content:', str(e))
|
print('Error processing HTML content:', str(e))
|
||||||
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata(html):
    """Collect page-level metadata from an HTML document.

    Args:
        html: Raw HTML text. A falsy value (``None`` or ``""``) yields ``{}``.

    Returns:
        A dict with the keys ``title``, ``description``, ``keywords`` and
        ``author`` (each ``None`` when the tag is absent), plus one entry per
        ``<meta property="og:*">`` and ``<meta name="twitter:*">`` tag, keyed by
        the property/name. A meta tag without a ``content`` attribute maps to
        ``None`` instead of raising ``KeyError``.
    """
    metadata = {}

    if not html:
        return metadata

    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Title
    title_tag = soup.find('title')
    metadata['title'] = title_tag.string if title_tag else None

    # Standard <meta name="..."> fields
    for field in ('description', 'keywords', 'author'):
        meta_tag = soup.find('meta', attrs={'name': field})
        metadata[field] = meta_tag.get('content') if meta_tag else None

    # Open Graph metadata
    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
    for tag in og_tags:
        metadata[tag['property']] = tag.get('content')

    # Twitter Card metadata
    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
    for tag in twitter_tags:
        metadata[tag['name']] = tag.get('content')

    return metadata
|
||||||
|
|
||||||
def extract_xml_tags(string):
    """Return the distinct names of attribute-less opening tags in ``string``.

    Only tags of the exact form ``<name>`` match; the order of the returned
    list is unspecified.
    """
    opening_tag = re.compile(r'<(\w+)>')
    distinct_names = set(opening_tag.findall(string))
    return [*distinct_names]
|
||||||
@@ -483,4 +577,16 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
|
|||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
extracted_content.extend(future.result())
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_text(draw, text, font, max_width):
    """Greedily wrap ``text`` so each rendered line fits within ``max_width``.

    Args:
        draw: Object exposing ``textbbox((x, y), text, font=...)`` whose result
            has the right edge at index 2.
        text: Text to wrap; it is split on whitespace.
        font: Font forwarded to ``draw.textbbox``.
        max_width: Maximum allowed right edge for a line.

    Returns:
        The wrapped lines joined with ``"\\n"``. Each word is followed by a
        single space, so every line ends with a trailing space. A word that is
        wider than ``max_width`` on its own is placed alone on a line (the
        previous implementation never consumed such a word and looped forever).
    """
    lines = []
    line = ''
    for word in text.split():
        fits = draw.textbbox((0, 0), line + word, font=font)[2] <= max_width
        # Start a new line only when the current one already holds a word;
        # an empty line always accepts the word so progress is guaranteed.
        if line and not fits:
            lines.append(line)
            line = ''
        line += word + ' '
    if line:
        lines.append(line)
    return '\n'.join(lines)
|
||||||
357
crawl4ai/web_crawler.back.py
Normal file
@@ -0,0 +1,357 @@
|
|||||||
|
import os, time
|
||||||
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .models import UrlModel, CrawlResult
|
||||||
|
from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
|
||||||
|
from .utils import *
|
||||||
|
from .chunking_strategy import *
|
||||||
|
from .extraction_strategy import *
|
||||||
|
from .crawler_strategy import *
|
||||||
|
from typing import List
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from .config import *
|
||||||
|
|
||||||
|
|
||||||
|
class WebCrawler:
    """Crawl pages through a crawler strategy, extract content and cache results.

    Results are read from and written to the cache via ``get_cached_url`` /
    ``cache_url``; rows are indexed as (url, html, cleaned_html, markdown,
    extracted_content, success, media, links, metadata, screenshot).
    """

    def __init__(
        self,
        crawler_strategy: CrawlerStrategy = None,
        always_by_pass_cache: bool = False,
        verbose: bool = False,
    ):
        """Set up the strategy, the ``~/.crawl4ai`` folders and the database.

        Args:
            crawler_strategy: Strategy used to fetch pages; defaults to
                ``LocalSeleniumCrawlerStrategy(verbose=verbose)``.
            always_by_pass_cache: When true, the cache is never read.
            verbose: Forwarded to the default crawler strategy.
        """
        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
        self.always_by_pass_cache = always_by_pass_cache

        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        init_db()

        self.ready = False

    def warmup(self):
        """Run one crawl against a fixed URL, then mark the crawler as ready."""
        print("[LOG] 🌤️ Warming up the WebCrawler")
        self.run(
            url='https://crawl4ai.uccode.io/',
            word_count_threshold=5,
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=False,
            verbose=False,
        )
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")

    def fetch_page(
        self,
        url_model: UrlModel,
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        css_selector: str = None,
        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> CrawlResult:
        """Crawl the URL described by ``url_model`` by delegating to ``run``.

        ``url_model.forced`` is used as ``bypass_cache``. ``provider``,
        ``api_token``, ``extract_blocks_flag`` and ``use_cached_html`` are
        accepted but not forwarded to ``run``.
        """
        return self.run(
            url_model.url,
            word_count_threshold,
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
            css_selector=css_selector,
            screenshot=screenshot,
            **kwargs,
        )

    def run_old(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """Earlier single-method crawl pipeline, kept next to ``run``.

        On a cache hit the cached row is returned as-is; otherwise the page is
        crawled, processed, extracted, cached and returned.

        Raises:
            ValueError: for an unsupported extraction/chunking strategy, an
                invalid CSS selector, or when no content could be extracted.
        """
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
        if not isinstance(extraction_strategy, ExtractionStrategy):
            raise ValueError("Unsupported extraction strategy")
        if not isinstance(chunking_strategy, ChunkingStrategy):
            raise ValueError("Unsupported chunking strategy")

        # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        if not bypass_cache and not self.always_by_pass_cache:
            cached = get_cached_url(url)
            if cached:
                return CrawlResult(
                    **{
                        "url": cached[0],
                        "html": cached[1],
                        "cleaned_html": cached[2],
                        "markdown": cached[3],
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
                        "links": json.loads(cached[7] or "{}"),
                        "metadata": json.loads(cached[8] or "{}"),
                        "screenshot": cached[9],
                        "error_message": "",
                    }
                )

        # Initialize WebDriver for crawling
        t = time.time()
        if kwargs.get("js", None):
            self.crawler_strategy.js_code = kwargs.get("js")
        html = self.crawler_strategy.crawl(url)
        base64_image = None
        if screenshot:
            base64_image = self.crawler_strategy.take_screenshot()
        success = True
        error_message = ""
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))

        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        # CrawlResult declares media/links as dicts, so fall back to {}.
        media = result.get("media", {})
        links = result.get("links", {})

        # Log how long the crawl and content processing took
        if verbose:
            print(
                f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
            )

        if verbose:
            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
        t = time.time()
        # Split markdown into sections
        sections = chunking_strategy.chunk(markdown)

        extracted_content = extraction_strategy.run(
            url, sections,
        )
        extracted_content = json.dumps(extracted_content)

        if verbose:
            print(
                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
            )

        # Cache the result
        cleaned_html = beautify_html(cleaned_html)
        cache_url(
            url,
            html,
            cleaned_html,
            markdown,
            extracted_content,
            success,
            json.dumps(media),
            json.dumps(links),
            json.dumps(metadata),
            screenshot=base64_image,
        )

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
            error_message=error_message,
        )

    def fetch_pages(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> List[CrawlResult]:
        """Crawl several URLs concurrently, returning results in input order.

        Every URL gets the same options. ``kwargs`` are forwarded as keyword
        arguments to ``fetch_page``. The previous implementation spread the
        kwargs dict into ``executor.map`` as extra iterables, which produced an
        empty result list when no kwargs were given.
        """
        extraction_strategy = extraction_strategy or NoExtractionStrategy()

        def fetch_page_wrapper(url_model):
            return self.fetch_page(
                url_model,
                provider=provider,
                api_token=api_token,
                extract_blocks_flag=extract_blocks_flag,
                word_count_threshold=word_count_threshold,
                css_selector=css_selector,
                screenshot=screenshot,
                use_cached_html=use_cached_html,
                extraction_strategy=extraction_strategy,
                chunking_strategy=chunking_strategy,
                **kwargs,
            )

        with ThreadPoolExecutor() as executor:
            results = list(executor.map(fetch_page_wrapper, url_models))

        return results

    def run(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """Fetch ``url`` from cache or by crawling, then hand off to ``process_html``.

        On a cache hit the cached HTML, extracted content and (if requested)
        screenshot are reused; otherwise the crawler strategy fetches the page.

        Raises:
            ValueError: for an unsupported extraction or chunking strategy, or
                anything raised by ``process_html``.
        """
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        if not isinstance(extraction_strategy, ExtractionStrategy):
            raise ValueError("Unsupported extraction strategy")
        if not isinstance(chunking_strategy, ChunkingStrategy):
            raise ValueError("Unsupported chunking strategy")

        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        cached = None
        extracted_content = None
        if not bypass_cache and not self.always_by_pass_cache:
            cached = get_cached_url(url)

        if cached:
            html = cached[1]
            # Index 4 is extracted_content; index 2 (used previously) is cleaned_html.
            extracted_content = cached[4]
            if screenshot:
                screenshot = cached[9]
        else:
            if user_agent:
                self.crawler_strategy.update_user_agent(user_agent)
            html = self.crawler_strategy.crawl(url)
            if screenshot:
                screenshot = self.crawler_strategy.take_screenshot()

        return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)

    def process_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: bool,
        verbose: bool,
        is_cached: bool,
        **kwargs,
    ) -> CrawlResult:
        """Turn raw HTML into a ``CrawlResult`` and cache it when it is new.

        Args:
            extracted_content: Previously extracted content, or ``None`` to run
                the chunking and extraction strategies now.
            screenshot: Whatever ``run`` passes through: the screenshot value
                obtained from the cache or crawler, or a falsy value, which is
                stored as ``None``.
            is_cached: When true the result is not written back to the cache.

        Raises:
            ValueError: on an invalid CSS selector or when no content could be
                extracted from ``html``.
        """
        t = time.time()
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))

        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        # CrawlResult declares media/links as dicts, so fall back to {}.
        media = result.get("media", {})
        links = result.get("links", {})

        if verbose:
            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")

        if extracted_content is None:
            if verbose:
                print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

            # Restart the timer so the message below reports extraction time only.
            t = time.time()
            sections = chunking_strategy.chunk(markdown)
            extracted_content = extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content)

            if verbose:
                print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")

        screenshot = None if not screenshot else screenshot

        if not is_cached:
            cache_url(
                url,
                html,
                cleaned_html,
                markdown,
                extracted_content,
                True,
                json.dumps(media),
                json.dumps(links),
                json.dumps(metadata),
                screenshot=screenshot,
            )

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )
|
||||||
@@ -51,7 +51,6 @@ class WebCrawler:
|
|||||||
self.ready = True
|
self.ready = True
|
||||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(
|
def fetch_page(
|
||||||
self,
|
self,
|
||||||
url_model: UrlModel,
|
url_model: UrlModel,
|
||||||
@@ -59,6 +58,8 @@ class WebCrawler:
|
|||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
extract_blocks_flag: bool = True,
|
extract_blocks_flag: bool = True,
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
|
css_selector: str = None,
|
||||||
|
screenshot: bool = False,
|
||||||
use_cached_html: bool = False,
|
use_cached_html: bool = False,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
@@ -70,111 +71,12 @@ class WebCrawler:
|
|||||||
extraction_strategy or NoExtractionStrategy(),
|
extraction_strategy or NoExtractionStrategy(),
|
||||||
chunking_strategy,
|
chunking_strategy,
|
||||||
bypass_cache=url_model.forced,
|
bypass_cache=url_model.forced,
|
||||||
|
css_selector=css_selector,
|
||||||
|
screenshot=screenshot,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def run(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Crawl ``url``, extract its content, cache the outcome and return it.

    A cached entry is returned directly unless ``bypass_cache`` or
    ``self.always_by_pass_cache`` is set.

    Args:
        url: Page to crawl.
        word_count_threshold: Raised to ``MIN_WORD_THRESHOLD`` when lower.
        extraction_strategy: Defaults to ``NoExtractionStrategy()``.
        chunking_strategy: Splits the markdown before extraction.
        bypass_cache: Skip the cache lookup for this call.
        css_selector: Forwarded to ``get_content_of_website``.
        verbose: Print progress logs and propagate to the strategy.

    Raises:
        ValueError: For an unsupported strategy, an invalid CSS selector,
            or when no content could be extracted.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()
    # Validate before touching the objects, so a wrong argument type
    # surfaces as the intended ValueError rather than an AttributeError.
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")
    extraction_strategy.verbose = verbose

    # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD.
    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD

    # Check cache first.
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)
        if cached:
            return CrawlResult(
                url=cached[0],
                html=cached[1],
                cleaned_html=cached[2],
                markdown=cached[3],
                extracted_content=cached[4],
                success=cached[5],
                error_message="",
            )

    # Fetch the page through the configured crawler strategy.
    t = time.time()
    html = self.crawler_strategy.crawl(url)
    success = True
    error_message = ""

    # Extract content from HTML.
    try:
        result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
        if result is None:
            raise ValueError(f"Failed to extract content from the website: {url}")
    except InvalidCSSSelectorError as e:
        # Keep the original selector error attached as the cause.
        raise ValueError(str(e)) from e

    cleaned_html = result.get("cleaned_html", html)
    markdown = result.get("markdown", "")

    # Log that crawling is done, with the time taken.
    if verbose:
        print(
            f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
        )

    if verbose:
        print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
    t = time.time()
    # Split markdown into sections.
    sections = chunking_strategy.chunk(markdown)

    extracted_content = json.dumps(extraction_strategy.run(url, sections))

    if verbose:
        print(
            f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
        )

    # Cache the result.
    cleaned_html = beautify_html(cleaned_html)
    cache_url(
        url,
        html,
        cleaned_html,
        markdown,
        extracted_content,
        success,
    )

    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown=markdown,
        extracted_content=extracted_content,
        success=success,
        error_message=error_message,
    )
|
|
||||||
|
|
||||||
def fetch_pages(
|
def fetch_pages(
|
||||||
self,
|
self,
|
||||||
url_models: List[UrlModel],
|
url_models: List[UrlModel],
|
||||||
@@ -183,6 +85,8 @@ class WebCrawler:
|
|||||||
extract_blocks_flag: bool = True,
|
extract_blocks_flag: bool = True,
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
use_cached_html: bool = False,
|
use_cached_html: bool = False,
|
||||||
|
css_selector: str = None,
|
||||||
|
screenshot: bool = False,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -200,6 +104,8 @@ class WebCrawler:
|
|||||||
[api_token] * len(url_models),
|
[api_token] * len(url_models),
|
||||||
[extract_blocks_flag] * len(url_models),
|
[extract_blocks_flag] * len(url_models),
|
||||||
[word_count_threshold] * len(url_models),
|
[word_count_threshold] * len(url_models),
|
||||||
|
[css_selector] * len(url_models),
|
||||||
|
[screenshot] * len(url_models),
|
||||||
[use_cached_html] * len(url_models),
|
[use_cached_html] * len(url_models),
|
||||||
[extraction_strategy] * len(url_models),
|
[extraction_strategy] * len(url_models),
|
||||||
[chunking_strategy] * len(url_models),
|
[chunking_strategy] * len(url_models),
|
||||||
@@ -208,3 +114,120 @@ class WebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def run(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    user_agent: str = None,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Fetch ``url`` (from cache or by crawling) and delegate to ``process_html``.

    Args:
        url: Page to crawl.
        word_count_threshold: Raised to ``MIN_WORD_THRESHOLD`` when lower.
        extraction_strategy: Defaults to ``NoExtractionStrategy()``.
        chunking_strategy: Splits the markdown before extraction.
        bypass_cache: Skip the cache lookup for this call.
        css_selector: Forwarded to ``process_html``.
        screenshot: When true, a screenshot is taken after crawling (or
            read from the cache entry) and passed on in place of the flag.
        user_agent: Applied to the crawler strategy before a fresh crawl.
        verbose: Print progress logs and propagate to the strategy.
        **kwargs: Forwarded to ``process_html``.

    Raises:
        ValueError: For an unsupported extraction or chunking strategy.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()
    # Validate before touching the objects, so a wrong argument type
    # surfaces as the intended ValueError rather than an AttributeError.
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")
    extraction_strategy.verbose = verbose

    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD

    # Check cache first
    cached = None
    extracted_content = None
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)

    if cached:
        # Cache rows are assumed to follow the cache_url() argument order:
        # url, html, cleaned_html, markdown, extracted_content, success,
        # media, links, metadata, screenshot. Index 2 is cleaned_html, so
        # the extracted content has to come from index 4.
        html = cached[1]
        extracted_content = cached[4]
        if screenshot:
            screenshot = cached[9]
    else:
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        html = self.crawler_strategy.crawl(url)
        if screenshot:
            screenshot = self.crawler_strategy.take_screenshot()

    return self.process_html(
        url,
        html,
        extracted_content,
        word_count_threshold,
        extraction_strategy,
        chunking_strategy,
        css_selector,
        screenshot,
        verbose,
        bool(cached),
        **kwargs,
    )
|
||||||
|
|
||||||
|
def process_html(
    self,
    url: str,
    html: str,
    extracted_content: str,
    word_count_threshold: int,
    extraction_strategy: ExtractionStrategy,
    chunking_strategy: ChunkingStrategy,
    css_selector: str,
    screenshot: bool,
    verbose: bool,
    is_cached: bool,
    **kwargs,
) -> CrawlResult:
    """Turn raw HTML into a ``CrawlResult`` and cache it when it is new.

    Args:
        url: Address the HTML was fetched from.
        html: Raw page HTML.
        extracted_content: Previously extracted content, or ``None`` to run
            the chunking and extraction strategies here.
        word_count_threshold: Forwarded to ``get_content_of_website``.
        extraction_strategy: Strategy run over the markdown chunks.
        chunking_strategy: Splits the markdown into sections.
        css_selector: Forwarded to ``get_content_of_website``.
        screenshot: Screenshot payload or a falsy value; falsy values are
            normalised to ``None`` (``run`` passes the screenshot data here
            despite the ``bool`` annotation).
        verbose: Print progress logs.
        is_cached: When true the result is not written back to the cache.

    Raises:
        ValueError: On an invalid CSS selector or when no content could be
            extracted.
    """
    t = time.time()
    # Extract content from HTML
    try:
        result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
        metadata = extract_metadata(html)
        if result is None:
            raise ValueError(f"Failed to extract content from the website: {url}")
    except InvalidCSSSelectorError as e:
        # Keep the original selector error attached as the cause.
        raise ValueError(str(e)) from e

    cleaned_html = result.get("cleaned_html", "")
    markdown = result.get("markdown", "")
    media = result.get("media", [])
    links = result.get("links", [])

    if verbose:
        print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")

    if extracted_content is None:
        if verbose:
            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

        # Restart the timer so the log below reports extraction time only,
        # not HTML parsing plus extraction.
        t = time.time()
        sections = chunking_strategy.chunk(markdown)
        extracted_content = json.dumps(extraction_strategy.run(url, sections))

        if verbose:
            print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")

    # Normalise "no screenshot" (False, "", None) to None.
    screenshot = screenshot or None

    if not is_cached:
        cache_url(
            url,
            html,
            cleaned_html,
            markdown,
            extracted_content,
            True,
            json.dumps(media),
            json.dumps(links),
            json.dumps(metadata),
            screenshot=screenshot,
        )

    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown=markdown,
        media=media,
        links=links,
        metadata=metadata,
        screenshot=screenshot,
        extracted_content=extracted_content,
        success=True,
        error_message="",
    )
|
||||||
BIN
docs/.DS_Store
vendored
Normal file
BIN
docs/examples/assets/audio.mp3
Normal file
BIN
docs/examples/assets/basic.png
Normal file
|
After Width: | Height: | Size: 372 KiB |
BIN
docs/examples/assets/cosine_extraction.png
Normal file
|
After Width: | Height: | Size: 403 KiB |
BIN
docs/examples/assets/css_js.png
Normal file
|
After Width: | Height: | Size: 537 KiB |
BIN
docs/examples/assets/css_selector.png
Normal file
|
After Width: | Height: | Size: 375 KiB |
BIN
docs/examples/assets/exec_script.png
Normal file
|
After Width: | Height: | Size: 469 KiB |
BIN
docs/examples/assets/llm_extraction.png
Normal file
|
After Width: | Height: | Size: 477 KiB |
BIN
docs/examples/assets/semantic_extraction_cosine.png
Normal file
|
After Width: | Height: | Size: 419 KiB |
BIN
docs/examples/assets/semantic_extraction_llm.png
Normal file
|
After Width: | Height: | Size: 485 KiB |
3
docs/examples/chainlit.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Welcome to Crawl4AI! 🚀🤖
|
||||||
|
|
||||||
|
Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages will be used as context for its answers.
|
||||||
281
docs/examples/chainlit_review.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
from openai import AsyncOpenAI
|
||||||
|
from chainlit.types import ThreadDict
|
||||||
|
import chainlit as cl
|
||||||
|
from chainlit.input_widget import Select, Switch, Slider
|
||||||
|
# Async OpenAI client shared by the message handlers below.
client = AsyncOpenAI()

# Instrument the OpenAI client
cl.instrument_openai()

# Keyword arguments passed to every chat-completion request.
settings = dict(
    model="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
|
||||||
|
|
||||||
|
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
    """Log a click on the ``action_button`` action and return a thank-you text."""
    acknowledgement = "Thank you for clicking on the action button!"
    print("The user clicked on the action button!")
    return acknowledgement
|
||||||
|
|
||||||
|
@cl.set_chat_profiles
async def chat_profile():
    """Return the two selectable chat profiles (GPT-3.5 and GPT-4)."""
    # (profile name, icon URL) pairs; the description is derived from the name.
    profile_specs = (
        ("GPT-3.5", "https://picsum.photos/200"),
        ("GPT-4", "https://picsum.photos/250"),
    )
    return [
        cl.ChatProfile(
            name=label,
            markdown_description=f"The underlying LLM model is **{label}**.",
            icon=icon_url,
        )
        for label, icon_url in profile_specs
    ]
|
||||||
|
|
||||||
|
@cl.on_chat_start
async def on_chat_start():
    """Publish the settings panel, reset the session and send the demo messages.

    The demo messages showcase, in order: the active chat profile, an image,
    a text element, an audio element, avatars, a file element and an action
    button.
    """
    # Width and height sliders only differ by the axis name.
    dimension_sliders = [
        Slider(
            id=f"SAI_{axis}",
            label=f"Stability AI - Image {axis}",
            initial=512,
            min=256,
            max=2048,
            step=64,
            tooltip="Measured in pixels",
        )
        for axis in ("Width", "Height")
    ]
    widgets = [
        Select(
            id="Model",
            label="OpenAI - Model",
            values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
            initial_index=0,
        ),
        Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
        Slider(
            id="Temperature",
            label="OpenAI - Temperature",
            initial=1,
            min=0,
            max=2,
            step=0.1,
        ),
        Slider(
            id="SAI_Steps",
            label="Stability AI - Steps",
            initial=30,
            min=10,
            max=150,
            step=1,
            description="Amount of inference steps performed on image generation.",
        ),
        Slider(
            id="SAI_Cfg_Scale",
            label="Stability AI - Cfg_Scale",
            initial=7,
            min=1,
            max=35,
            step=0.1,
            description="Influences how strongly your generation is guided to match your prompt.",
        ),
        *dimension_sliders,
    ]
    # The returned settings value is not used by this demo.
    await cl.ChatSettings(widgets).send()

    active_profile = cl.user_session.get("chat_profile")
    await cl.Message(
        content=f"starting chat using the {active_profile} chat profile"
    ).send()

    print("A new chat session has started!")
    cl.user_session.set("session", {"history": [], "context": []})

    # Attach the image to the message
    cat_gif = cl.Image(
        url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif",
        name="cat image",
        display="inline",
    )
    await cl.Message(
        content="You are such a good girl, aren't you?!",
        elements=[cat_gif],
    ).send()

    text_content = "Hello, this is a text element."
    await cl.Message(
        content="Check out this text element!",
        elements=[cl.Text(name="simple_text", content=text_content, display="inline")],
    ).send()

    await cl.Message(
        content="Here is an audio file",
        elements=[cl.Audio(path="./assets/audio.mp3", display="inline")],
    ).send()

    # Register an avatar for "Tool 1" only, then post as both authors.
    await cl.Avatar(
        name="Tool 1",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
    ).send()
    await cl.Message(
        content="This message should not have an avatar!", author="Tool 0"
    ).send()
    await cl.Message(
        content="This message should have an avatar!", author="Tool 1"
    ).send()

    attachment = cl.File(
        name="quickstart.py",
        path="./quickstart.py",
        display="inline",
    )
    await cl.Message(
        content="This message has a file element", elements=[attachment]
    ).send()

    # Sending an action button within a chatbot message
    buttons = [
        cl.Action(name="action_button", value="example_value", description="Click me!")
    ]
    await cl.Message(content="Interact with this action button:", actions=buttons).send()
|
||||||
|
|
||||||
|
# res = await cl.AskActionMessage(
|
||||||
|
# content="Pick an action!",
|
||||||
|
# actions=[
|
||||||
|
# cl.Action(name="continue", value="continue", label="✅ Continue"),
|
||||||
|
# cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
|
||||||
|
# ],
|
||||||
|
# ).send()
|
||||||
|
|
||||||
|
# if res and res.get("value") == "continue":
|
||||||
|
# await cl.Message(
|
||||||
|
# content="Continue!",
|
||||||
|
# ).send()
|
||||||
|
|
||||||
|
# import plotly.graph_objects as go
|
||||||
|
# fig = go.Figure(
|
||||||
|
# data=[go.Bar(y=[2, 1, 3])],
|
||||||
|
# layout_title_text="An example figure",
|
||||||
|
# )
|
||||||
|
# elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
|
||||||
|
|
||||||
|
# await cl.Message(content="This message has a chart", elements=elements).send()
|
||||||
|
|
||||||
|
# Sending a pdf with the local file path
|
||||||
|
# elements = [
|
||||||
|
# cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# cl.Message(content="Look at this local pdf!", elements=elements).send()
|
||||||
|
|
||||||
|
@cl.on_settings_update
async def setup_agent(settings):
    """Log the settings received when the user updates the settings panel."""
    print("on_settings_update", settings)
|
||||||
|
|
||||||
|
@cl.on_stop
def on_stop():
    """Log that the user asked to stop the running task."""
    print("The user wants to stop the task!")
|
||||||
|
|
||||||
|
@cl.on_chat_end
def on_chat_end():
    """Log that the user disconnected."""
    print("The user disconnected!")
|
||||||
|
|
||||||
|
|
||||||
|
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
    """Log that a previous chat session was resumed; ``thread`` is not used."""
    print("The user resumed a previous chat session!")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# @cl.on_message
|
||||||
|
async def on_message(message: cl.Message):
    """Non-streaming handler: record the user turn, query the model, record the reply.

    The reply is only stored in the session history; this variant does not
    send it to the chat.
    """
    user_turn = {"role": "user", "content": message.content}
    cl.user_session.get("session")["history"].append(user_turn)

    system_prompt = {"content": "You are a helpful bot", "role": "system"}
    conversation = [system_prompt, *cl.user_session.get("session")["history"]]
    response = await client.chat.completions.create(messages=conversation, **settings)

    # Add assistant message to the history
    assistant_turn = {
        "role": "assistant",
        "content": response.choices[0].message.content,
    }
    cl.user_session.get("session")["history"].append(assistant_turn)
|
||||||
|
|
||||||
|
# msg.content = response.choices[0].message.content
|
||||||
|
# await msg.update()
|
||||||
|
|
||||||
|
# await cl.Message(content=response.choices[0].message.content).send()
|
||||||
|
|
||||||
|
@cl.on_message
async def on_message(message: cl.Message):
    """Streaming handler: forward the conversation to the model and stream the reply.

    The user turn is appended to the session history first; the streamed
    reply is appended once the stream is exhausted.
    """
    user_turn = {"role": "user", "content": message.content}
    cl.user_session.get("session")["history"].append(user_turn)

    # Send an empty message first so tokens can be streamed into it.
    msg = cl.Message(content="")
    await msg.send()

    system_prompt = {"content": "You are a helpful bot", "role": "system"}
    conversation = [system_prompt, *cl.user_session.get("session")["history"]]
    stream = await client.chat.completions.create(
        messages=conversation,
        stream=True,
        **settings
    )

    async for part in stream:
        token = part.choices[0].delta.content or ""
        if token:
            await msg.stream_token(token)

    # Add assistant message to the history
    assistant_turn = {"role": "assistant", "content": msg.content}
    cl.user_session.get("session")["history"].append(assistant_turn)
    await msg.update()
|
||||||
|
|
||||||
|
# Allow starting the app with `python <this file>` via the chainlit CLI runner.
if __name__ == "__main__":
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)
|
||||||
@@ -39,6 +39,16 @@ def basic_usage(crawler):
|
|||||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def screenshot_usage(crawler):
    """Crawl a page with ``screenshot=True`` and save the image to ``screenshot.png``."""
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")

    # Save the screenshot to a file; the result carries it base64-encoded.
    with open("screenshot.png", "wb") as image_file:
        png_bytes = base64.b64decode(result.screenshot)
        image_file.write(png_bytes)

    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)
|
||||||
|
|
||||||
def understanding_parameters(crawler):
|
def understanding_parameters(crawler):
|
||||||
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
||||||
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
||||||
@@ -156,14 +166,90 @@ def interactive_extraction(crawler):
|
|||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""
|
"""
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code
|
||||||
)
|
)
|
||||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def multiple_scrip(crawler):
    """Run the crawler with a list of two identical 'Load More' click scripts."""
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")

    # The snippet text is kept exactly as it appears in this listing.
    load_more_snippet = """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""
    js_code = [load_more_snippet, load_more_snippet]

    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code,
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
|
||||||
|
|
||||||
|
def using_crawler_hooks(crawler):
    """Register four example hooks on ``crawler`` and crawl https://example.com.

    The hooks demonstrate authentication and cookie set-up on driver
    creation, adding a request header before navigation, logging the URL
    after navigation and logging the HTML size before it is returned.
    """
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get('https://example.com/login')

        # By and EC are imported here next to WebDriverWait so the hook
        # does not depend on module-level selenium imports.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        driver.find_element(By.NAME, 'username').send_keys('testuser')
        driver.find_element(By.NAME, 'password').send_keys('password123')
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )
        # Add a custom cookie
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable Network domain for sending headers
        driver.execute_cdp_cmd('Network.enable', {})
        # Add a custom header
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)

    crawler.set_hook('on_driver_created', on_driver_created)
    crawler.set_hook('before_get_url', before_get_url)
    crawler.set_hook('after_get_url', after_get_url)
    crawler.set_hook('before_return_html', before_return_html)

    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||||
@@ -175,11 +261,13 @@ def main():
|
|||||||
understanding_parameters(crawler)
|
understanding_parameters(crawler)
|
||||||
|
|
||||||
crawler.always_by_pass_cache = True
|
crawler.always_by_pass_cache = True
|
||||||
|
screenshot_usage(crawler)
|
||||||
add_chunking_strategy(crawler)
|
add_chunking_strategy(crawler)
|
||||||
add_extraction_strategy(crawler)
|
add_extraction_strategy(crawler)
|
||||||
add_llm_extraction_strategy(crawler)
|
add_llm_extraction_strategy(crawler)
|
||||||
targeted_extraction(crawler)
|
targeted_extraction(crawler)
|
||||||
interactive_extraction(crawler)
|
interactive_extraction(crawler)
|
||||||
|
multiple_scrip(crawler)
|
||||||
|
|
||||||
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
|
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
|
||||||
|
|
||||||
|
|||||||
241
docs/examples/research_assistant.py
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
# Make sure to install the required packages: chainlit and groq
|
||||||
|
import os, time
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
import chainlit as cl
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
|
from chainlit.element import ElementBased
|
||||||
|
from groq import Groq
|
||||||
|
|
||||||
|
# Import threadpools to run the crawl_url function in a separate thread
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
# OpenAI-compatible async client pointed at the Groq endpoint.
client = AsyncOpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY"),
)

# Instrument the OpenAI client
cl.instrument_openai()

# Keyword arguments passed to every chat-completion request.
settings = dict(
    model="llama3-8b-8192",
    temperature=0.5,
    max_tokens=500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
|
||||||
|
|
||||||
|
def extract_urls(text):
    """Return every ``http://`` or ``https://`` URL found in ``text``, in order.

    A URL runs up to the next whitespace character, so trailing punctuation
    is kept as part of the match.
    """
    return re.findall(r'(https?://\S+)', text)
|
||||||
|
|
||||||
|
def crawl_url(url, timeout=120):
    """Crawl ``url`` through the hosted crawl4ai endpoint and return its markdown.

    Args:
        url: Page to crawl.
        timeout: Seconds to wait for the service before giving up, so a
            stalled request cannot hang the caller indefinitely.

    Returns:
        The ``markdown`` field of the first entry in the response's
        ``results`` list.

    Raises:
        requests.RequestException: On a network failure, a timeout or a
            non-success HTTP status.
    """
    payload = {
        "urls": [url],
        "include_raw_html": True,
        "word_count_threshold": 10,
        "extraction_strategy": "NoExtractionStrategy",
        "chunking_strategy": "RegexChunking"
    }
    response = requests.post("https://crawl4ai.com/crawl", json=payload, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    first_result = response.json()['results'][0]
    return first_result['markdown']
|
||||||
|
|
||||||
|
@cl.on_chat_start
async def on_chat_start():
    """Create an empty session (history list, context dict) and greet the user."""
    fresh_session = {"history": [], "context": {}}
    cl.user_session.set("session", fresh_session)

    greeting = cl.Message(
        content="Welcome to the chat! How can I assist you today?"
    )
    await greeting.send()
|
||||||
|
|
||||||
|
@cl.on_message
async def on_message(message: cl.Message):
    """Crawl any URLs in the message, then answer using the crawled pages as context.

    Each crawled page is stored in the session context under a ``REF_<n>``
    key. The pages are sent to the model as ``<appendix>`` blocks after the
    instructions, the reply is streamed into the chat and stored in the
    history, and a reference list is appended when any page has been crawled.
    """
    # Imported locally so this handler does not rely on a module-level import.
    import asyncio

    user_session = cl.user_session.get("session")

    # Extract URLs from the user's message
    urls = extract_urls(message.content)

    # crawl_url blocks on network I/O; run the calls in worker threads and
    # await them so the event loop stays responsive while pages are fetched.
    results = await asyncio.gather(
        *(asyncio.to_thread(crawl_url, url) for url in urls)
    )

    for url, result in zip(urls, results):
        ref_number = f"REF_{len(user_session['context']) + 1}"
        user_session["context"][ref_number] = {
            "url": url,
            "content": result
        }

    user_session["history"].append({
        "role": "user",
        "content": message.content
    })

    # Create a system message that includes the context
    context_messages = [
        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
        for ref, data in user_session["context"].items()
    ]
    if context_messages:
        instructions = (
            "You are a helpful bot. Use the following context for answering questions. "
            "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
            "If the question requires any information from the provided appendices or context, refer to the sources. "
            "If not, there is no need to add a references section. "
            "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
        )
        # Concatenate explicitly: writing "\n\n".join(...) directly after the
        # literals would make the whole instruction text the join separator.
        system_message = {
            "role": "system",
            "content": instructions + "\n\n".join(context_messages)
        }
    else:
        system_message = {
            "role": "system",
            "content": "You are a helpful assistant."
        }

    msg = cl.Message(content="")
    await msg.send()

    # Get response from the LLM
    stream = await client.chat.completions.create(
        messages=[
            system_message,
            *user_session["history"]
        ],
        stream=True,
        **settings
    )

    assistant_response = ""
    async for part in stream:
        if token := part.choices[0].delta.content:
            assistant_response += token
            await msg.stream_token(token)

    # Add assistant message to the history
    user_session["history"].append({
        "role": "assistant",
        "content": assistant_response
    })
    await msg.update()

    # Append the reference section to the assistant's response, but only
    # when there is at least one crawled source to list.
    if user_session["context"]:
        reference_section = "\n\nReferences:\n" + "".join(
            f"[{ref.split('_')[1]}]: {data['url']}\n"
            for ref, data in user_session["context"].items()
        )
        msg.content += reference_section
        await msg.update()
|
||||||
|
|
||||||
|
|
||||||
|
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
    """Accumulate one incoming audio chunk in the per-user session buffer.

    On the first chunk of a stream (``chunk.isStart``) a fresh in-memory
    buffer is created and stored in the user session under "audio_buffer",
    together with the stream's MIME type under "audio_mime_type". Every
    chunk, including the first, is then appended to the stored buffer.

    Args:
        chunk: The audio chunk; this function reads its ``isStart``,
            ``mimeType`` and ``data`` attributes.
    """
    if chunk.isStart:
        mime_type = chunk.mimeType
        stream = BytesIO()
        # Whisper needs a file name with an extension to recognize the file
        # type; derive it from the subtype part of the MIME type.
        stream.name = f"input_audio.{mime_type.split('/')[1]}"
        # Start a new recording: replace whatever an earlier stream stored.
        cl.user_session.set("audio_buffer", stream)
        cl.user_session.set("audio_mime_type", mime_type)

    # Only buffer here; the whole recording is transcribed once it ends.
    audio_buffer = cl.user_session.get("audio_buffer")
    audio_buffer.write(chunk.data)
|
||||||
|
|
||||||
|
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe audio with the "whisper-large-v3" model and return the text.

    Args:
        audio_file: Forwarded unchanged as the ``file`` argument of
            ``client.audio.transcriptions.create``. The caller in this file
            passes a ``(name, bytes, mime_type)`` tuple.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # The request goes through the shared ``client`` defined elsewhere in this
    # file. A synchronous ``Groq()`` instance used to be constructed here on
    # every call but was never used, so it has been removed.
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )

    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
    """Transcribe the buffered recording and handle it like a typed message.

    Reads the "audio_buffer" and "audio_mime_type" values from the user
    session, sends the recorded bytes to ``speech_to_text``, posts the
    transcription as a message authored by "You", and passes that message to
    ``on_message``.

    Args:
        elements: Provided by the audio-end hook; only referenced by the
            commented-out code below, not by the active code path.
    """
    # Get the audio buffer from the session
    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
    audio_buffer.seek(0)  # Move the file pointer to the beginning
    audio_file = audio_buffer.read()
    audio_mime_type: str = cl.user_session.get("audio_mime_type")

    # input_audio_el = cl.Audio(
    #     mime=audio_mime_type, content=audio_file, name=audio_buffer.name
    # )
    # await cl.Message(
    #     author="You",
    #     type="user_message",
    #     content="",
    #     elements=[input_audio_el, *elements]
    # ).send()

    # answer_message = await cl.Message(content="").send()

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    # (file name, raw bytes, MIME type) tuple handed to the transcription call.
    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
    transcription = await speech_to_text(whisper_input)
    end_time = time.perf_counter()
    print(f"Transcription took {end_time - start_time} seconds")

    user_msg = cl.Message(
        author="You",
        type="user_message",
        content=transcription
    )
    await user_msg.send()
    # Reuse the text-message handler so spoken input gets the same treatment.
    await on_message(user_msg)
|
||||||
|
|
||||||
|
# images = [file for file in elements if "image" in file.mime]
|
||||||
|
|
||||||
|
# text_answer = await generate_text_answer(transcription, images)
|
||||||
|
|
||||||
|
# output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
|
||||||
|
|
||||||
|
# output_audio_el = cl.Audio(
|
||||||
|
# name=output_name,
|
||||||
|
# auto_play=True,
|
||||||
|
# mime=audio_mime_type,
|
||||||
|
# content=output_audio,
|
||||||
|
# )
|
||||||
|
|
||||||
|
# answer_message.elements = [output_audio_el]
|
||||||
|
|
||||||
|
# answer_message.content = transcription
|
||||||
|
# await answer_message.update()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # When this file is executed directly as a script, hand its own path to
    # chainlit's run_chainlit helper.
    from chainlit.cli import run_chainlit

    this_file = __file__
    run_chainlit(this_file)
|
||||||
|
|
||||||
|
|
||||||
|
# No, this is wrong; use this document to answer me: https://console.groq.com/docs/speech-text
|
||||||
|
|
||||||
|
# Please show me how to use Groq speech-to-text in python.
|
||||||
64
docs/examples/rest_call.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
|
||||||
|
"""Example requests against the Crawl4AI REST endpoint.

Only the first payload is actually posted; its screenshot is written to
``screenshot.png``. The remaining ``data`` dictionaries are alternative request
bodies shown for reference: each assignment replaces ``data`` and none of them
is sent by this script.
"""
import base64
import os

import requests

CRAWL_ENDPOINT = "https://crawl4ai.com/crawl"
# requests applies no timeout unless one is given, so a stalled server would
# block this script forever. The value is a generous upper bound in seconds.
REQUEST_TIMEOUT = 120

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
}

response = requests.post(CRAWL_ENDPOINT, json=data, timeout=REQUEST_TIMEOUT)
# Fail with a clear HTTP error instead of a confusing KeyError/JSON error
# when the server answers with a 4xx/5xx status.
response.raise_for_status()
result = response.json()['results'][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
with open("screenshot.png", "wb") as f:
    # The screenshot arrives base64-encoded; decode it back to raw bytes.
    f.write(base64.b64decode(result['screenshot']))

# Example of filtering the content using CSS selectors
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "css_selector": "article",
    "screenshot": True,
}

# Example of executing a JS script on the page before extracting the content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "screenshot": True,
    'js' : ["""
    const loadMoreButton = Array.from(document.querySelectorAll('button')).
    find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """]
}

# Example of using a custom extraction strategy
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "CosineStrategy",
    "extraction_strategy_args": {
        "semantic_filter": "inflation rent prices"
    },
}

# Example of using LLM to extract content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        # Read the key from the environment rather than hard-coding it.
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": """I am interested in only financial news,
and translate them in French."""
    },
}
|
||||||
|
|
||||||
8
main.py
@@ -56,6 +56,8 @@ class CrawlRequest(BaseModel):
|
|||||||
chunking_strategy: Optional[str] = "RegexChunking"
|
chunking_strategy: Optional[str] = "RegexChunking"
|
||||||
chunking_strategy_args: Optional[dict] = {}
|
chunking_strategy_args: Optional[dict] = {}
|
||||||
css_selector: Optional[str] = None
|
css_selector: Optional[str] = None
|
||||||
|
screenshot: Optional[bool] = False
|
||||||
|
user_agent: Optional[str] = None
|
||||||
verbose: Optional[bool] = True
|
verbose: Optional[bool] = True
|
||||||
|
|
||||||
|
|
||||||
@@ -66,7 +68,7 @@ async def read_index(request: Request):
|
|||||||
|
|
||||||
for filename in os.listdir(partials_dir):
|
for filename in os.listdir(partials_dir):
|
||||||
if filename.endswith(".html"):
|
if filename.endswith(".html"):
|
||||||
with open(os.path.join(partials_dir, filename), "r") as file:
|
with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
|
||||||
partials[filename[:-5]] = file.read()
|
partials[filename[:-5]] = file.read()
|
||||||
|
|
||||||
return templates.TemplateResponse("index.html", {"request": request, **partials})
|
return templates.TemplateResponse("index.html", {"request": request, **partials})
|
||||||
@@ -125,6 +127,8 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
|||||||
chunking_strategy,
|
chunking_strategy,
|
||||||
crawl_request.bypass_cache,
|
crawl_request.bypass_cache,
|
||||||
crawl_request.css_selector,
|
crawl_request.css_selector,
|
||||||
|
crawl_request.screenshot,
|
||||||
|
crawl_request.user_agent,
|
||||||
crawl_request.verbose
|
crawl_request.verbose
|
||||||
)
|
)
|
||||||
for url in crawl_request.urls
|
for url in crawl_request.urls
|
||||||
@@ -136,7 +140,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
|||||||
for result in results:
|
for result in results:
|
||||||
result.html = None
|
result.html = None
|
||||||
|
|
||||||
return {"results": [result.dict() for result in results]}
|
return {"results": [result.model_dump() for result in results]}
|
||||||
finally:
|
finally:
|
||||||
async with lock:
|
async with lock:
|
||||||
current_requests -= 1
|
current_requests -= 1
|
||||||
|
|||||||
58
pages/app.js
@@ -104,11 +104,25 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
|||||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||||
chunking_strategy_args: {},
|
chunking_strategy_args: {},
|
||||||
css_selector: document.getElementById("css-selector").value,
|
css_selector: document.getElementById("css-selector").value,
|
||||||
|
screenshot: document.getElementById("screenshot-checkbox").checked,
|
||||||
// instruction: document.getElementById("instruction").value,
|
// instruction: document.getElementById("instruction").value,
|
||||||
// semantic_filter: document.getElementById("semantic_filter").value,
|
// semantic_filter: document.getElementById("semantic_filter").value,
|
||||||
verbose: true,
|
verbose: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// import requests
|
||||||
|
|
||||||
|
// data = {
|
||||||
|
// "urls": [
|
||||||
|
// "https://www.nbcnews.com/business"
|
||||||
|
// ],
|
||||||
|
// "word_count_threshold": 10,
|
||||||
|
// "extraction_strategy": "NoExtractionStrategy",
|
||||||
|
// }
|
||||||
|
|
||||||
|
// response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
||||||
|
// print(response.json())
|
||||||
|
|
||||||
// save api token to local storage
|
// save api token to local storage
|
||||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||||
|
|
||||||
@@ -124,25 +138,61 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
|||||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||||
document.getElementById("markdown-result").textContent = result.markdown;
|
document.getElementById("markdown-result").textContent = result.markdown;
|
||||||
|
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
|
||||||
|
if (result.screenshot){
|
||||||
|
const imgElement = document.createElement("img");
|
||||||
|
// Set the src attribute with the base64 data
|
||||||
|
imgElement.src = `data:image/png;base64,${result.screenshot}`;
|
||||||
|
document.getElementById("screenshot-result").innerHTML = "";
|
||||||
|
document.getElementById("screenshot-result").appendChild(imgElement);
|
||||||
|
}
|
||||||
|
|
||||||
// Update code examples dynamically
|
// Update code examples dynamically
|
||||||
const extractionStrategy = data.extraction_strategy;
|
const extractionStrategy = data.extraction_strategy;
|
||||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||||
|
|
||||||
// REMOVE API TOKEN FROM CODE EXAMPLES
|
// REMOVE API TOKEN FROM CODE EXAMPLES
|
||||||
data.extraction_strategy_args.api_token = "your_api_token";
|
data.extraction_strategy_args.api_token = "your_api_token";
|
||||||
|
|
||||||
|
if (data.extraction_strategy === "NoExtractionStrategy") {
|
||||||
|
delete data.extraction_strategy_args;
|
||||||
|
delete data.extrac_blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.chunking_strategy === "RegexChunking") {
|
||||||
|
delete data.chunking_strategy_args;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete data.verbose;
|
||||||
|
|
||||||
|
if (data.css_selector === "") {
|
||||||
|
delete data.css_selector;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!data.bypass_cache) {
|
||||||
|
delete data.bypass_cache;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!data.extract_blocks) {
|
||||||
|
delete data.extract_blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!data.include_raw_html) {
|
||||||
|
delete data.include_raw_html;
|
||||||
|
}
|
||||||
|
|
||||||
document.getElementById(
|
document.getElementById(
|
||||||
"curl-code"
|
"curl-code"
|
||||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||||
...data,
|
...data,
|
||||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||||
}, null, 2)}' http://localhost:8000/crawl`;
|
}, null, 2)}' https://crawl4ai.com/crawl`;
|
||||||
|
|
||||||
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||||
null,
|
null,
|
||||||
2
|
2
|
||||||
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
)}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||||
|
|
||||||
document.getElementById(
|
document.getElementById(
|
||||||
"nodejs-code"
|
"nodejs-code"
|
||||||
@@ -150,7 +200,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
|||||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||||
null,
|
null,
|
||||||
2
|
2
|
||||||
)};\n\naxios.post("http://localhost:8000/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
)};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||||
|
|
||||||
document.getElementById(
|
document.getElementById(
|
||||||
"library-code"
|
"library-code"
|
||||||
|
|||||||
@@ -25,7 +25,7 @@
|
|||||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||||
|
|
||||||
<div class="mx-auto px-4">
|
<div class="mx-auto px-4">
|
||||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.2</h1>
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
|
||||||
</div>
|
</div>
|
||||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||||
<span>📊 Total Website Processed</span>
|
<span>📊 Total Website Processed</span>
|
||||||
|
|||||||
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
|
|||||||
<div>
|
<div>
|
||||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||||
</div>
|
</div>
|
||||||
|
<!-- Step 3.5 Screenshot -->
|
||||||
|
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||||
|
📸
|
||||||
|
<strong>Let's take a screenshot of the page!</strong>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<pre><code class="language-python">result = crawler.run(
|
||||||
|
url="https://www.nbcnews.com/business",
|
||||||
|
screenshot=True
|
||||||
|
)
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result.screenshot))</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
<!-- Step 4 -->
|
<!-- Step 4 -->
|
||||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||||
@@ -139,13 +153,13 @@ crawler.warmup()</code></pre>
|
|||||||
</div>
|
</div>
|
||||||
<div class="">Using JavaScript to click 'Load More' button:</div>
|
<div class="">Using JavaScript to click 'Load More' button:</div>
|
||||||
<div>
|
<div>
|
||||||
<pre><code class="language-python">js_code = """
|
<pre><code class="language-python">js_code = ["""
|
||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""
|
"""]
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
crawler = WebCrawler(always_by_pass_cache=True)
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Conclusion -->
|
<!-- Conclusion -->
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
|
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
|
||||||
<div class="container mx-auto ">
|
<div class="container mx-auto ">
|
||||||
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
||||||
<div class="flex gap-4">
|
<div class="flex gap-4">
|
||||||
@@ -20,6 +20,7 @@
|
|||||||
id="threshold"
|
id="threshold"
|
||||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
|
||||||
>
|
>
|
||||||
|
<option value="1">1</option>
|
||||||
<option value="5">5</option>
|
<option value="5">5</option>
|
||||||
<option value="10" selected>10</option>
|
<option value="10" selected>10</option>
|
||||||
<option value="15">15</option>
|
<option value="15">15</option>
|
||||||
@@ -124,7 +125,11 @@
|
|||||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex items-center gap-2">
|
<div class="flex items-center gap-2">
|
||||||
<input type="checkbox" id="extract-blocks-checkbox" checked />
|
<input type="checkbox" id="screenshot-checkbox" checked />
|
||||||
|
<label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center gap-2 hidden">
|
||||||
|
<input type="checkbox" id="extract-blocks-checkbox" />
|
||||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
|
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
|
||||||
</div>
|
</div>
|
||||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
|
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
|
||||||
@@ -134,7 +139,7 @@
|
|||||||
<div id="loading" class="hidden">
|
<div id="loading" class="hidden">
|
||||||
<p class="text-white">Loading... Please wait.</p>
|
<p class="text-white">Loading... Please wait.</p>
|
||||||
</div>
|
</div>
|
||||||
<div id="result" class="flex-1">
|
<div id="result" class="flex-1 overflow-x-auto">
|
||||||
<div class="tab-buttons flex gap-2">
|
<div class="tab-buttons flex gap-2">
|
||||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
||||||
JSON
|
JSON
|
||||||
@@ -148,15 +153,23 @@
|
|||||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
||||||
Markdown
|
Markdown
|
||||||
</button>
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
|
||||||
|
Medias
|
||||||
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
|
||||||
|
Screenshot
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||||
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||||
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
||||||
|
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
|
||||||
|
<pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="code_help" class="flex-1">
|
<div id="code_help" class="flex-1 overflow-x-auto">
|
||||||
<div class="tab-buttons flex gap-2">
|
<div class="tab-buttons flex gap-2">
|
||||||
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
||||||
cURL
|
cURL
|
||||||
|
|||||||
13
requirements.crawl.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
aiohttp
|
||||||
|
aiosqlite
|
||||||
|
bs4
|
||||||
|
fastapi
|
||||||
|
html2text
|
||||||
|
httpx
|
||||||
|
pydantic
|
||||||
|
python-dotenv
|
||||||
|
requests
|
||||||
|
rich
|
||||||
|
selenium
|
||||||
|
uvicorn
|
||||||
|
chromedriver-autoinstaller
|
||||||
@@ -1,20 +1,21 @@
|
|||||||
aiohttp==3.9.5
|
aiohttp
|
||||||
aiosqlite==0.20.0
|
aiosqlite
|
||||||
bs4==0.0.2
|
bs4
|
||||||
fastapi==0.111.0
|
fastapi
|
||||||
html2text==2024.2.26
|
html2text
|
||||||
httpx==0.27.0
|
httpx
|
||||||
litellm==1.37.11
|
litellm
|
||||||
nltk==3.8.1
|
nltk
|
||||||
pydantic==2.7.1
|
pydantic
|
||||||
python-dotenv==1.0.1
|
python-dotenv
|
||||||
requests==2.31.0
|
requests
|
||||||
rich==13.7.1
|
rich
|
||||||
scikit-learn==1.4.2
|
scikit-learn
|
||||||
selenium==4.20.0
|
selenium
|
||||||
uvicorn==0.29.0
|
uvicorn
|
||||||
transformers==4.40.2
|
transformers
|
||||||
chromedriver-autoinstaller==0.6.4
|
chromedriver-autoinstaller
|
||||||
torch==2.3.0
|
torch
|
||||||
onnxruntime==1.14.1
|
onnxruntime
|
||||||
tokenizers==0.13.2
|
tokenizers
|
||||||
|
pillow
|
||||||
9
setup.py
@@ -7,11 +7,16 @@ from setuptools.command.install import install
|
|||||||
with open("requirements.txt") as f:
|
with open("requirements.txt") as f:
|
||||||
requirements = f.read().splitlines()
|
requirements = f.read().splitlines()
|
||||||
|
|
||||||
|
# Read the crawl-only requirements from requirements.crawl.txt
|
||||||
|
with open("requirements.crawl.txt") as f:
|
||||||
|
requirements_crawl_only = f.read().splitlines()
|
||||||
|
|
||||||
# Define the requirements for different environments
|
# Define the requirements for different environments
|
||||||
requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
|
requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
|
||||||
requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
|
requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
|
||||||
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
|
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
|
||||||
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
||||||
|
requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
||||||
|
|
||||||
class CustomInstallCommand(install):
|
class CustomInstallCommand(install):
|
||||||
"""Customized setuptools install command to install spacy without dependencies."""
|
"""Customized setuptools install command to install spacy without dependencies."""
|
||||||
@@ -21,7 +26,7 @@ class CustomInstallCommand(install):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.2.2",
|
version="0.2.4",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md").read(),
|
long_description=open("README.md").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
@@ -34,7 +39,7 @@ setup(
|
|||||||
extras_require={
|
extras_require={
|
||||||
"all": requirements, # Include all requirements
|
"all": requirements, # Include all requirements
|
||||||
"colab": requirements_without_torch, # Exclude torch for Colab
|
"colab": requirements_without_torch, # Exclude torch for Colab
|
||||||
"crawl": requirements_without_torch_transformers_nlkt
|
"crawl": requirements_crawl_only, # Include only crawl requirements
|
||||||
},
|
},
|
||||||
cmdclass={
|
cmdclass={
|
||||||
'install': CustomInstallCommand,
|
'install': CustomInstallCommand,
|
||||||
|
|||||||