Compare commits
23 Commits
new-releas
...
v0.2.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6ddccc144c | ||
|
|
53d1176d53 | ||
|
|
52c4be0696 | ||
|
|
13a3b21d19 | ||
|
|
bf00c26a83 | ||
|
|
3846648c12 | ||
|
|
eb6423875f | ||
|
|
e3524a10a7 | ||
|
|
468dad6169 | ||
|
|
bc27982992 | ||
|
|
57e5decb55 | ||
|
|
b6319c6f6e | ||
|
|
0a902f562f | ||
|
|
454135856e | ||
|
|
33fddc27ad | ||
|
|
ce052a4eb5 | ||
|
|
b43d77a56b | ||
|
|
1635a92218 | ||
|
|
2a8a1b27e1 | ||
|
|
f5f3cce2c8 | ||
|
|
a085e6315b | ||
|
|
a8d600a3b4 | ||
|
|
4a2e17447b |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -172,3 +172,5 @@ Crawl4AI.egg-info/
|
||||
|
||||
requirements0.txt
|
||||
a.txt
|
||||
|
||||
*.sh
|
||||
32
CHANGELOG.md
32
CHANGELOG.md
@@ -1,31 +1 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- 🔧 Separate Crawl and Extract JSON Semantic Chunk: Enhancing flexibility and efficiency in large-scale web crawling tasks.
|
||||
- 🔍 Colab Integration: Exploring integration with Google Colab for easy experimentation in a collaborative notebook environment.
|
||||
- 🎯 XPath and CSS Selector Support: Adding support for selective retrieval of specific elements from web pages.
|
||||
- 📷 Image Captioning: Incorporating image captioning capabilities to extract meaningful descriptions from images.
|
||||
- 💾 Embedding Data Generation and Storage: Developing functionalities to generate and store embedding data for each crawled website.
|
||||
- 🔍 Semantic Search Engine: Building a semantic search engine that fetches content, performs vector search similarity, and generates labeled chunk data based on user queries and URLs.
|
||||
|
||||
### Changed
|
||||
- None
|
||||
|
||||
### Deprecated
|
||||
- None
|
||||
|
||||
### Removed
|
||||
- None
|
||||
|
||||
### Fixed
|
||||
- None
|
||||
|
||||
### Security
|
||||
- None
|
||||
|
||||
## [1.0.0] - YYYY-MM-DD
|
||||
- Initial release
|
||||
# Changelog
|
||||
17
Dockerfile
17
Dockerfile
@@ -7,9 +7,6 @@ WORKDIR /usr/src/app
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
@@ -20,11 +17,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& mkdir -p /etc/apt/keyrings \
|
||||
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
|
||||
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& apt-get install -y chromium-chromedriver
|
||||
|
||||
# Install Python dependencies
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install spacy torch torchvision torchaudio
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
@@ -37,4 +40,4 @@ EXPOSE 80
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
|
||||
44
Dockerfile_mac
Normal file
44
Dockerfile_mac
Normal file
@@ -0,0 +1,44 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-slim
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
xvfb \
|
||||
unzip \
|
||||
curl \
|
||||
gnupg2 \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& apt install chromium-chromedriver -y
|
||||
|
||||
# Install spacy library using pip
|
||||
RUN pip install spacy
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Define environment variable
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
19
README.md
19
README.md
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI 🕷️🤖
|
||||
# Crawl4AI v0.2.0 🕷️🤖
|
||||
|
||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||
@@ -10,7 +10,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
||||
|
||||
[](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||
|
||||
## Recent Changes
|
||||
## Recent Changes v0.2.0
|
||||
|
||||
- 🚀 10x faster!!
|
||||
- 📜 Execute custom JavaScript before crawling!
|
||||
@@ -28,7 +28,9 @@ To show the simplicity take a look at the first example:
|
||||
from crawl4ai import WebCrawler
|
||||
|
||||
# Create the WebCrawler instance
|
||||
crawler = WebCrawler()
|
||||
crawler = WebCrawler()
|
||||
|
||||
|
||||
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
@@ -134,7 +136,7 @@ source venv/bin/activate
|
||||
pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
|
||||
```
|
||||
|
||||
💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
|
||||
💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
|
||||
|
||||
crawl4ai-download-models
|
||||
|
||||
@@ -149,21 +151,22 @@ pip install -e .[all]
|
||||
|
||||
3. Use docker to run the local server:
|
||||
```bash
|
||||
docker build -t crawl4ai .
|
||||
# For Mac users
|
||||
# docker build --platform linux/amd64 -t crawl4ai .
|
||||
# For other users
|
||||
# docker build -t crawl4ai .
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
```
|
||||
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the [GitHub repository](https://github.com/unclecode/crawl4ai).
|
||||
|
||||
|
||||
## Using the Local server ot REST API 🌐
|
||||
|
||||
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl`. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
||||
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl` [Available now, on a CPU server, of course will be faster on GPU]. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
||||
|
||||
### Example Usage
|
||||
|
||||
To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with the following parameters in the request body.
|
||||
To use the REST API, send a POST request to `http://localhost:8000/crawl` with the following parameters in the request body.
|
||||
|
||||
**Example Request:**
|
||||
```json
|
||||
|
||||
@@ -16,7 +16,7 @@ class ChunkingStrategy(ABC):
|
||||
|
||||
# Regex-based chunking
|
||||
class RegexChunking(ChunkingStrategy):
|
||||
def __init__(self, patterns=None):
|
||||
def __init__(self, patterns=None, **kwargs):
|
||||
if patterns is None:
|
||||
patterns = [r'\n\n'] # Default split pattern
|
||||
self.patterns = patterns
|
||||
@@ -32,7 +32,7 @@ class RegexChunking(ChunkingStrategy):
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
def __init__(self):
|
||||
def __init__(self, **kwargs):
|
||||
load_nltk_punkt()
|
||||
pass
|
||||
|
||||
@@ -52,7 +52,7 @@ class NlpSentenceChunking(ChunkingStrategy):
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
def __init__(self, num_keywords=3):
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
@@ -82,7 +82,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
# Fixed-length word chunks
|
||||
class FixedLengthWordChunking(ChunkingStrategy):
|
||||
def __init__(self, chunk_size=100):
|
||||
def __init__(self, chunk_size=100, **kwargs):
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
@@ -91,7 +91,7 @@ class FixedLengthWordChunking(ChunkingStrategy):
|
||||
|
||||
# Sliding window chunking
|
||||
class SlidingWindowChunking(ChunkingStrategy):
|
||||
def __init__(self, window_size=100, step=50):
|
||||
def __init__(self, window_size=100, step=50, **kwargs):
|
||||
self.window_size = window_size
|
||||
self.step = step
|
||||
|
||||
|
||||
@@ -6,6 +6,24 @@ from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
import logging
|
||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||
logger.setLevel(logging.WARNING)
|
||||
|
||||
logger_driver = logging.getLogger('selenium.webdriver.common.service')
|
||||
logger_driver.setLevel(logging.WARNING)
|
||||
|
||||
urllib3_logger = logging.getLogger('urllib3.connectionpool')
|
||||
urllib3_logger.setLevel(logging.WARNING)
|
||||
|
||||
# Disable http.client logging
|
||||
http_client_logger = logging.getLogger('http.client')
|
||||
http_client_logger.setLevel(logging.WARNING)
|
||||
|
||||
# Disable driver_finder and service logging
|
||||
driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
|
||||
driver_finder_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
from typing import List
|
||||
import requests
|
||||
@@ -36,22 +54,37 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
return html
|
||||
|
||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html=False, js_code=None):
|
||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||
super().__init__()
|
||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-gpu")
|
||||
self.options.add_argument("--disable-extensions")
|
||||
self.options.add_argument("--headless")
|
||||
# self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-gpu")
|
||||
# self.options.add_argument("--disable-extensions")
|
||||
# self.options.add_argument("--disable-infobars")
|
||||
# self.options.add_argument("--disable-logging")
|
||||
# self.options.add_argument("--disable-popup-blocking")
|
||||
# self.options.add_argument("--disable-translate")
|
||||
# self.options.add_argument("--disable-default-apps")
|
||||
# self.options.add_argument("--disable-background-networking")
|
||||
# self.options.add_argument("--disable-sync")
|
||||
# self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
|
||||
# self.options.add_argument("--disable-browser-side-navigation")
|
||||
# self.options.add_argument("--dns-prefetch-disable")
|
||||
# self.options.add_argument("--disable-web-security")
|
||||
self.options.add_argument("--log-level=3")
|
||||
self.use_cached_html = use_cached_html
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
self.service = Service(chromedriver_autoinstaller.install())
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
@@ -62,6 +95,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
return f.read()
|
||||
|
||||
try:
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
@@ -81,6 +116,9 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
||||
with open(cache_file_path, "w") as f:
|
||||
f.write(html)
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
|
||||
@@ -46,6 +46,7 @@ class ExtractionStrategy(ABC):
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
return [{"index": 0, "content": html}]
|
||||
@@ -141,7 +142,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
if self.provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for ix, section in enumerate(merged_sections):
|
||||
extracted_content.extend(self.extract(ix, url, section))
|
||||
extract_func = partial(self.extract, url)
|
||||
extracted_content.extend(extract_func(ix, section))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
@@ -156,7 +158,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
return extracted_content
|
||||
|
||||
class CosineStrategy(ExtractionStrategy):
|
||||
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5', **kwargs):
|
||||
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
|
||||
@@ -173,48 +175,96 @@ class CosineStrategy(ExtractionStrategy):
|
||||
self.max_dist = max_dist
|
||||
self.linkage_method = linkage_method
|
||||
self.top_k = top_k
|
||||
self.sim_threshold = sim_threshold
|
||||
self.timer = time.time()
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
self.buffer_embeddings = np.array([])
|
||||
self.get_embedding_method = "direct"
|
||||
|
||||
self.device = get_device()
|
||||
self.default_batch_size = calculate_batch_size(self.device)
|
||||
|
||||
if model_name == "bert-base-uncased":
|
||||
self.tokenizer, self.model = load_bert_base_uncased()
|
||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
if self.verbose:
|
||||
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
|
||||
|
||||
if False and self.device.type == "cpu":
|
||||
self.model = load_onnx_all_MiniLM_l6_v2()
|
||||
self.tokenizer = self.model.tokenizer
|
||||
self.get_embedding_method = "direct"
|
||||
else:
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
self.model.eval()
|
||||
self.get_embedding_method = "batch"
|
||||
|
||||
self.buffer_embeddings = np.array([])
|
||||
|
||||
self.nlp = load_text_multilabel_classifier()
|
||||
# if model_name == "bert-base-uncased":
|
||||
# self.tokenizer, self.model = load_bert_base_uncased()
|
||||
# self.model.eval() # Ensure the model is in evaluation mode
|
||||
# self.get_embedding_method = "batch"
|
||||
# elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
# self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
# self.model.eval() # Ensure the model is in evaluation mode
|
||||
# self.get_embedding_method = "batch"
|
||||
# elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
|
||||
# self.model = load_onnx_all_MiniLM_l6_v2()
|
||||
# self.tokenizer = self.model.tokenizer
|
||||
# self.get_embedding_method = "direct"
|
||||
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
|
||||
|
||||
self.nlp, self.device = load_text_multilabel_classifier()
|
||||
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
|
||||
"""
|
||||
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
|
||||
:param documents: List of text chunks (documents).
|
||||
:param semantic_filter: A string containing the keywords for filtering.
|
||||
:param threshold: Cosine similarity threshold for filtering documents.
|
||||
:return: Filtered list of documents.
|
||||
:param at_least_k: Minimum number of documents to return.
|
||||
:return: List of filtered documents, ensuring at least `at_least_k` documents.
|
||||
"""
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
if not semantic_filter:
|
||||
return documents
|
||||
|
||||
if len(documents) < at_least_k:
|
||||
at_least_k = len(documents) // 2
|
||||
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
# Compute embedding for the keyword filter
|
||||
query_embedding = self.get_embeddings([semantic_filter])[0]
|
||||
|
||||
# Compute embeddings for the docu ments
|
||||
# Compute embeddings for the documents
|
||||
document_embeddings = self.get_embeddings(documents)
|
||||
|
||||
# Calculate cosine similarity between the query embedding and document embeddings
|
||||
similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
|
||||
|
||||
# Filter documents based on the similarity threshold
|
||||
filtered_docs = [doc for doc, sim in zip(documents, similarities) if sim >= threshold]
|
||||
filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
|
||||
|
||||
return filtered_docs
|
||||
|
||||
def get_embeddings(self, sentences: List[str], bypass_buffer=True):
|
||||
# If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
|
||||
if len(filtered_docs) < at_least_k:
|
||||
remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
|
||||
remaining_docs.sort(key=lambda x: x[1], reverse=True)
|
||||
filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
|
||||
|
||||
# Extract the document texts from the tuples
|
||||
filtered_docs = [doc for doc, _ in filtered_docs]
|
||||
|
||||
return filtered_docs[:at_least_k]
|
||||
|
||||
def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
|
||||
"""
|
||||
Get BERT embeddings for a list of sentences.
|
||||
|
||||
@@ -224,19 +274,42 @@ class CosineStrategy(ExtractionStrategy):
|
||||
# if self.buffer_embeddings.any() and not bypass_buffer:
|
||||
# return self.buffer_embeddings
|
||||
|
||||
import torch
|
||||
# Tokenize sentences and convert to tensor
|
||||
encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
||||
# Compute token embeddings
|
||||
with torch.no_grad():
|
||||
model_output = self.model(**encoded_input)
|
||||
if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
|
||||
import torch
|
||||
# Tokenize sentences and convert to tensor
|
||||
if batch_size is None:
|
||||
batch_size = self.default_batch_size
|
||||
|
||||
all_embeddings = []
|
||||
for i in range(0, len(sentences), batch_size):
|
||||
batch_sentences = sentences[i:i + batch_size]
|
||||
encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
|
||||
encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
|
||||
|
||||
# Ensure no gradients are calculated
|
||||
with torch.no_grad():
|
||||
model_output = self.model(**encoded_input)
|
||||
|
||||
# Get embeddings from the last hidden state (mean pooling)
|
||||
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
|
||||
all_embeddings.append(embeddings)
|
||||
|
||||
# Get embeddings from the last hidden state (mean pooling)
|
||||
embeddings = model_output.last_hidden_state.mean(1)
|
||||
self.buffer_embeddings = embeddings.numpy()
|
||||
return embeddings.numpy()
|
||||
self.buffer_embeddings = np.vstack(all_embeddings)
|
||||
elif self.device.type == "cpu":
|
||||
# self.buffer_embeddings = self.model(sentences)
|
||||
if batch_size is None:
|
||||
batch_size = self.default_batch_size
|
||||
|
||||
all_embeddings = []
|
||||
for i in range(0, len(sentences), batch_size):
|
||||
batch_sentences = sentences[i:i + batch_size]
|
||||
embeddings = self.model(batch_sentences)
|
||||
all_embeddings.append(embeddings)
|
||||
|
||||
self.buffer_embeddings = np.vstack(all_embeddings)
|
||||
return self.buffer_embeddings
|
||||
|
||||
def hierarchical_clustering(self, sentences: List[str]):
|
||||
def hierarchical_clustering(self, sentences: List[str], embeddings = None):
|
||||
"""
|
||||
Perform hierarchical clustering on sentences and return cluster labels.
|
||||
|
||||
@@ -247,7 +320,7 @@ class CosineStrategy(ExtractionStrategy):
|
||||
from scipy.cluster.hierarchy import linkage, fcluster
|
||||
from scipy.spatial.distance import pdist
|
||||
self.timer = time.time()
|
||||
embeddings = self.get_embeddings(sentences, bypass_buffer=False)
|
||||
embeddings = self.get_embeddings(sentences, bypass_buffer=True)
|
||||
# print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
|
||||
# Compute pairwise cosine distances
|
||||
distance_matrix = pdist(embeddings, 'cosine')
|
||||
@@ -311,20 +384,33 @@ class CosineStrategy(ExtractionStrategy):
|
||||
# Convert filtered clusters to a sorted list of dictionaries
|
||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Assign tags using {self.device}")
|
||||
|
||||
for cluster, label in zip(cluster_list, labels):
|
||||
cluster['tags'] = label
|
||||
if self.device.type in ["gpu", "cuda", "mps"]:
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
|
||||
for cluster, label in zip(cluster_list, labels):
|
||||
cluster['tags'] = label
|
||||
elif self.device == "cpu":
|
||||
# Process the text with the loaded model
|
||||
texts = [cluster['content'] for cluster in cluster_list]
|
||||
# Batch process texts
|
||||
docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
|
||||
|
||||
# Process the text with the loaded model
|
||||
# for cluster in cluster_list:
|
||||
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
for doc, cluster in zip(docs, cluster_list):
|
||||
tok_k = self.top_k
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# for cluster in cluster_list:
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
|
||||
return cluster_list
|
||||
|
||||
@@ -463,4 +549,4 @@ class ContentSummarizationStrategy(ExtractionStrategy):
|
||||
|
||||
# Sort summaries by the original section index to maintain order
|
||||
summaries.sort(key=lambda x: x[0])
|
||||
return [summary for _, summary in summaries]
|
||||
return [summary for _, summary in summaries]
|
||||
|
||||
@@ -4,7 +4,56 @@ import subprocess, os
|
||||
import shutil
|
||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||
import argparse
|
||||
import urllib.request
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
@lru_cache()
|
||||
def get_available_memory(device):
|
||||
import torch
|
||||
if device.type == 'cuda':
|
||||
return torch.cuda.get_device_properties(device).total_memory
|
||||
elif device.type == 'mps':
|
||||
return 48 * 1024 ** 3 # Assuming 8GB for MPS, as a conservative estimate
|
||||
else:
|
||||
return 0
|
||||
|
||||
@lru_cache()
|
||||
def calculate_batch_size(device):
|
||||
available_memory = get_available_memory(device)
|
||||
|
||||
if device.type == 'cpu':
|
||||
return 16
|
||||
elif device.type in ['cuda', 'mps']:
|
||||
# Adjust these thresholds based on your model size and available memory
|
||||
if available_memory >= 31 * 1024 ** 3: # > 32GB
|
||||
return 256
|
||||
elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
|
||||
return 128
|
||||
elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
|
||||
return 64
|
||||
else:
|
||||
return 32
|
||||
else:
|
||||
return 16 # Default batch size
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def get_device():
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device('cuda')
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device('mps')
|
||||
else:
|
||||
device = torch.device('cpu')
|
||||
return device
|
||||
|
||||
def set_model_device(model):
|
||||
device = get_device()
|
||||
model.to(device)
|
||||
return model, device
|
||||
|
||||
@lru_cache()
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
@@ -17,6 +66,8 @@ def load_bert_base_uncased():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
@@ -25,17 +76,45 @@ def load_bge_small_en_v1_5():
|
||||
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_onnx_all_MiniLM_l6_v2():
|
||||
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
|
||||
model_path = "models/onnx/model.onnx"
|
||||
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
|
||||
download_path = os.path.join(__location__, model_path)
|
||||
|
||||
if not os.path.exists(download_path):
|
||||
# Define a download function with a simple progress display
|
||||
def download_with_progress(url, filename):
|
||||
def reporthook(block_num, block_size, total_size):
|
||||
downloaded = block_num * block_size
|
||||
percentage = 100 * downloaded / total_size
|
||||
if downloaded < total_size:
|
||||
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
|
||||
else:
|
||||
print("\rDownload complete! ")
|
||||
|
||||
urllib.request.urlretrieve(url, filename, reporthook)
|
||||
|
||||
download_with_progress(model_url, download_path)
|
||||
|
||||
model = DefaultEmbeddingModel()
|
||||
return model
|
||||
|
||||
@lru_cache()
|
||||
def load_text_classifier():
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from transformers import pipeline
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
|
||||
return pipe
|
||||
|
||||
@lru_cache()
|
||||
@@ -45,20 +124,21 @@ def load_text_multilabel_classifier():
|
||||
from scipy.special import expit
|
||||
import torch
|
||||
|
||||
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||
class_mapping = model.config.id2label
|
||||
|
||||
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda")
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device("mps")
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
return load_spacy_model(), torch.device("cpu")
|
||||
|
||||
model.to(device)
|
||||
|
||||
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
class_mapping = model.config.id2label
|
||||
|
||||
def _classifier(texts, threshold=0.5, max_length=64):
|
||||
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
|
||||
@@ -78,7 +158,7 @@ def load_text_multilabel_classifier():
|
||||
|
||||
return batch_labels
|
||||
|
||||
return _classifier
|
||||
return _classifier, device
|
||||
|
||||
@lru_cache()
|
||||
def load_nltk_punkt():
|
||||
@@ -89,6 +169,58 @@ def load_nltk_punkt():
|
||||
nltk.download('punkt')
|
||||
return nltk.data.find('tokenizers/punkt')
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
import spacy
|
||||
name = "models/reuters"
|
||||
home_folder = get_home_folder()
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# Check if the model directory already exists
|
||||
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
# branch = "main"
|
||||
branch = MODEL_REPO_BRANCH
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
shutil.rmtree(repo_folder)
|
||||
shutil.rmtree(model_folder)
|
||||
|
||||
try:
|
||||
# Clone the repository
|
||||
subprocess.run(
|
||||
["git", "clone", "-b", branch, repo_url, repo_folder],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
check=True
|
||||
)
|
||||
|
||||
# Create the models directory if it doesn't exist
|
||||
models_folder = os.path.join(home_folder, "models")
|
||||
os.makedirs(models_folder, exist_ok=True)
|
||||
|
||||
# Copy the reuters model folder to the models directory
|
||||
source_folder = os.path.join(repo_folder, "models/reuters")
|
||||
shutil.copytree(source_folder, model_folder)
|
||||
|
||||
# Remove the cloned repository
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
# Print completion message
|
||||
# print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
return spacy.load(model_folder)
|
||||
|
||||
def download_all_models(remove_existing=False):
|
||||
"""Download all models required for Crawl4AI."""
|
||||
if remove_existing:
|
||||
@@ -104,12 +236,15 @@ def download_all_models(remove_existing=False):
|
||||
print("[LOG] Existing models removed.")
|
||||
|
||||
# Load each model to trigger download
|
||||
print("[LOG] Downloading BERT Base Uncased...")
|
||||
load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading BERT Base Uncased...")
|
||||
# load_bert_base_uncased()
|
||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
# load_bge_small_en_v1_5()
|
||||
print("[LOG] Downloading ONNX model...")
|
||||
load_onnx_all_MiniLM_l6_v2()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
load_text_multilabel_classifier
|
||||
_, device = load_text_multilabel_classifier()
|
||||
print(f"[LOG] Text classifier loaded on {device}")
|
||||
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||
load_nltk_punkt()
|
||||
print("[LOG] ✅ All models downloaded successfully.")
|
||||
@@ -124,4 +259,4 @@ def main():
|
||||
download_all_models(remove_existing=args.remove_existing)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
25
crawl4ai/models/onnx/config.json
Normal file
25
crawl4ai/models/onnx/config.json
Normal file
@@ -0,0 +1,25 @@
|
||||
{
|
||||
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
|
||||
"architectures": [
|
||||
"BertModel"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"classifier_dropout": null,
|
||||
"gradient_checkpointing": false,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 384,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 1536,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"max_position_embeddings": 512,
|
||||
"model_type": "bert",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 6,
|
||||
"pad_token_id": 0,
|
||||
"position_embedding_type": "absolute",
|
||||
"transformers_version": "4.27.4",
|
||||
"type_vocab_size": 2,
|
||||
"use_cache": true,
|
||||
"vocab_size": 30522
|
||||
}
|
||||
BIN
crawl4ai/models/onnx/model.onnx
Normal file
BIN
crawl4ai/models/onnx/model.onnx
Normal file
Binary file not shown.
7
crawl4ai/models/onnx/special_tokens_map.json
Normal file
7
crawl4ai/models/onnx/special_tokens_map.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"cls_token": "[CLS]",
|
||||
"mask_token": "[MASK]",
|
||||
"pad_token": "[PAD]",
|
||||
"sep_token": "[SEP]",
|
||||
"unk_token": "[UNK]"
|
||||
}
|
||||
30686
crawl4ai/models/onnx/tokenizer.json
Normal file
30686
crawl4ai/models/onnx/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
15
crawl4ai/models/onnx/tokenizer_config.json
Normal file
15
crawl4ai/models/onnx/tokenizer_config.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"cls_token": "[CLS]",
|
||||
"do_basic_tokenize": true,
|
||||
"do_lower_case": true,
|
||||
"mask_token": "[MASK]",
|
||||
"model_max_length": 512,
|
||||
"never_split": null,
|
||||
"pad_token": "[PAD]",
|
||||
"sep_token": "[SEP]",
|
||||
"special_tokens_map_file": "/Users/hammad/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/7dbbc90392e2f80f3d3c277d6e90027e55de9125/special_tokens_map.json",
|
||||
"strip_accents": null,
|
||||
"tokenize_chinese_chars": true,
|
||||
"tokenizer_class": "BertTokenizer",
|
||||
"unk_token": "[UNK]"
|
||||
}
|
||||
30522
crawl4ai/models/onnx/vocab.txt
Normal file
30522
crawl4ai/models/onnx/vocab.txt
Normal file
File diff suppressed because it is too large
Load Diff
50
crawl4ai/onnx_embedding.py
Normal file
50
crawl4ai/onnx_embedding.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# A dependency-light way to run the onnx model
|
||||
|
||||
|
||||
import numpy as np
|
||||
from typing import List
|
||||
import os
|
||||
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
|
||||
def normalize(v):
|
||||
norm = np.linalg.norm(v, axis=1)
|
||||
norm[norm == 0] = 1e-12
|
||||
return v / norm[:, np.newaxis]
|
||||
|
||||
# Sampel implementation of the default sentence-transformers model using ONNX
|
||||
class DefaultEmbeddingModel():
|
||||
|
||||
def __init__(self):
|
||||
from tokenizers import Tokenizer
|
||||
import onnxruntime as ort
|
||||
# max_seq_length = 256, for some reason sentence-transformers uses 256 even though the HF config has a max length of 128
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/3e1929fddef16df94f8bc6e3b10598a98f46e62d/docs/_static/html/models_en_sentence_embeddings.html#LL480
|
||||
self.tokenizer = Tokenizer.from_file(os.path.join(__location__, "models/onnx/tokenizer.json"))
|
||||
self.tokenizer.enable_truncation(max_length=256)
|
||||
self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=256)
|
||||
self.model = ort.InferenceSession(os.path.join(__location__,"models/onnx/model.onnx"))
|
||||
|
||||
|
||||
def __call__(self, documents: List[str], batch_size: int = 32):
|
||||
all_embeddings = []
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i + batch_size]
|
||||
encoded = [self.tokenizer.encode(d) for d in batch]
|
||||
input_ids = np.array([e.ids for e in encoded])
|
||||
attention_mask = np.array([e.attention_mask for e in encoded])
|
||||
onnx_input = {
|
||||
"input_ids": np.array(input_ids, dtype=np.int64),
|
||||
"attention_mask": np.array(attention_mask, dtype=np.int64),
|
||||
"token_type_ids": np.array([np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64),
|
||||
}
|
||||
model_output = self.model.run(None, onnx_input)
|
||||
last_hidden_state = model_output[0]
|
||||
# Perform mean pooling with attention weighting
|
||||
input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), last_hidden_state.shape)
|
||||
embeddings = np.sum(last_hidden_state * input_mask_expanded, 1) / np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)
|
||||
embeddings = normalize(embeddings).astype(np.float32)
|
||||
all_embeddings.append(embeddings)
|
||||
return np.concatenate(all_embeddings)
|
||||
|
||||
@@ -19,9 +19,10 @@ class WebCrawler:
|
||||
# db_path: str = None,
|
||||
crawler_strategy: CrawlerStrategy = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
verbose: bool = False,
|
||||
):
|
||||
# self.db_path = db_path
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
|
||||
@@ -12,7 +12,7 @@ console = Console()
|
||||
|
||||
@lru_cache()
|
||||
def create_crawler():
|
||||
crawler = WebCrawler()
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
@@ -86,7 +86,7 @@ def add_extraction_strategy(crawler):
|
||||
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
14
main.py
14
main.py
@@ -2,6 +2,8 @@ import os
|
||||
import importlib
|
||||
import asyncio
|
||||
from functools import lru_cache
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
@@ -41,7 +43,7 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
|
||||
@lru_cache()
|
||||
def get_crawler():
|
||||
# Initialize and return a WebCrawler instance
|
||||
return WebCrawler()
|
||||
return WebCrawler(verbose = True)
|
||||
|
||||
class CrawlRequest(BaseModel):
|
||||
urls: List[str]
|
||||
@@ -77,7 +79,7 @@ async def get_total_url_count():
|
||||
# Add endpoit to clear db
|
||||
@app.get("/clear-db")
|
||||
async def clear_database():
|
||||
clear_db()
|
||||
# clear_db()
|
||||
return JSONResponse(content={"message": "Database cleared."})
|
||||
|
||||
def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
||||
@@ -86,12 +88,15 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
||||
strategy_class = getattr(module, class_name)
|
||||
return strategy_class(*args, **kwargs)
|
||||
except ImportError:
|
||||
print("ImportError: Module not found.")
|
||||
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
|
||||
except AttributeError:
|
||||
print("AttributeError: Class not found.")
|
||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||
|
||||
@app.post("/crawl")
|
||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||
global current_requests
|
||||
async with lock:
|
||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||
@@ -99,10 +104,15 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||
current_requests += 1
|
||||
|
||||
try:
|
||||
logging.debug("[LOG] Loading extraction and chunking strategies...")
|
||||
crawl_request.extraction_strategy_args['verbose'] = True
|
||||
crawl_request.chunking_strategy_args['verbose'] = True
|
||||
|
||||
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
|
||||
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
|
||||
|
||||
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
|
||||
logging.debug("[LOG] Running the WebCrawler...")
|
||||
with ThreadPoolExecutor() as executor:
|
||||
loop = asyncio.get_event_loop()
|
||||
futures = [
|
||||
|
||||
@@ -136,13 +136,13 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
}, null, 2)}' http://crawl4ai.com/crawl`;
|
||||
}, null, 2)}' http://localhost:8000/crawl`;
|
||||
|
||||
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
@@ -150,7 +150,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
)};\n\naxios.post("http://localhost:8000/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.2</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
|
||||
@@ -4,7 +4,6 @@ bs4==0.0.2
|
||||
fastapi==0.111.0
|
||||
html2text==2024.2.26
|
||||
httpx==0.27.0
|
||||
lazy_import==0.2.2
|
||||
litellm==1.37.11
|
||||
nltk==3.8.1
|
||||
pydantic==2.7.1
|
||||
@@ -16,4 +15,6 @@ selenium==4.20.0
|
||||
uvicorn==0.29.0
|
||||
transformers==4.40.2
|
||||
chromedriver-autoinstaller==0.6.4
|
||||
torch==2.3.0
|
||||
torch==2.3.0
|
||||
onnxruntime==1.14.1
|
||||
tokenizers==0.13.2
|
||||
14
setup.py
14
setup.py
@@ -1,4 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os
|
||||
import subprocess
|
||||
from setuptools.command.install import install
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.txt") as f:
|
||||
@@ -10,9 +13,15 @@ requirements_without_transformers = [req for req in requirements if not req.star
|
||||
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
|
||||
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
|
||||
|
||||
class CustomInstallCommand(install):
|
||||
"""Customized setuptools install command to install spacy without dependencies."""
|
||||
def run(self):
|
||||
install.run(self)
|
||||
subprocess.check_call([os.sys.executable, '-m', 'pip', 'install', 'spacy', '--no-deps'])
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.0",
|
||||
version="0.2.2",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
@@ -27,6 +36,9 @@ setup(
|
||||
"colab": requirements_without_torch, # Exclude torch for Colab
|
||||
"crawl": requirements_without_torch_transformers_nlkt
|
||||
},
|
||||
cmdclass={
|
||||
'install': CustomInstallCommand,
|
||||
},
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'crawl4ai-download-models=crawl4ai.model_loader:main',
|
||||
|
||||
Reference in New Issue
Block a user