- Add ONNX embedding model for CPU devices, Update the similarithy threshold, improve the embedding speed.

This commit is contained in:
unclecode
2024-05-19 22:30:10 +08:00
parent bf00c26a83
commit 13a3b21d19
11 changed files with 61433 additions and 49 deletions

View File

@@ -157,7 +157,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
return extracted_content
class CosineStrategy(ExtractionStrategy):
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5', **kwargs):
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
"""
Initialize the strategy with clustering parameters.
@@ -174,56 +174,96 @@ class CosineStrategy(ExtractionStrategy):
self.max_dist = max_dist
self.linkage_method = linkage_method
self.top_k = top_k
self.sim_threshold = sim_threshold
self.timer = time.time()
self.verbose = kwargs.get("verbose", False)
self.buffer_embeddings = np.array([])
self.get_embedding_method = "direct"
self.device = get_device()
self.default_batch_size = calculate_batch_size(self.device)
if self.verbose:
print(f"[LOG] Loading Extraction Model {model_name}")
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
if model_name == "bert-base-uncased":
self.tokenizer, self.model = load_bert_base_uncased()
elif model_name == "BAAI/bge-small-en-v1.5":
if self.device.type == "cpu":
self.model = load_onnx_all_MiniLM_l6_v2()
self.tokenizer = self.model.tokenizer
self.get_embedding_method = "direct"
else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval() # Ensure the model is in evaluation mode
self.buffer_embeddings = None
self.model.eval()
self.get_embedding_method = "batch"
self.buffer_embeddings = np.array([])
# if model_name == "bert-base-uncased":
# self.tokenizer, self.model = load_bert_base_uncased()
# self.model.eval() # Ensure the model is in evaluation mode
# self.get_embedding_method = "batch"
# elif model_name == "BAAI/bge-small-en-v1.5":
# self.tokenizer, self.model = load_bge_small_en_v1_5()
# self.model.eval() # Ensure the model is in evaluation mode
# self.get_embedding_method = "batch"
# elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
# self.model = load_onnx_all_MiniLM_l6_v2()
# self.tokenizer = self.model.tokenizer
# self.get_embedding_method = "direct"
if self.verbose:
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
self.nlp, self.device = load_text_multilabel_classifier()
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
self.default_batch_size = calculate_batch_size(self.device)
if self.verbose:
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
"""
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
:param documents: List of text chunks (documents).
:param semantic_filter: A string containing the keywords for filtering.
:param threshold: Cosine similarity threshold for filtering documents.
:return: Filtered list of documents.
:param at_least_k: Minimum number of documents to return.
:return: List of filtered documents, ensuring at least `at_least_k` documents.
"""
from sklearn.metrics.pairwise import cosine_similarity
if not semantic_filter:
return documents
if len(documents) < at_least_k:
at_least_k = len(documents) // 2
from sklearn.metrics.pairwise import cosine_similarity
# Compute embedding for the keyword filter
query_embedding = self.get_embeddings([semantic_filter])[0]
# Compute embeddings for the docu ments
# Compute embeddings for the documents
document_embeddings = self.get_embeddings(documents)
# Calculate cosine similarity between the query embedding and document embeddings
similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
# Filter documents based on the similarity threshold
filtered_docs = [doc for doc, sim in zip(documents, similarities) if sim >= threshold]
filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
return filtered_docs
def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=True):
# If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
if len(filtered_docs) < at_least_k:
remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
remaining_docs.sort(key=lambda x: x[1], reverse=True)
filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
# Extract the document texts from the tuples
filtered_docs = [doc for doc, _ in filtered_docs]
return filtered_docs[:at_least_k]
def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
"""
Get BERT embeddings for a list of sentences.
@@ -233,29 +273,32 @@ class CosineStrategy(ExtractionStrategy):
# if self.buffer_embeddings.any() and not bypass_buffer:
# return self.buffer_embeddings
import torch
# Tokenize sentences and convert to tensor
if batch_size is None:
batch_size = self.default_batch_size
all_embeddings = []
for i in range(0, len(sentences), batch_size):
batch_sentences = sentences[i:i + batch_size]
encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
if self.device.type in ["gpu", "cuda", "mps"]:
import torch
# Tokenize sentences and convert to tensor
if batch_size is None:
batch_size = self.default_batch_size
all_embeddings = []
for i in range(0, len(sentences), batch_size):
batch_sentences = sentences[i:i + batch_size]
encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
# Ensure no gradients are calculated
with torch.no_grad():
model_output = self.model(**encoded_input)
# Get embeddings from the last hidden state (mean pooling)
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
all_embeddings.append(embeddings)
# Ensure no gradients are calculated
with torch.no_grad():
model_output = self.model(**encoded_input)
# Get embeddings from the last hidden state (mean pooling)
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
all_embeddings.append(embeddings)
self.buffer_embeddings = np.vstack(all_embeddings)
self.buffer_embeddings = np.vstack(all_embeddings)
elif self.device.type == "cpu":
self.buffer_embeddings = self.model(sentences)
return self.buffer_embeddings
def hierarchical_clustering(self, sentences: List[str]):
def hierarchical_clustering(self, sentences: List[str], embeddings = None):
"""
Perform hierarchical clustering on sentences and return cluster labels.
@@ -266,7 +309,7 @@ class CosineStrategy(ExtractionStrategy):
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
self.timer = time.time()
embeddings = self.get_embeddings(sentences, bypass_buffer=False)
embeddings = self.get_embeddings(sentences, bypass_buffer=True)
# print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
# Compute pairwise cosine distances
distance_matrix = pdist(embeddings, 'cosine')

View File

@@ -4,6 +4,8 @@ import subprocess, os
import shutil
from crawl4ai.config import MODEL_REPO_BRANCH
import argparse
import urllib.request
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@lru_cache()
def get_available_memory(device):
@@ -23,18 +25,20 @@ def calculate_batch_size(device):
return 16
elif device.type in ['cuda', 'mps']:
# Adjust these thresholds based on your model size and available memory
if available_memory > 32 * 1024 ** 3: # > 16GB
if available_memory >= 31 * 1024 ** 3: # > 32GB
return 256
elif available_memory > 16 * 1024 ** 3: # > 16GB
elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
return 128
elif available_memory > 8 * 1024 ** 3: # 8GB to 16GB
elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
return 64
else:
return 32
else:
return 16 # Default batch size
def set_model_device(model):
@lru_cache()
def get_device():
import torch
if torch.cuda.is_available():
device = torch.device('cuda')
@@ -42,7 +46,10 @@ def set_model_device(model):
device = torch.device('mps')
else:
device = torch.device('cpu')
return device
def set_model_device(model):
device = get_device()
model.to(device)
return model, device
@@ -72,6 +79,31 @@ def load_bge_small_en_v1_5():
model, device = set_model_device(model)
return tokenizer, model
@lru_cache()
def load_onnx_all_MiniLM_l6_v2():
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
model_path = "models/onnx/model.onnx"
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
download_path = os.path.join(__location__, model_path)
if not os.path.exists(download_path):
# Define a download function with a simple progress display
def download_with_progress(url, filename):
def reporthook(block_num, block_size, total_size):
downloaded = block_num * block_size
percentage = 100 * downloaded / total_size
if downloaded < total_size:
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
else:
print("\rDownload complete! ")
urllib.request.urlretrieve(url, filename, reporthook)
download_with_progress(model_url, download_path)
model = DefaultEmbeddingModel()
return model
@lru_cache()
def load_text_classifier():
from transformers import AutoTokenizer, AutoModelForSequenceClassification
@@ -204,10 +236,12 @@ def download_all_models(remove_existing=False):
print("[LOG] Existing models removed.")
# Load each model to trigger download
print("[LOG] Downloading BERT Base Uncased...")
load_bert_base_uncased()
print("[LOG] Downloading BGE Small EN v1.5...")
load_bge_small_en_v1_5()
# print("[LOG] Downloading BERT Base Uncased...")
# load_bert_base_uncased()
# print("[LOG] Downloading BGE Small EN v1.5...")
# load_bge_small_en_v1_5()
print("[LOG] Downloading ONNX model...")
load_onnx_all_MiniLM_l6_v2()
print("[LOG] Downloading text classifier...")
_, device = load_text_multilabel_classifier()
print(f"[LOG] Text classifier loaded on {device}")

View File

@@ -0,0 +1,25 @@
{
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.27.4",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}

Binary file not shown.

View File

@@ -0,0 +1,7 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"model_max_length": 512,
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/Users/hammad/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/7dbbc90392e2f80f3d3c277d6e90027e55de9125/special_tokens_map.json",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}

30522
crawl4ai/models/onnx/vocab.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,50 @@
# A dependency-light way to run the onnx model
import numpy as np
from typing import List
import os
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
def normalize(v):
norm = np.linalg.norm(v, axis=1)
norm[norm == 0] = 1e-12
return v / norm[:, np.newaxis]
# Sampel implementation of the default sentence-transformers model using ONNX
class DefaultEmbeddingModel():
def __init__(self):
from tokenizers import Tokenizer
import onnxruntime as ort
# max_seq_length = 256, for some reason sentence-transformers uses 256 even though the HF config has a max length of 128
# https://github.com/UKPLab/sentence-transformers/blob/3e1929fddef16df94f8bc6e3b10598a98f46e62d/docs/_static/html/models_en_sentence_embeddings.html#LL480
self.tokenizer = Tokenizer.from_file(os.path.join(__location__, "models/onnx/tokenizer.json"))
self.tokenizer.enable_truncation(max_length=256)
self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=256)
self.model = ort.InferenceSession(os.path.join(__location__,"models/onnx/model.onnx"))
def __call__(self, documents: List[str], batch_size: int = 32):
all_embeddings = []
for i in range(0, len(documents), batch_size):
batch = documents[i:i + batch_size]
encoded = [self.tokenizer.encode(d) for d in batch]
input_ids = np.array([e.ids for e in encoded])
attention_mask = np.array([e.attention_mask for e in encoded])
onnx_input = {
"input_ids": np.array(input_ids, dtype=np.int64),
"attention_mask": np.array(attention_mask, dtype=np.int64),
"token_type_ids": np.array([np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64),
}
model_output = self.model.run(None, onnx_input)
last_hidden_state = model_output[0]
# Perform mean pooling with attention weighting
input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), last_hidden_state.shape)
embeddings = np.sum(last_hidden_state * input_mask_expanded, 1) / np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)
embeddings = normalize(embeddings).astype(np.float32)
all_embeddings.append(embeddings)
return np.concatenate(all_embeddings)

View File

@@ -86,7 +86,7 @@ def add_extraction_strategy(crawler):
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, verbose=True)
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
)
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
print_result(result)

View File

@@ -16,3 +16,5 @@ uvicorn==0.29.0
transformers==4.40.2
chromedriver-autoinstaller==0.6.4
torch==2.3.0
onnxruntime==1.14.1
tokenizers==0.13.2