Remove dependency on spaCy model.
This commit is contained in:
@@ -1,12 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import re
|
||||
# spacy = lazy_import.lazy_module('spacy')
|
||||
# nl = lazy_import.lazy_module('nltk')
|
||||
# from nltk.corpus import stopwords
|
||||
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_spacy_en_core_web_sm
|
||||
from .model_loader import load_nltk_punkt
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
|
||||
paragraphs = new_paragraphs
|
||||
return paragraphs
|
||||
|
||||
# NLP-based sentence chunking using spaCy
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
    """Chunk text into sentences using NLTK's ``sent_tokenize``.

    Constructing an instance calls ``load_nltk_punkt`` so the Punkt
    tokenizer data is located (or downloaded) before ``chunk`` is used.
    """

    def __init__(self):
        # sent_tokenize relies on the Punkt data; make sure it is available.
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        """Return the distinct sentences found in *text*.

        Args:
            text: The text to split.

        Returns:
            A list of whitespace-stripped sentences with duplicates removed,
            in the order each sentence first appears in *text*.
        """
        # Imported at call time, as the original code does.
        from nltk.tokenize import sent_tokenize

        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        # dict.fromkeys removes duplicates like set() did, but keeps document
        # order instead of returning the sentences in arbitrary hash order.
        return list(dict.fromkeys(stripped))
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
|
||||
from .config import *
|
||||
from .utils import *
|
||||
from functools import partial
|
||||
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
from .model_loader import *
|
||||
|
||||
|
||||
import numpy as np
|
||||
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
    """Extraction strategy that passes its input through without processing."""

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Wrap *html* in a single block.

        ``url`` and any extra positional/keyword arguments are accepted but
        not used.

        Returns:
            A one-element list: ``{"index": 0, "content": html}``.
        """
        return [{"index": 0, "content": html}]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Turn every section into a block without extracting anything.

        ``url`` and any extra positional/keyword arguments are accepted but
        not used.

        Returns:
            One dict per section, in input order, holding the section's
            position as ``index``, an empty ``tags`` list and the section
            text as ``content``.
        """
        return [
            {"index": position, "tags": [], "content": section}
            for position, section in enumerate(sections)
        ]
|
||||
|
||||
|
||||
class LLMExtractionStrategy(ExtractionStrategy):
|
||||
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
|
||||
"""
|
||||
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
from transformers import BertTokenizer, BertModel, pipeline
|
||||
from transformers import AutoTokenizer, AutoModel
|
||||
import spacy
|
||||
|
||||
self.semantic_filter = semantic_filter
|
||||
self.word_count_threshold = word_count_threshold
|
||||
self.max_dist = max_dist
|
||||
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
|
||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
|
||||
self.nlp = load_spacy_model()
|
||||
self.nlp = load_text_multilabel_classifier()
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
||||
"""
|
||||
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
|
||||
|
||||
# Convert filtered clusters to a sorted list of dictionaries
|
||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
|
||||
for cluster, label in zip(cluster_list, labels):
|
||||
cluster['tags'] = label
|
||||
|
||||
# Process the text with the loaded model
|
||||
for cluster in cluster_list:
|
||||
doc = self.nlp(cluster['content'])
|
||||
tok_k = self.top_k
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
# for cluster in cluster_list:
|
||||
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
|
||||
|
||||
@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
def load_spacy_en_core_web_sm():
    """Load the spaCy ``en_core_web_sm`` pipeline, downloading it if needed.

    The result is memoized by ``lru_cache``, so repeated calls return the
    same pipeline object.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.
    """
    import spacy

    model_name = "en_core_web_sm"
    try:
        print("[LOG] Loading spaCy model")
        nlp = spacy.load(model_name)
    except IOError:
        # An IOError from spacy.load is treated as "model not installed":
        # download it, then retry the load once.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download(model_name)
        nlp = spacy.load(model_name)
    print("[LOG] ✅ spaCy model loaded successfully")
    return nlp
|
||||
@lru_cache()
def load_text_classifier():
    """Build a ``text-classification`` pipeline from a pretrained checkpoint.

    The tokenizer and the sequence-classification model are both loaded from
    ``dstefa/roberta-base_topic_classification_nyt_news``. The result is
    memoized, like the other loaders in this module, so the model is only
    instantiated once per process.

    Returns:
        The ``transformers`` pipeline object.
    """
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline

    checkpoint = "dstefa/roberta-base_topic_classification_nyt_news"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
|
||||
@lru_cache()
def load_spacy_model():
    """Load the ``models/reuters`` spaCy model from the home folder.

    If the model folder is missing or empty, the crawl4ai repository is
    cloned (branch ``MODEL_REPO_BRANCH``), its ``models/reuters`` directory
    is copied into the home folder and the clone is removed again. Errors
    during that download are printed, not raised; the final ``spacy.load``
    is attempted either way.

    Returns:
        The object returned by ``spacy.load`` for the model folder.
    """
    import spacy

    name = "models/reuters"
    home_folder = get_home_folder()
    model_folder = os.path.join(home_folder, name)

    # Check if the model directory already exists
    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
        # branch = "main"
        branch = MODEL_REPO_BRANCH
        repo_folder = os.path.join(home_folder, "crawl4ai")

        print("[LOG] ⏬ Downloading model for the first time...")

        # Remove existing repo folder if it exists
        if Path(repo_folder).exists():
            shutil.rmtree(repo_folder)
            # This branch is also reached when the model folder does not
            # exist at all, so a missing folder must not abort the download.
            shutil.rmtree(model_folder, ignore_errors=True)

        try:
            # Clone the repository
            subprocess.run(
                ["git", "clone", "-b", branch, repo_url, repo_folder],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True
            )

            # Create the models directory if it doesn't exist
            models_folder = os.path.join(home_folder, "models")
            os.makedirs(models_folder, exist_ok=True)

            # Copy the reuters model folder to the models directory
            source_folder = os.path.join(repo_folder, "models/reuters")
            shutil.copytree(source_folder, model_folder)

            # Remove the cloned repository
            shutil.rmtree(repo_folder)

            # Print completion message
            print("[LOG] ✅ Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")

    return spacy.load(model_folder)


@lru_cache()
def load_text_multilabel_classifier():
    """Create a multi-label text classifier backed by a pretrained model.

    Loads the tokenizer and sequence-classification model for
    ``cardiffnlp/tweet-topic-21-multi``, moves the model to CUDA, MPS or CPU
    (first one available, in that order) and returns a closure that
    classifies batches of text. The result is memoized, like the other
    loaders in this module.

    Returns:
        A callable ``_classifier(texts, threshold=0.5, max_length=64)``
        returning, for every input text, the list of labels whose score is
        at least ``threshold``.
    """
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    from scipy.special import expit
    import torch

    print("[LOG] ⏬ Downloading model for the first time...")
    MODEL = "cardiffnlp/tweet-topic-21-multi"
    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
    class_mapping = model.config.id2label

    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model.to(device)

    def _classifier(texts, threshold=0.5, max_length=64):
        """Return the labels scoring at least *threshold* for each text."""
        tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tokens to the selected device

        with torch.no_grad():
            output = model(**tokens)

        # expit (the logistic sigmoid) is applied to every logit
        # independently, so several labels can pass the threshold at once.
        scores = output.logits.detach().cpu().numpy()
        scores = expit(scores)
        predictions = (scores >= threshold) * 1

        batch_labels = []
        for prediction in predictions:
            labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
            batch_labels.append(labels)

        return batch_labels

    return _classifier
|
||||
|
||||
@lru_cache()
def load_nltk_punkt():
    """Locate NLTK's Punkt tokenizer data, downloading it when it is missing.

    The result is memoized by ``lru_cache``.

    Returns:
        Whatever ``nltk.data.find('tokenizers/punkt')`` returns for the
        resource.

    Raises:
        LookupError: If the resource still cannot be found after the
            download attempt.
    """
    import nltk

    resource = 'tokenizers/punkt'
    try:
        return nltk.data.find(resource)
    except LookupError:
        # Not found in NLTK's data search path: fetch it, then look again.
        nltk.download('punkt')
    return nltk.data.find(resource)
|
||||
|
||||
def download_all_models(remove_existing=False):
|
||||
"""Download all models required for Crawl4AI."""
|
||||
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
|
||||
load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
print("[LOG] Downloading spaCy EN Core Web SM...")
|
||||
load_spacy_en_core_web_sm()
|
||||
print("[LOG] Downloading custom spaCy model...")
|
||||
load_spacy_model()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
load_text_multilabel_classifier()  # call the loader; the bare name was a no-op and downloaded nothing
|
||||
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||
load_nltk_punkt()
|
||||
print("[LOG] ✅ All models downloaded successfully.")
|
||||
|
||||
def main():
|
||||
|
||||
@@ -3,6 +3,33 @@ from spacy.training import Example
|
||||
import random
|
||||
import nltk
|
||||
from nltk.corpus import reuters
|
||||
import torch
|
||||
|
||||
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    """Export a pipeline's text categorizer weights and vocabulary to disk.

    Writes ``model_weights.pth`` (a dict of tensors saved with ``torch.save``)
    and ``vocab.txt`` (one ``word<TAB>index`` pair per line) into *model_dir*.
    The directory is not created here and must already exist.

    Args:
        nlp: A loaded pipeline containing a ``textcat_multilabel`` component.
        model_dir: Directory receiving the two output files.
    """
    # Extract the TextCategorizer component
    textcat = nlp.get_pipe("textcat_multilabel")

    # Convert the weights to a PyTorch state dictionary
    # NOTE(review): assumes textcat.model exposes named_parameters() yielding
    # (name, param) pairs whose .data is convertible by torch.tensor - confirm.
    state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}

    # Save the state dictionary
    torch.save(state_dict, f"{model_dir}/model_weights.pth")

    # Extract and save the vocabulary
    vocab = extract_vocab(nlp)
    # Explicit UTF-8: vocabulary strings may contain non-ASCII characters,
    # which the platform's default encoding may be unable to write.
    with open(f"{model_dir}/vocab.txt", "w", encoding="utf-8") as vocab_file:
        for word, idx in vocab.items():
            vocab_file.write(f"{word}\t{idx}\n")

    print(f"Model weights and vocabulary saved to: {model_dir}")
|
||||
|
||||
def extract_vocab(nlp):
    """Map every string in ``nlp.vocab.strings`` to its iteration position.

    Args:
        nlp: An object exposing an iterable ``vocab.strings``.

    Returns:
        A dict ``{string: index}``. If a string occurs more than once, the
        last position wins.
    """
    return {word: position for position, word in enumerate(nlp.vocab.strings)}
|
||||
|
||||
nlp = spacy.load("models/reuters")
|
||||
save_spacy_model_as_torch(nlp, model_dir="models")
|
||||
|
||||
def train_and_save_reuters_model(model_dir="models/reuters"):
|
||||
# Ensure the Reuters corpus is downloaded
|
||||
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
|
||||
nlp.to_disk(model_dir)
|
||||
print(f"Model saved to: {model_dir}")
|
||||
|
||||
|
||||
|
||||
def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||
# Load the trained model from the specified directory
|
||||
nlp = spacy.load(model_dir)
|
||||
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||
|
||||
return top_categories
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train_and_save_reuters_model()
|
||||
train_model("models/reuters", additional_epochs=5)
|
||||
@@ -119,4 +143,4 @@ if __name__ == "__main__":
|
||||
print(reuters.categories())
|
||||
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
|
||||
r =load_model_and_predict(model_directory, example_text)
|
||||
print(r)
|
||||
print(r)
|
||||
@@ -11,7 +11,6 @@ from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .config import *
|
||||
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
|
||||
Reference in New Issue
Block a user