Remove dependency on spaCy model.

This commit is contained in:
unclecode
2024-05-17 15:08:03 +08:00
parent f85df91ca6
commit a5f9d07dbf
18 changed files with 123 additions and 83955 deletions

View File

@@ -1,12 +1,8 @@
from abc import ABC, abstractmethod
import re
# spacy = lazy_import.lazy_module('spacy')
# nl = lazy_import.lazy_module('nltk')
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
from collections import Counter
import string
from .model_loader import load_spacy_en_core_web_sm
from .model_loader import load_nltk_punkt
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
paragraphs = new_paragraphs
return paragraphs
# NLP-based sentence chunking using spaCy
# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
def __init__(self, model='en_core_web_sm'):
self.nlp = load_spacy_en_core_web_sm()
def __init__(self):
load_nltk_punkt()
pass
def chunk(self, text: str) -> list:
doc = self.nlp(text)
return [sent.text.strip() for sent in doc.sents]
# Improved regex for sentence splitting
# sentence_endings = re.compile(
# r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
# )
# sentences = sentence_endings.split(text)
# sens = [sent.strip() for sent in sentences if sent]
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)
sens = [sent.strip() for sent in sentences]
return list(set(sens))
# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):

View File

@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
from .config import *
from .utils import *
from functools import partial
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
from .model_loader import *
import numpy as np
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
for future in as_completed(futures):
extracted_content.extend(future.result())
return extracted_content
class NoExtractionStrategy(ExtractionStrategy):
    """Pass-through strategy: returns content unchanged without any extraction."""

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Wrap the raw HTML in a single result entry."""
        return [{"index": 0, "content": html}]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Emit one untagged entry per section, preserving section order."""
        results = []
        for position, section in enumerate(sections):
            results.append({"index": position, "tags": [], "content": section})
        return results
class LLMExtractionStrategy(ExtractionStrategy):
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
"""
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
"""
super().__init__()
from transformers import BertTokenizer, BertModel, pipeline
from transformers import AutoTokenizer, AutoModel
import spacy
self.semantic_filter = semantic_filter
self.word_count_threshold = word_count_threshold
self.max_dist = max_dist
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
elif model_name == "BAAI/bge-small-en-v1.5":
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.nlp = load_spacy_model()
self.nlp = load_text_multilabel_classifier()
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
"""
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
# Convert filtered clusters to a sorted list of dictionaries
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
labels = self.nlp([cluster['content'] for cluster in cluster_list])
for cluster, label in zip(cluster_list, labels):
cluster['tags'] = label
# Process the text with the loaded model
for cluster in cluster_list:
doc = self.nlp(cluster['content'])
tok_k = self.top_k
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
cluster['tags'] = [cat for cat, _ in top_categories]
# for cluster in cluster_list:
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
# doc = self.nlp(cluster['content'])
# tok_k = self.top_k
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
# cluster['tags'] = [cat for cat, _ in top_categories]
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

View File

@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
return tokenizer, model
@lru_cache()
def load_spacy_en_core_web_sm():
    """Load (downloading on first use) the spaCy ``en_core_web_sm`` pipeline.

    Cached per process via ``lru_cache`` so the model is only loaded once.
    """
    import spacy

    print("[LOG] Loading spaCy model")
    try:
        pipeline = spacy.load("en_core_web_sm")
    except IOError:
        # Model package is not installed yet — fetch it, then retry the load.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download("en_core_web_sm")
        pipeline = spacy.load("en_core_web_sm")
    print("[LOG] ✅ spaCy model loaded successfully")
    return pipeline
@lru_cache()
def load_text_classifier():
    """Load the NYT-news topic-classification pipeline (roberta-base).

    Returns a Hugging Face ``text-classification`` pipeline.

    Fix: decorated with ``lru_cache`` for consistency with every other loader
    in this module (``load_bge_small_en_v1_5``, ``load_nltk_punkt``, ...), so
    repeated calls reuse the already-constructed pipeline instead of
    re-downloading/re-instantiating the model each time.
    """
    # Local imports keep transformers optional until this loader is used.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline

    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    return pipe
@lru_cache()
def load_spacy_model():
import spacy
name = "models/reuters"
home_folder = get_home_folder()
model_folder = os.path.join(home_folder, name)
# Check if the model directory already exists
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
repo_url = "https://github.com/unclecode/crawl4ai.git"
# branch = "main"
branch = MODEL_REPO_BRANCH
repo_folder = os.path.join(home_folder, "crawl4ai")
model_folder = os.path.join(home_folder, name)
def load_text_multilabel_classifier():
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import expit
import torch
print("[LOG] ⏬ Downloading model for the first time...")
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
class_mapping = model.config.id2label
# Remove existing repo folder if it exists
if Path(repo_folder).exists():
shutil.rmtree(repo_folder)
shutil.rmtree(model_folder)
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
try:
# Clone the repository
subprocess.run(
["git", "clone", "-b", branch, repo_url, repo_folder],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True
)
model.to(device)
# Create the models directory if it doesn't exist
models_folder = os.path.join(home_folder, "models")
os.makedirs(models_folder, exist_ok=True)
def _classifier(texts, threshold=0.5, max_length=64):
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
# Copy the reuters model folder to the models directory
source_folder = os.path.join(repo_folder, "models/reuters")
shutil.copytree(source_folder, model_folder)
with torch.no_grad():
output = model(**tokens)
# Remove the cloned repository
shutil.rmtree(repo_folder)
scores = output.logits.detach().cpu().numpy()
scores = expit(scores)
predictions = (scores >= threshold) * 1
# Print completion message
print("[LOG] ✅ Model downloaded successfully")
except subprocess.CalledProcessError as e:
print(f"An error occurred while cloning the repository: {e}")
except Exception as e:
print(f"An error occurred: {e}")
batch_labels = []
for prediction in predictions:
labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
batch_labels.append(labels)
return spacy.load(model_folder)
return batch_labels
return _classifier
@lru_cache()
def load_nltk_punkt():
    """Ensure the NLTK ``punkt`` tokenizer data is available and return its path.

    Downloads the data on first use; result is cached per process.
    """
    import nltk

    locate = nltk.data.find
    try:
        locate('tokenizers/punkt')
    except LookupError:
        # Data is not on disk yet — fetch it once, then fall through to lookup.
        nltk.download('punkt')
    return locate('tokenizers/punkt')
def download_all_models(remove_existing=False):
"""Download all models required for Crawl4AI."""
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
load_bert_base_uncased()
print("[LOG] Downloading BGE Small EN v1.5...")
load_bge_small_en_v1_5()
print("[LOG] Downloading spaCy EN Core Web SM...")
load_spacy_en_core_web_sm()
print("[LOG] Downloading custom spaCy model...")
load_spacy_model()
print("[LOG] Downloading text classifier...")
load_text_multilabel_classifier
print("[LOG] Downloading custom NLTK Punkt model...")
load_nltk_punkt()
print("[LOG] ✅ All models downloaded successfully.")
def main():

View File

@@ -3,6 +3,33 @@ from spacy.training import Example
import random
import nltk
from nltk.corpus import reuters
import torch
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    """Persist the textcat_multilabel weights and vocab of *nlp* under *model_dir*.

    Writes ``model_weights.pth`` (a PyTorch state dict) and ``vocab.txt``
    (one ``word<TAB>index`` pair per line).
    """
    # Grab the multilabel text-categorizer pipe whose weights we persist.
    categorizer = nlp.get_pipe("textcat_multilabel")

    # Re-materialize each parameter as a torch tensor and save as a state dict.
    weights = {}
    for name, param in categorizer.model.named_parameters():
        weights[name] = torch.tensor(param.data)
    torch.save(weights, f"{model_dir}/model_weights.pth")

    # Persist the vocabulary as tab-separated word/index lines.
    vocabulary = extract_vocab(nlp)
    with open(f"{model_dir}/vocab.txt", "w") as out:
        out.writelines(f"{word}\t{idx}\n" for word, idx in vocabulary.items())

    print(f"Model weights and vocabulary saved to: {model_dir}")
def extract_vocab(nlp):
    """Return a ``word -> index`` mapping over the model's string store."""
    mapping = {}
    for index, word in enumerate(nlp.vocab.strings):
        mapping[word] = index
    return mapping
# NOTE(review): these two statements execute at import time — every import of
# this module loads "models/reuters" from disk and rewrites the torch weights
# under "models". Consider moving them under the `if __name__ == "__main__":`
# guard so importing the module stays side-effect free.
nlp = spacy.load("models/reuters")
save_spacy_model_as_torch(nlp, model_dir="models")
def train_and_save_reuters_model(model_dir="models/reuters"):
# Ensure the Reuters corpus is downloaded
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
nlp.to_disk(model_dir)
print(f"Model saved to: {model_dir}")
def load_model_and_predict(model_dir, text, tok_k = 3):
# Load the trained model from the specified directory
nlp = spacy.load(model_dir)
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
return top_categories
if __name__ == "__main__":
train_and_save_reuters_model()
train_model("models/reuters", additional_epochs=5)
@@ -119,4 +143,4 @@ if __name__ == "__main__":
print(reuters.categories())
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
r =load_model_and_predict(model_directory, example_text)
print(r)
print(r)

View File

@@ -11,7 +11,6 @@ from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
class WebCrawler:

View File

@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
print_result(result)
# Retrieve raw HTML content
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
print_result(result)
def add_chunking_strategy(crawler):
# Adding a chunking strategy: RegexChunking
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
@@ -177,9 +171,19 @@ def main():
crawler = create_crawler()
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
crawler.always_by_pass_cache = True
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
result = crawler.run(
url="https://www.nbcnews.com/business",
chunking_strategy=NlpSentenceChunking()
)
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
print_result(result)
basic_usage(crawler)
understanding_parameters(crawler)
add_chunking_strategy(crawler)

View File

@@ -1,144 +0,0 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "en"
pipeline = ["textcat_multilabel"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}
[components]
[components.textcat_multilabel]
factory = "textcat_multilabel"
scorer = {"@scorers":"spacy.textcat_multilabel_scorer.v2"}
threshold = 0.5
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000,2000,500,1000,500]
attrs = ["NORM","LOWER","PREFIX","SUFFIX","SHAPE"]
include_static_vectors = false
[components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]

View File

@@ -1,122 +0,0 @@
{
"lang":"en",
"name":"pipeline",
"version":"0.0.0",
"spacy_version":">=3.7.4,<3.8.0",
"description":"",
"author":"",
"email":"",
"url":"",
"license":"",
"spacy_git_version":"bff8725f4",
"vectors":{
"width":0,
"vectors":0,
"keys":0,
"name":null,
"mode":"default"
},
"labels":{
"textcat_multilabel":[
"acq",
"alum",
"barley",
"bop",
"carcass",
"castor-oil",
"cocoa",
"coconut",
"coconut-oil",
"coffee",
"copper",
"copra-cake",
"corn",
"cotton",
"cotton-oil",
"cpi",
"cpu",
"crude",
"dfl",
"dlr",
"dmk",
"earn",
"fuel",
"gas",
"gnp",
"gold",
"grain",
"groundnut",
"groundnut-oil",
"heat",
"hog",
"housing",
"income",
"instal-debt",
"interest",
"ipi",
"iron-steel",
"jet",
"jobs",
"l-cattle",
"lead",
"lei",
"lin-oil",
"livestock",
"lumber",
"meal-feed",
"money-fx",
"money-supply",
"naphtha",
"nat-gas",
"nickel",
"nkr",
"nzdlr",
"oat",
"oilseed",
"orange",
"palladium",
"palm-oil",
"palmkernel",
"pet-chem",
"platinum",
"potato",
"propane",
"rand",
"rape-oil",
"rapeseed",
"reserves",
"retail",
"rice",
"rubber",
"rye",
"ship",
"silver",
"sorghum",
"soy-meal",
"soy-oil",
"soybean",
"strategic-metal",
"sugar",
"sun-meal",
"sun-oil",
"sunseed",
"tea",
"tin",
"trade",
"veg-oil",
"wheat",
"wpi",
"yen",
"zinc"
]
},
"pipeline":[
"textcat_multilabel"
],
"components":[
"textcat_multilabel"
],
"disabled":[
]
}

View File

@@ -1,95 +0,0 @@
{
"labels":[
"acq",
"alum",
"barley",
"bop",
"carcass",
"castor-oil",
"cocoa",
"coconut",
"coconut-oil",
"coffee",
"copper",
"copra-cake",
"corn",
"cotton",
"cotton-oil",
"cpi",
"cpu",
"crude",
"dfl",
"dlr",
"dmk",
"earn",
"fuel",
"gas",
"gnp",
"gold",
"grain",
"groundnut",
"groundnut-oil",
"heat",
"hog",
"housing",
"income",
"instal-debt",
"interest",
"ipi",
"iron-steel",
"jet",
"jobs",
"l-cattle",
"lead",
"lei",
"lin-oil",
"livestock",
"lumber",
"meal-feed",
"money-fx",
"money-supply",
"naphtha",
"nat-gas",
"nickel",
"nkr",
"nzdlr",
"oat",
"oilseed",
"orange",
"palladium",
"palm-oil",
"palmkernel",
"pet-chem",
"platinum",
"potato",
"propane",
"rand",
"rape-oil",
"rapeseed",
"reserves",
"retail",
"rice",
"rubber",
"rye",
"ship",
"silver",
"sorghum",
"soy-meal",
"soy-oil",
"soybean",
"strategic-metal",
"sugar",
"sun-meal",
"sun-oil",
"sunseed",
"tea",
"tin",
"trade",
"veg-oil",
"wheat",
"wpi",
"yen",
"zinc"
],
"threshold":0.5
}

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
<EFBFBD>

View File

@@ -1 +0,0 @@
<EFBFBD>

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,3 +0,0 @@
{
"mode":"default"
}

View File

@@ -14,7 +14,6 @@ requests==2.31.0
rich==13.7.1
scikit-learn==1.4.2
selenium==4.20.0
spacy==3.7.4
uvicorn==0.29.0
transformers==4.40.2
chromedriver-autoinstaller==0.6.4

View File

@@ -1,35 +0,0 @@
import os
# NOTE(review): this script targets an IPython/Colab environment — the
# leading-"!" lines are IPython shell magics and are NOT valid plain Python;
# it will only run inside a notebook cell.
def install_crawl4ai():
    """Install Crawl4AI with its system dependencies and fetch the reuters model.

    Intended for a fresh Colab session: installs spaCy with CUDA 12 support,
    Chromium + chromedriver for Selenium, the Crawl4AI package from the
    ``new-release-0.0.2`` branch, then clones the repo to copy its
    ``models/reuters`` folder into a local ``models`` directory.
    """
    print("Installing Crawl4AI and its dependencies...")
    # Install dependencies
    !pip install -U 'spacy[cuda12x]'
    !apt-get update -y
    !apt install chromium-chromedriver -y
    !pip install chromedriver_autoinstaller
    !pip install git+https://github.com/unclecode/crawl4ai.git@new-release-0.0.2
    # Install ChromeDriver matching the installed Chromium version
    import chromedriver_autoinstaller
    chromedriver_autoinstaller.install()
    # Download the reuters model by cloning the release branch of the repo
    repo_url = "https://github.com/unclecode/crawl4ai.git"
    branch = "new-release-0.0.2"
    folder_path = "models/reuters"
    !git clone -b {branch} {repo_url}
    !mkdir -p models
    repo_folder = "crawl4ai"
    source_folder = os.path.join(repo_folder, folder_path)
    destination_folder = "models"
    # Move the model folder out of the clone, then discard the clone
    !mv "{source_folder}" "{destination_folder}"
    !rm -rf "{repo_folder}"
    print("Installation and model download completed successfully!")
# Run the installer
install_crawl4ai()