From e5e6a34e8097cb5ff72c026b19859b5b98246378 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 4 Aug 2024 14:54:18 +0800 Subject: [PATCH] ## [v0.2.77] - 2024-08-04 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Significant improvements in text processing and performance: - 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. - 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. - ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. - 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. --- CHANGELOG.md | 11 ++++++ README.md | 6 ++-- crawl4ai/extraction_strategy.py | 28 +++++++++------ crawl4ai/model_loader.py | 62 ++++++++++++++++++++------------- docs/md/changelog.md | 11 ++++++ docs/md/index.md | 2 +- setup.py | 2 +- 7 files changed, 81 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73f6ef69..d054dd52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [v0.2.77] - 2024-08-04 + +Significant improvements in text processing and performance: + +- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. +- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. +- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. +- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. + +These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. 
+ ## [v0.2.76] - 2024-08-02 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 diff --git a/README.md b/README.md index c9552984..f2975ad7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.76 🕷️🤖 +# Crawl4AI v0.2.77 🕷️🤖 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -8,7 +8,7 @@ Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -#### [v0.2.76] - 2024-08-02 +#### [v0.2.77] - 2024-08-04 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 @@ -50,7 +50,7 @@ Major improvements in functionality, performance, and cross-platform compatibili # Crawl4AI -## 🌟 Shoutout to Contributors of v0.2.76! +## 🌟 Shoutout to Contributors of v0.2.77! 
A big thank you to the amazing contributors who've made this release possible: diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index f889b45c..080229f4 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -9,6 +9,7 @@ from .utils import * from functools import partial from .model_loader import * import math +import numpy as np class ExtractionStrategy(ABC): @@ -248,6 +249,9 @@ class CosineStrategy(ExtractionStrategy): self.get_embedding_method = "direct" self.device = get_device() + import torch + self.device = torch.device('cpu') + self.default_batch_size = calculate_batch_size(self.device) if self.verbose: @@ -260,7 +264,9 @@ class CosineStrategy(ExtractionStrategy): # else: self.tokenizer, self.model = load_bge_small_en_v1_5() + self.model.to(self.device) self.model.eval() + self.get_embedding_method = "batch" self.buffer_embeddings = np.array([]) @@ -282,7 +288,7 @@ class CosineStrategy(ExtractionStrategy): if self.verbose: print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") - self.nlp, self.device = load_text_multilabel_classifier() + self.nlp, _ = load_text_multilabel_classifier() # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 if self.verbose: @@ -453,21 +459,21 @@ class CosineStrategy(ExtractionStrategy): if self.verbose: print(f"[LOG] 🚀 Assign tags using {self.device}") - if self.device.type in ["gpu", "cuda", "mps"]: + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: labels = self.nlp([cluster['content'] for cluster in cluster_list]) for cluster, label in zip(cluster_list, labels): cluster['tags'] = label - elif self.device == "cpu": - # Process the text with the loaded model - texts = [cluster['content'] for cluster in cluster_list] - # Batch process texts - docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = 
[cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) - for doc, cluster in zip(docs, cluster_list): - tok_k = self.top_k - top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] - cluster['tags'] = [cat for cat, _ in top_categories] + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] # for cluster in cluster_list: # doc = self.nlp(cluster['content']) diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b22f0e1..f49a0659 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -6,6 +6,7 @@ import tarfile from .model_loader import * import argparse import urllib.request +from crawl4ai.config import MODEL_REPO_BRANCH __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @lru_cache() @@ -141,14 +142,15 @@ def load_text_multilabel_classifier(): from scipy.special import expit import torch - # Check for available device: CUDA, MPS (for Apple Silicon), or CPU - if torch.cuda.is_available(): - device = torch.device("cuda") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - else: - return load_spacy_model(), torch.device("cpu") - + # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU + # if torch.cuda.is_available(): + # device = torch.device("cuda") + # elif torch.backends.mps.is_available(): + # device = torch.device("mps") + # else: + # device = torch.device("cpu") + # # return load_spacy_model(), torch.device("cpu") + MODEL = "cardiffnlp/tweet-topic-21-multi" tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None) @@ -192,51 +194,61 @@ def load_spacy_model(): import spacy name = "models/reuters" home_folder = get_home_folder() - model_folder = 
os.path.join(home_folder, name) + model_folder = Path(home_folder) / name # Check if the model directory already exists - if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())): + if not (model_folder.exists() and any(model_folder.iterdir())): repo_url = "https://github.com/unclecode/crawl4ai.git" - # branch = "main" branch = MODEL_REPO_BRANCH - repo_folder = os.path.join(home_folder, "crawl4ai") - model_folder = os.path.join(home_folder, name) - - # print("[LOG] ⏬ Downloading Spacy model for the first time...") + repo_folder = Path(home_folder) / "crawl4ai" + + print("[LOG] ⏬ Downloading Spacy model for the first time...") # Remove existing repo folder if it exists - if Path(repo_folder).exists(): - shutil.rmtree(repo_folder) - shutil.rmtree(model_folder) + if repo_folder.exists(): + try: + shutil.rmtree(repo_folder) + if model_folder.exists(): + shutil.rmtree(model_folder) + except PermissionError: + print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:") + print(f"- {repo_folder}") + print(f"- {model_folder}") + return None try: # Clone the repository subprocess.run( - ["git", "clone", "-b", branch, repo_url, repo_folder], + ["git", "clone", "-b", branch, repo_url, str(repo_folder)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True ) # Create the models directory if it doesn't exist - models_folder = os.path.join(home_folder, "models") - os.makedirs(models_folder, exist_ok=True) + models_folder = Path(home_folder) / "models" + models_folder.mkdir(parents=True, exist_ok=True) # Copy the reuters model folder to the models directory - source_folder = os.path.join(repo_folder, "models/reuters") + source_folder = repo_folder / "models" / "reuters" shutil.copytree(source_folder, model_folder) # Remove the cloned repository shutil.rmtree(repo_folder) - # Print completion message - # print("[LOG] ✅ Spacy Model downloaded successfully") + print("[LOG] ✅ Spacy Model downloaded 
successfully") except subprocess.CalledProcessError as e: print(f"An error occurred while cloning the repository: {e}") + return None except Exception as e: print(f"An error occurred: {e}") + return None - return spacy.load(model_folder) + try: + return spacy.load(str(model_folder)) + except Exception as e: + print(f"Error loading spacy model: {e}") + return None def download_all_models(remove_existing=False): """Download all models required for Crawl4AI.""" diff --git a/docs/md/changelog.md b/docs/md/changelog.md index 80e3ce8e..03e490f6 100644 --- a/docs/md/changelog.md +++ b/docs/md/changelog.md @@ -1,5 +1,16 @@ # Changelog +## [v0.2.77] - 2024-08-04 + +Significant improvements in text processing and performance: + +- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. +- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. +- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. +- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. + +These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. + ## [v0.2.76] - 2024-08-02 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 diff --git a/docs/md/index.md b/docs/md/index.md index 8c4abb48..338bb519 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.76 +# Crawl4AI v0.2.77 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. 
diff --git a/setup.py b/setup.py index 5918c721..af929125 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran setup( name="Crawl4AI", - version="0.2.76", + version="0.2.77", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown",