This commit is contained in:
Unclecode
2024-08-04 06:57:16 +00:00
7 changed files with 81 additions and 41 deletions

View File

@@ -1,5 +1,16 @@
# Changelog
## [v0.2.77] - 2024-08-04
Significant improvements in text processing and performance:
- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy.
- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks.
- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency.
- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions.
These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI.
## [v0.2.76] - 2024-08-02
Major improvements in functionality, performance, and cross-platform compatibility! 🚀

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.76 🕷️🤖
# Crawl4AI v0.2.77 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -8,7 +8,7 @@
Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
#### [v0.2.76] - 2024-08-02
#### [v0.2.77] - 2024-08-04
Major improvements in functionality, performance, and cross-platform compatibility! 🚀
@@ -50,7 +50,7 @@ Major improvements in functionality, performance, and cross-platform compatibili
# Crawl4AI
## 🌟 Shoutout to Contributors of v0.2.76!
## 🌟 Shoutout to Contributors of v0.2.77!
A big thank you to the amazing contributors who've made this release possible:

View File

@@ -9,6 +9,7 @@ from .utils import *
from functools import partial
from .model_loader import *
import math
import numpy as np
class ExtractionStrategy(ABC):
@@ -248,6 +249,9 @@ class CosineStrategy(ExtractionStrategy):
self.get_embedding_method = "direct"
self.device = get_device()
import torch
self.device = torch.device('cpu')
self.default_batch_size = calculate_batch_size(self.device)
if self.verbose:
@@ -260,7 +264,9 @@ class CosineStrategy(ExtractionStrategy):
# else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.to(self.device)
self.model.eval()
self.get_embedding_method = "batch"
self.buffer_embeddings = np.array([])
@@ -282,7 +288,7 @@ class CosineStrategy(ExtractionStrategy):
if self.verbose:
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
self.nlp, self.device = load_text_multilabel_classifier()
self.nlp, _ = load_text_multilabel_classifier()
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
if self.verbose:
@@ -453,21 +459,21 @@ class CosineStrategy(ExtractionStrategy):
if self.verbose:
print(f"[LOG] 🚀 Assign tags using {self.device}")
if self.device.type in ["gpu", "cuda", "mps"]:
if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
labels = self.nlp([cluster['content'] for cluster in cluster_list])
for cluster, label in zip(cluster_list, labels):
cluster['tags'] = label
elif self.device == "cpu":
# Process the text with the loaded model
texts = [cluster['content'] for cluster in cluster_list]
# Batch process texts
docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
# elif self.device.type == "cpu":
# # Process the text with the loaded model
# texts = [cluster['content'] for cluster in cluster_list]
# # Batch process texts
# docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
for doc, cluster in zip(docs, cluster_list):
tok_k = self.top_k
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
cluster['tags'] = [cat for cat, _ in top_categories]
# for doc, cluster in zip(docs, cluster_list):
# tok_k = self.top_k
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
# cluster['tags'] = [cat for cat, _ in top_categories]
# for cluster in cluster_list:
# doc = self.nlp(cluster['content'])

View File

@@ -6,6 +6,7 @@ import tarfile
from .model_loader import *
import argparse
import urllib.request
from crawl4ai.config import MODEL_REPO_BRANCH
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@lru_cache()
@@ -141,14 +142,15 @@ def load_text_multilabel_classifier():
from scipy.special import expit
import torch
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
return load_spacy_model(), torch.device("cpu")
# # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
# if torch.cuda.is_available():
# device = torch.device("cuda")
# elif torch.backends.mps.is_available():
# device = torch.device("mps")
# else:
# device = torch.device("cpu")
# # return load_spacy_model(), torch.device("cpu")
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
@@ -192,51 +194,61 @@ def load_spacy_model():
import spacy
name = "models/reuters"
home_folder = get_home_folder()
model_folder = os.path.join(home_folder, name)
model_folder = Path(home_folder) / name
# Check if the model directory already exists
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
if not (model_folder.exists() and any(model_folder.iterdir())):
repo_url = "https://github.com/unclecode/crawl4ai.git"
# branch = "main"
branch = MODEL_REPO_BRANCH
repo_folder = os.path.join(home_folder, "crawl4ai")
model_folder = os.path.join(home_folder, name)
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
repo_folder = Path(home_folder) / "crawl4ai"
print("[LOG] ⏬ Downloading Spacy model for the first time...")
# Remove existing repo folder if it exists
if Path(repo_folder).exists():
shutil.rmtree(repo_folder)
shutil.rmtree(model_folder)
if repo_folder.exists():
try:
shutil.rmtree(repo_folder)
if model_folder.exists():
shutil.rmtree(model_folder)
except PermissionError:
print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
print(f"- {repo_folder}")
print(f"- {model_folder}")
return None
try:
# Clone the repository
subprocess.run(
["git", "clone", "-b", branch, repo_url, repo_folder],
["git", "clone", "-b", branch, repo_url, str(repo_folder)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True
)
# Create the models directory if it doesn't exist
models_folder = os.path.join(home_folder, "models")
os.makedirs(models_folder, exist_ok=True)
models_folder = Path(home_folder) / "models"
models_folder.mkdir(parents=True, exist_ok=True)
# Copy the reuters model folder to the models directory
source_folder = os.path.join(repo_folder, "models/reuters")
source_folder = repo_folder / "models" / "reuters"
shutil.copytree(source_folder, model_folder)
# Remove the cloned repository
shutil.rmtree(repo_folder)
# Print completion message
# print("[LOG] ✅ Spacy Model downloaded successfully")
print("[LOG] ✅ Spacy Model downloaded successfully")
except subprocess.CalledProcessError as e:
print(f"An error occurred while cloning the repository: {e}")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None
return spacy.load(model_folder)
try:
return spacy.load(str(model_folder))
except Exception as e:
print(f"Error loading spacy model: {e}")
return None
def download_all_models(remove_existing=False):
"""Download all models required for Crawl4AI."""

View File

@@ -1,5 +1,16 @@
# Changelog
## [v0.2.77] - 2024-08-04
Significant improvements in text processing and performance:
- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy.
- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks.
- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency.
- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions.
These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI.
## [v0.2.76] - 2024-08-02
Major improvements in functionality, performance, and cross-platform compatibility! 🚀

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.76
# Crawl4AI v0.2.77
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

View File

@@ -25,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
setup(
name="Crawl4AI",
version="0.2.76",
version="0.2.77",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",