From e5e6a34e8097cb5ff72c026b19859b5b98246378 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 4 Aug 2024 14:54:18 +0800 Subject: [PATCH] ## [v0.2.77] - 2024-08-04 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Significant improvements in text processing and performance: - 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. - 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. - ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. - 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. --- CHANGELOG.md | 11 ++++++ README.md | 6 ++-- crawl4ai/extraction_strategy.py | 28 +++++++++------ crawl4ai/model_loader.py | 62 ++++++++++++++++++++------------- docs/md/changelog.md | 11 ++++++ docs/md/index.md | 2 +- setup.py | 2 +- 7 files changed, 81 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73f6ef69..d054dd52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [v0.2.77] - 2024-08-04 + +Significant improvements in text processing and performance: + +- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. +- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. +- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. +- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. + +These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. 
+ ## [v0.2.76] - 2024-08-02 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 diff --git a/README.md b/README.md index c9552984..f2975ad7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.76 🕷️🤖 +# Crawl4AI v0.2.77 🕷️🤖 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -8,7 +8,7 @@ Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -#### [v0.2.76] - 2024-08-02 +#### [v0.2.77] - 2024-08-04 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 @@ -50,7 +50,7 @@ Major improvements in functionality, performance, and cross-platform compatibili # Crawl4AI -## 🌟 Shoutout to Contributors of v0.2.76! +## 🌟 Shoutout to Contributors of v0.2.77! 
A big thank you to the amazing contributors who've made this release possible: diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index f889b45c..080229f4 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -9,6 +9,7 @@ from .utils import * from functools import partial from .model_loader import * import math +import numpy as np class ExtractionStrategy(ABC): @@ -248,6 +249,9 @@ class CosineStrategy(ExtractionStrategy): self.get_embedding_method = "direct" self.device = get_device() + import torch + self.device = torch.device('cpu') + self.default_batch_size = calculate_batch_size(self.device) if self.verbose: @@ -260,7 +264,9 @@ class CosineStrategy(ExtractionStrategy): # else: self.tokenizer, self.model = load_bge_small_en_v1_5() + self.model.to(self.device) self.model.eval() + self.get_embedding_method = "batch" self.buffer_embeddings = np.array([]) @@ -282,7 +288,7 @@ class CosineStrategy(ExtractionStrategy): if self.verbose: print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") - self.nlp, self.device = load_text_multilabel_classifier() + self.nlp, _ = load_text_multilabel_classifier() # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 if self.verbose: @@ -453,21 +459,21 @@ class CosineStrategy(ExtractionStrategy): if self.verbose: print(f"[LOG] 🚀 Assign tags using {self.device}") - if self.device.type in ["gpu", "cuda", "mps"]: + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: labels = self.nlp([cluster['content'] for cluster in cluster_list]) for cluster, label in zip(cluster_list, labels): cluster['tags'] = label - elif self.device == "cpu": - # Process the text with the loaded model - texts = [cluster['content'] for cluster in cluster_list] - # Batch process texts - docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = 
[cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) - for doc, cluster in zip(docs, cluster_list): - tok_k = self.top_k - top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] - cluster['tags'] = [cat for cat, _ in top_categories] + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] # for cluster in cluster_list: # doc = self.nlp(cluster['content']) diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b22f0e1..f49a0659 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -6,6 +6,7 @@ import tarfile from .model_loader import * import argparse import urllib.request +from crawl4ai.config import MODEL_REPO_BRANCH __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @lru_cache() @@ -141,14 +142,15 @@ def load_text_multilabel_classifier(): from scipy.special import expit import torch - # Check for available device: CUDA, MPS (for Apple Silicon), or CPU - if torch.cuda.is_available(): - device = torch.device("cuda") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - else: - return load_spacy_model(), torch.device("cpu") - + # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU + # if torch.cuda.is_available(): + # device = torch.device("cuda") + # elif torch.backends.mps.is_available(): + # device = torch.device("mps") + # else: + # device = torch.device("cpu") + # # return load_spacy_model(), torch.device("cpu") + MODEL = "cardiffnlp/tweet-topic-21-multi" tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None) @@ -192,51 +194,61 @@ def load_spacy_model(): import spacy name = "models/reuters" home_folder = get_home_folder() - model_folder = 
os.path.join(home_folder, name) + model_folder = Path(home_folder) / name # Check if the model directory already exists - if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())): + if not (model_folder.exists() and any(model_folder.iterdir())): repo_url = "https://github.com/unclecode/crawl4ai.git" - # branch = "main" branch = MODEL_REPO_BRANCH - repo_folder = os.path.join(home_folder, "crawl4ai") - model_folder = os.path.join(home_folder, name) - - # print("[LOG] ⏬ Downloading Spacy model for the first time...") + repo_folder = Path(home_folder) / "crawl4ai" + + print("[LOG] ⏬ Downloading Spacy model for the first time...") # Remove existing repo folder if it exists - if Path(repo_folder).exists(): - shutil.rmtree(repo_folder) - shutil.rmtree(model_folder) + if repo_folder.exists(): + try: + shutil.rmtree(repo_folder) + if model_folder.exists(): + shutil.rmtree(model_folder) + except PermissionError: + print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:") + print(f"- {repo_folder}") + print(f"- {model_folder}") + return None try: # Clone the repository subprocess.run( - ["git", "clone", "-b", branch, repo_url, repo_folder], + ["git", "clone", "-b", branch, repo_url, str(repo_folder)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True ) # Create the models directory if it doesn't exist - models_folder = os.path.join(home_folder, "models") - os.makedirs(models_folder, exist_ok=True) + models_folder = Path(home_folder) / "models" + models_folder.mkdir(parents=True, exist_ok=True) # Copy the reuters model folder to the models directory - source_folder = os.path.join(repo_folder, "models/reuters") + source_folder = repo_folder / "models" / "reuters" shutil.copytree(source_folder, model_folder) # Remove the cloned repository shutil.rmtree(repo_folder) - # Print completion message - # print("[LOG] ✅ Spacy Model downloaded successfully") + print("[LOG] ✅ Spacy Model downloaded 
successfully") except subprocess.CalledProcessError as e: print(f"An error occurred while cloning the repository: {e}") + return None except Exception as e: print(f"An error occurred: {e}") + return None - return spacy.load(model_folder) + try: + return spacy.load(str(model_folder)) + except Exception as e: + print(f"Error loading spacy model: {e}") + return None def download_all_models(remove_existing=False): """Download all models required for Crawl4AI.""" diff --git a/docs/md/changelog.md b/docs/md/changelog.md index 80e3ce8e..03e490f6 100644 --- a/docs/md/changelog.md +++ b/docs/md/changelog.md @@ -1,5 +1,16 @@ # Changelog +## [v0.2.77] - 2024-08-04 + +Significant improvements in text processing and performance: + +- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. +- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. +- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. +- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. + +These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. + ## [v0.2.76] - 2024-08-02 Major improvements in functionality, performance, and cross-platform compatibility! 🚀 diff --git a/docs/md/index.md b/docs/md/index.md index 8c4abb48..338bb519 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.76 +# Crawl4AI v0.2.77 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. 
diff --git a/setup.py b/setup.py index 5918c721..af929125 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran setup( name="Crawl4AI", - version="0.2.76", + version="0.2.77", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown",