## [v0.2.77] - 2024-08-04

Significant improvements in text processing and performance:

- 🚀 **Dependency reduction**: Removed the dependency on the spaCy model for text chunk labeling in the cosine extraction strategy.
- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks.
- ⚡ **Performance enhancement**: Improved model loading speed by removing the spaCy dependency.
- 🔧 **Future-proofing**: Laid the groundwork for potential complete removal of the spaCy dependency in future versions.

These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI.
2024-08-04 14:54:18 +08:00
parent 897e766728
commit e5e6a34e80
7 changed files with 81 additions and 41 deletions
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -6,6 +6,7 @@ import tarfile
 from .model_loader import *
 import argparse
 import urllib.request
+from crawl4ai.config import MODEL_REPO_BRANCH
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

@lru_cache()
@@ -141,14 +142,15 @@ def load_text_multilabel_classifier():
    from scipy.special import expit
    import torch

-    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")
-    else:
-        return load_spacy_model(), torch.device("cpu")
-
+    # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
+    # if torch.cuda.is_available():
+    #     device = torch.device("cuda")
+    # elif torch.backends.mps.is_available():
+    #     device = torch.device("mps")
+    # else:
+    #     device = torch.device("cpu")
+    #     # return load_spacy_model(), torch.device("cpu")
+    

    MODEL = "cardiffnlp/tweet-topic-21-multi"
    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
@@ -192,51 +194,61 @@ def load_spacy_model():
    import spacy
    name = "models/reuters"
    home_folder = get_home_folder()
-    model_folder = os.path.join(home_folder, name)
+    model_folder = Path(home_folder) / name
    
    # Check if the model directory already exists
-    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
+    if not (model_folder.exists() and any(model_folder.iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
-        # branch = "main"
        branch = MODEL_REPO_BRANCH 
-        repo_folder = os.path.join(home_folder, "crawl4ai")
-        model_folder = os.path.join(home_folder, name)
-
-        # print("[LOG] ⏬ Downloading Spacy model for the first time...")
+        repo_folder = Path(home_folder) / "crawl4ai"
+        
+        print("[LOG] ⏬ Downloading Spacy model for the first time...")

        # Remove existing repo folder if it exists
-        if Path(repo_folder).exists():
-            shutil.rmtree(repo_folder)
-            shutil.rmtree(model_folder)
+        if repo_folder.exists():
+            try:
+                shutil.rmtree(repo_folder)
+                if model_folder.exists():
+                    shutil.rmtree(model_folder)
+            except PermissionError:
+                print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
+                print(f"- {repo_folder}")
+                print(f"- {model_folder}")
+                return None

        try:
            # Clone the repository
            subprocess.run(
-                ["git", "clone", "-b", branch, repo_url, repo_folder],
+                ["git", "clone", "-b", branch, repo_url, str(repo_folder)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True
            )

            # Create the models directory if it doesn't exist
-            models_folder = os.path.join(home_folder, "models")
-            os.makedirs(models_folder, exist_ok=True)
+            models_folder = Path(home_folder) / "models"
+            models_folder.mkdir(parents=True, exist_ok=True)

            # Copy the reuters model folder to the models directory
-            source_folder = os.path.join(repo_folder, "models/reuters")
+            source_folder = repo_folder / "models" / "reuters"
            shutil.copytree(source_folder, model_folder)

            # Remove the cloned repository
            shutil.rmtree(repo_folder)

-            # Print completion message
-            # print("[LOG] ✅ Spacy Model downloaded successfully")
+            print("[LOG] ✅ Spacy Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
+            return None
        except Exception as e:
            print(f"An error occurred: {e}")
+            return None

-    return spacy.load(model_folder)
+    try:
+        return spacy.load(str(model_folder))
+    except Exception as e:
+        print(f"Error loading spacy model: {e}")
+        return None

 def download_all_models(remove_existing=False):
    """Download all models required for Crawl4AI."""