Update model loader to support GPU, MPS, and CPU

2024-05-17 21:39:22 +08:00
parent ce052a4eb5
commit 33fddc27ad
1 changed files with 64 additions and 9 deletions
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -45,18 +45,21 @@ def load_text_multilabel_classifier():
    from scipy.special import expit
    import torch

-    MODEL = "cardiffnlp/tweet-topic-21-multi"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
-    class_mapping = model.config.id2label
-
    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
-        device = torch.device("cpu")
+        return load_spacy_model()
+        # device = torch.device("cpu")
+
+
+    MODEL = "cardiffnlp/tweet-topic-21-multi"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
+    class_mapping = model.config.id2label
+

    model.to(device)

@@ -78,7 +81,7 @@ def load_text_multilabel_classifier():

        return batch_labels

-    return _classifier
+    return _classifier, "gpu"

@lru_cache()
 def load_nltk_punkt():
@@ -89,6 +92,58 @@ def load_nltk_punkt():
        nltk.download('punkt')
    return nltk.data.find('tokenizers/punkt')

+
+@lru_cache()
+def load_spacy_model():
+    import spacy
+    name = "models/reuters"
+    home_folder = get_home_folder()
+    model_folder = os.path.join(home_folder, name)
+    
+    # Check if the model directory already exists
+    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
+        repo_url = "https://github.com/unclecode/crawl4ai.git"
+        # branch = "main"
+        branch = MODEL_REPO_BRANCH 
+        repo_folder = os.path.join(home_folder, "crawl4ai")
+        model_folder = os.path.join(home_folder, name)
+
+        print("[LOG] ⏬ Downloading model for the first time...")
+
+        # Remove existing repo folder if it exists
+        if Path(repo_folder).exists():
+            shutil.rmtree(repo_folder)
+            shutil.rmtree(model_folder)
+
+        try:
+            # Clone the repository
+            subprocess.run(
+                ["git", "clone", "-b", branch, repo_url, repo_folder],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True
+            )
+
+            # Create the models directory if it doesn't exist
+            models_folder = os.path.join(home_folder, "models")
+            os.makedirs(models_folder, exist_ok=True)
+
+            # Copy the reuters model folder to the models directory
+            source_folder = os.path.join(repo_folder, "models/reuters")
+            shutil.copytree(source_folder, model_folder)
+
+            # Remove the cloned repository
+            shutil.rmtree(repo_folder)
+
+            # Print completion message
+            print("[LOG] ✅ Model downloaded successfully")
+        except subprocess.CalledProcessError as e:
+            print(f"An error occurred while cloning the repository: {e}")
+        except Exception as e:
+            print(f"An error occurred: {e}")
+
+    return spacy.load(model_folder), "cpu"
+
 def download_all_models(remove_existing=False):
    """Download all models required for Crawl4AI."""
    if remove_existing:
@@ -109,7 +164,7 @@ def download_all_models(remove_existing=False):
    print("[LOG] Downloading BGE Small EN v1.5...")
    load_bge_small_en_v1_5()
    print("[LOG] Downloading text classifier...")
-    load_text_multilabel_classifier
+    load_text_multilabel_classifier()
    print("[LOG] Downloading custom NLTK Punkt model...")
    load_nltk_punkt()
    print("[LOG] ✅ All models downloaded successfully.")
@@ -124,4 +179,4 @@ def main():
    download_all_models(remove_existing=args.remove_existing)

 if __name__ == "__main__":
-    main()
+    main()