commit a5f9d07dbf
parent f85df91ca6
Author: unclecode
Date:   2024-05-17 15:08:03 +08:00

    Remove dependency on Spacy model.

18 changed files with 123 additions and 83955 deletions


@@ -1,12 +1,8 @@
 from abc import ABC, abstractmethod
 import re
-# spacy = lazy_import.lazy_module('spacy')
-# nl = lazy_import.lazy_module('nltk')
-# from nltk.corpus import stopwords
-# from nltk.tokenize import word_tokenize, TextTilingTokenizer
 from collections import Counter
 import string
-from .model_loader import load_spacy_en_core_web_sm
+from .model_loader import load_nltk_punkt
 
 # Define the abstract base class for chunking strategies
 class ChunkingStrategy(ABC):
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
             paragraphs = new_paragraphs
         return paragraphs
 
-# NLP-based sentence chunking using spaCy
+# NLP-based sentence chunking
 class NlpSentenceChunking(ChunkingStrategy):
-    def __init__(self, model='en_core_web_sm'):
-        self.nlp = load_spacy_en_core_web_sm()
+    def __init__(self):
+        load_nltk_punkt()
+        pass
 
     def chunk(self, text: str) -> list:
-        doc = self.nlp(text)
-        return [sent.text.strip() for sent in doc.sents]
+        # Improved regex for sentence splitting
+        # sentence_endings = re.compile(
+        #     r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
+        # )
+        # sentences = sentence_endings.split(text)
+        # sens = [sent.strip() for sent in sentences if sent]
+        from nltk.tokenize import sent_tokenize
+        sentences = sent_tokenize(text)
+        sens = [sent.strip() for sent in sentences]
+
+        return list(set(sens))
 
 # Topic-based segmentation using TextTiling
 class TopicSegmentationChunking(ChunkingStrategy):
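
For orientation, a minimal usage sketch of the reworked chunker; the crawl4ai.chunking_strategy import path and the sample text are illustrative, not part of this diff:

from crawl4ai.chunking_strategy import NlpSentenceChunking

chunker = NlpSentenceChunking()  # first call triggers the punkt download via load_nltk_punkt()
chunks = chunker.chunk("Dr. Smith arrived. He was late! The meeting had already started.")
# punkt handles abbreviations such as "Dr.", so this yields three sentences;
# note that chunk() deduplicates through set(), so the order of the returned
# sentences is not guaranteed.
print(chunks)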


@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
 from .config import *
 from .utils import *
 from functools import partial
-from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
+from .model_loader import *
 import numpy as np
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
         for future in as_completed(futures):
             extracted_content.extend(future.result())
         return extracted_content
 
 class NoExtractionStrategy(ExtractionStrategy):
     def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
         return [{"index": 0, "content": html}]
 
     def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
         return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
 
 class LLMExtractionStrategy(ExtractionStrategy):
     def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
         """
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
         """
         super().__init__()
-        from transformers import BertTokenizer, BertModel, pipeline
-        from transformers import AutoTokenizer, AutoModel
-        import spacy
 
         self.semantic_filter = semantic_filter
         self.word_count_threshold = word_count_threshold
         self.max_dist = max_dist
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
         elif model_name == "BAAI/bge-small-en-v1.5":
             self.tokenizer, self.model = load_bge_small_en_v1_5()
 
-        self.nlp = load_spacy_model()
+        self.nlp = load_text_multilabel_classifier()
         print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
 
     def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
         """
         Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
@@ -310,13 +304,19 @@
         # Convert filtered clusters to a sorted list of dictionaries
         cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
 
-        # Process the text with the loaded model
-        for cluster in cluster_list:
-            doc = self.nlp(cluster['content'])
-            tok_k = self.top_k
-            top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
-            cluster['tags'] = [cat for cat, _ in top_categories]
+        labels = self.nlp([cluster['content'] for cluster in cluster_list])
+        for cluster, label in zip(cluster_list, labels):
+            cluster['tags'] = label
+
+        # for cluster in cluster_list:
+        #     cluster['tags'] = self.nlp(cluster['content'])[0]['label']
+        #     doc = self.nlp(cluster['content'])
+        #     tok_k = self.top_k
+        #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+        #     cluster['tags'] = [cat for cat, _ in top_categories]
 
         # print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
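
The removed loop ranked spaCy's per-document doc.cats scores; the replacement calls the classifier once over the whole batch, and it returns one label list per text. A sketch of the new flow in isolation (contents and labels are illustrative):

from crawl4ai.model_loader import load_text_multilabel_classifier

classifier = load_text_multilabel_classifier()
cluster_list = [
    {"index": 0, "tags": [], "content": "The central bank raised interest rates."},
    {"index": 1, "tags": [], "content": "The team won the championship final."},
]
labels = classifier([c["content"] for c in cluster_list])  # e.g. [["business_&_entrepreneurs"], ["sports"]]
for cluster, label in zip(cluster_list, labels):
    cluster["tags"] = label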


@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
     return tokenizer, model
 
-@lru_cache()
-def load_spacy_en_core_web_sm():
-    import spacy
-    try:
-        print("[LOG] Loading spaCy model")
-        nlp = spacy.load("en_core_web_sm")
-    except IOError:
-        print("[LOG] ⏬ Downloading spaCy model for the first time")
-        spacy.cli.download("en_core_web_sm")
-        nlp = spacy.load("en_core_web_sm")
-    print("[LOG] ✅ spaCy model loaded successfully")
-    return nlp
+def load_text_classifier():
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    from transformers import pipeline
+    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
+    return pipe
 @lru_cache()
-def load_spacy_model():
-    import spacy
-    name = "models/reuters"
-    home_folder = get_home_folder()
-    model_folder = os.path.join(home_folder, name)
-
-    # Check if the model directory already exists
-    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
-        repo_url = "https://github.com/unclecode/crawl4ai.git"
-        # branch = "main"
-        branch = MODEL_REPO_BRANCH
-        repo_folder = os.path.join(home_folder, "crawl4ai")
-        model_folder = os.path.join(home_folder, name)
-
-        # Remove existing repo folder if it exists
-        if Path(repo_folder).exists():
-            shutil.rmtree(repo_folder)
-            shutil.rmtree(model_folder)
-
-        try:
-            # Clone the repository
-            subprocess.run(
-                ["git", "clone", "-b", branch, repo_url, repo_folder],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-                check=True
-            )
-
-            # Create the models directory if it doesn't exist
-            models_folder = os.path.join(home_folder, "models")
-            os.makedirs(models_folder, exist_ok=True)
-
-            # Copy the reuters model folder to the models directory
-            source_folder = os.path.join(repo_folder, "models/reuters")
-            shutil.copytree(source_folder, model_folder)
-
-            # Remove the cloned repository
-            shutil.rmtree(repo_folder)
-
-            # Print completion message
-            print("[LOG] ✅ Model downloaded successfully")
-        except subprocess.CalledProcessError as e:
-            print(f"An error occurred while cloning the repository: {e}")
-        except Exception as e:
-            print(f"An error occurred: {e}")
-
-    return spacy.load(model_folder)
+def load_text_multilabel_classifier():
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer
+    import numpy as np
+    from scipy.special import expit
+    import torch
+
+    print("[LOG] ⏬ Downloading model for the first time...")
+    MODEL = "cardiffnlp/tweet-topic-21-multi"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
+    class_mapping = model.config.id2label
+
+    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
+    model.to(device)
+
+    def _classifier(texts, threshold=0.5, max_length=64):
+        tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
+        tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tokens to the selected device
+        with torch.no_grad():
+            output = model(**tokens)
+        scores = output.logits.detach().cpu().numpy()
+        scores = expit(scores)
+        predictions = (scores >= threshold) * 1
+
+        batch_labels = []
+        for prediction in predictions:
+            labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
+            batch_labels.append(labels)
+        return batch_labels
+    return _classifier
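
Unlike a softmax head that picks a single topic, this multilabel head scores every topic independently: expit (the logistic sigmoid) squashes each logit into a probability, and every topic that clears the threshold is kept. A toy illustration with made-up logits:

import numpy as np
from scipy.special import expit

logits = np.array([[2.0, -1.0, 0.3]])  # one text, three candidate topics
probs = expit(logits)                  # sigmoid -> approx. [[0.88, 0.27, 0.57]]
keep = (probs >= 0.5) * 1              # -> [[1, 0, 1]]: topics 0 and 2 pass the 0.5 threshold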
+@lru_cache()
+def load_nltk_punkt():
+    import nltk
+    try:
+        nltk.data.find('tokenizers/punkt')
+    except LookupError:
+        nltk.download('punkt')
+    return nltk.data.find('tokenizers/punkt')
 
 def download_all_models(remove_existing=False):
     """Download all models required for Crawl4AI."""
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
     load_bert_base_uncased()
     print("[LOG] Downloading BGE Small EN v1.5...")
     load_bge_small_en_v1_5()
-    print("[LOG] Downloading spaCy EN Core Web SM...")
-    load_spacy_en_core_web_sm()
-    print("[LOG] Downloading custom spaCy model...")
-    load_spacy_model()
+    print("[LOG] Downloading text classifier...")
+    load_text_multilabel_classifier()
+    print("[LOG] Downloading custom NLTK Punkt model...")
+    load_nltk_punkt()
     print("[LOG] ✅ All models downloaded successfully.")
 
 def main():
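
A short setup sketch tying the loaders together; the import path is assumed from the package layout in this diff:

from crawl4ai.model_loader import download_all_models, load_text_multilabel_classifier

download_all_models()  # one-time fetch: BERT, BGE, the topic classifier, and NLTK punkt

# @lru_cache memoizes the loader, so repeated calls reuse one instance
# instead of reloading the model:
clf_a = load_text_multilabel_classifier()
clf_b = load_text_multilabel_classifier()
assert clf_a is clf_b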


@@ -3,6 +3,33 @@ from spacy.training import Example
 import random
 import nltk
 from nltk.corpus import reuters
+import torch
+
+def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
+    # Extract the TextCategorizer component
+    textcat = nlp.get_pipe("textcat_multilabel")
+
+    # Convert the weights to a PyTorch state dictionary
+    state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
+
+    # Save the state dictionary
+    torch.save(state_dict, f"{model_dir}/model_weights.pth")
+
+    # Extract and save the vocabulary
+    vocab = extract_vocab(nlp)
+    with open(f"{model_dir}/vocab.txt", "w") as vocab_file:
+        for word, idx in vocab.items():
+            vocab_file.write(f"{word}\t{idx}\n")
+
+    print(f"Model weights and vocabulary saved to: {model_dir}")
+
+def extract_vocab(nlp):
+    # Extract vocabulary from the SpaCy model
+    vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
+    return vocab
+
+nlp = spacy.load("models/reuters")
+save_spacy_model_as_torch(nlp, model_dir="models")
 
 def train_and_save_reuters_model(model_dir="models/reuters"):
     # Ensure the Reuters corpus is downloaded
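
For reference, a sketch of reading back the artifacts this script writes; since it calls save_spacy_model_as_torch(nlp, model_dir="models"), the files land in models/ (paths as in the diff):

import torch

state_dict = torch.load("models/model_weights.pth")  # the exported TextCategorizer weights
vocab = {}
with open("models/vocab.txt") as f:
    for line in f:
        word, idx = line.rstrip("\n").rsplit("\t", 1)  # one "word<TAB>index" pair per line
        vocab[word] = int(idx)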
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
     nlp.to_disk(model_dir)
     print(f"Model saved to: {model_dir}")
 
 def load_model_and_predict(model_dir, text, tok_k = 3):
     # Load the trained model from the specified directory
     nlp = spacy.load(model_dir)
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
     return top_categories
 
 if __name__ == "__main__":
     train_and_save_reuters_model()
     train_model("models/reuters", additional_epochs=5)
@@ -119,4 +143,4 @@ if __name__ == "__main__":
     print(reuters.categories())
     example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
     r = load_model_and_predict(model_directory, example_text)
-    print(r)
\ No newline at end of file
+    print(r)
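
The top-k selection in load_model_and_predict, shown in isolation with made-up scores: doc.cats maps each category to a confidence, and sorting by score then slicing keeps the k best.

cats = {"acq": 0.91, "earn": 0.42, "crude": 0.07}  # illustrative doc.cats output
top_categories = sorted(cats.items(), key=lambda x: x[1], reverse=True)[:2]
# -> [('acq', 0.91), ('earn', 0.42)]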


@@ -11,7 +11,6 @@ from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
 from .config import *
-# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
 
 class WebCrawler: