Remove dependency on spaCy model.

This commit is contained in:
unclecode
2024-05-17 15:08:03 +08:00
parent f85df91ca6
commit a5f9d07dbf
18 changed files with 123 additions and 83955 deletions

View File

@@ -1,12 +1,8 @@
from abc import ABC, abstractmethod
import re
# spacy = lazy_import.lazy_module('spacy')
# nl = lazy_import.lazy_module('nltk')
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
from collections import Counter
import string
from .model_loader import load_spacy_en_core_web_sm
from .model_loader import load_nltk_punkt
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
paragraphs = new_paragraphs
return paragraphs
# NLP-based sentence chunking using spaCy
# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
def __init__(self, model='en_core_web_sm'):
self.nlp = load_spacy_en_core_web_sm()
def __init__(self):
load_nltk_punkt()
pass
def chunk(self, text: str) -> list:
doc = self.nlp(text)
return [sent.text.strip() for sent in doc.sents]
# Improved regex for sentence splitting
# sentence_endings = re.compile(
# r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
# )
# sentences = sentence_endings.split(text)
# sens = [sent.strip() for sent in sentences if sent]
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)
sens = [sent.strip() for sent in sentences]
return list(set(sens))
# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):

View File

@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
from .config import *
from .utils import *
from functools import partial
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
from .model_loader import *
import numpy as np
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
for future in as_completed(futures):
extracted_content.extend(future.result())
return extracted_content
class NoExtractionStrategy(ExtractionStrategy):
    """Pass-through strategy: returns content unchanged without any extraction."""

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Wrap the raw HTML in a single result entry."""
        return [{"index": 0, "content": html}]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Emit one untagged entry per section, preserving section order."""
        results = []
        for position, section in enumerate(sections):
            results.append({"index": position, "tags": [], "content": section})
        return results
class LLMExtractionStrategy(ExtractionStrategy):
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
"""
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
"""
super().__init__()
from transformers import BertTokenizer, BertModel, pipeline
from transformers import AutoTokenizer, AutoModel
import spacy
self.semantic_filter = semantic_filter
self.word_count_threshold = word_count_threshold
self.max_dist = max_dist
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
elif model_name == "BAAI/bge-small-en-v1.5":
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.nlp = load_spacy_model()
self.nlp = load_text_multilabel_classifier()
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
"""
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
# Convert filtered clusters to a sorted list of dictionaries
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
labels = self.nlp([cluster['content'] for cluster in cluster_list])
for cluster, label in zip(cluster_list, labels):
cluster['tags'] = label
# Process the text with the loaded model
for cluster in cluster_list:
doc = self.nlp(cluster['content'])
tok_k = self.top_k
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
cluster['tags'] = [cat for cat, _ in top_categories]
# for cluster in cluster_list:
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
# doc = self.nlp(cluster['content'])
# tok_k = self.top_k
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
# cluster['tags'] = [cat for cat, _ in top_categories]
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

View File

@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
return tokenizer, model
@lru_cache()
def load_spacy_en_core_web_sm():
    """Load (downloading on first use) the spaCy ``en_core_web_sm`` pipeline.

    Cached per process via ``lru_cache`` so the model is only loaded once.
    """
    import spacy

    print("[LOG] Loading spaCy model")
    try:
        pipeline = spacy.load("en_core_web_sm")
    except IOError:
        # Model package is not installed yet — fetch it, then retry the load.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download("en_core_web_sm")
        pipeline = spacy.load("en_core_web_sm")
    print("[LOG] ✅ spaCy model loaded successfully")
    return pipeline
@lru_cache()
def load_text_classifier():
    """Load the NYT-news topic-classification pipeline (roberta-base).

    Returns a Hugging Face ``text-classification`` pipeline.

    Fix: decorated with ``lru_cache`` for consistency with every other loader
    in this module (``load_bge_small_en_v1_5``, ``load_nltk_punkt``, ...), so
    repeated calls reuse the already-constructed pipeline instead of
    re-downloading/re-instantiating the model each time.
    """
    # Local imports keep transformers optional until this loader is used.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline

    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    return pipe
@lru_cache()
def load_spacy_model():
import spacy
name = "models/reuters"
home_folder = get_home_folder()
model_folder = os.path.join(home_folder, name)
# Check if the model directory already exists
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
repo_url = "https://github.com/unclecode/crawl4ai.git"
# branch = "main"
branch = MODEL_REPO_BRANCH
repo_folder = os.path.join(home_folder, "crawl4ai")
model_folder = os.path.join(home_folder, name)
def load_text_multilabel_classifier():
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import expit
import torch
print("[LOG] ⏬ Downloading model for the first time...")
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
class_mapping = model.config.id2label
# Remove existing repo folder if it exists
if Path(repo_folder).exists():
shutil.rmtree(repo_folder)
shutil.rmtree(model_folder)
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
try:
# Clone the repository
subprocess.run(
["git", "clone", "-b", branch, repo_url, repo_folder],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True
)
model.to(device)
# Create the models directory if it doesn't exist
models_folder = os.path.join(home_folder, "models")
os.makedirs(models_folder, exist_ok=True)
def _classifier(texts, threshold=0.5, max_length=64):
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
# Copy the reuters model folder to the models directory
source_folder = os.path.join(repo_folder, "models/reuters")
shutil.copytree(source_folder, model_folder)
with torch.no_grad():
output = model(**tokens)
# Remove the cloned repository
shutil.rmtree(repo_folder)
scores = output.logits.detach().cpu().numpy()
scores = expit(scores)
predictions = (scores >= threshold) * 1
# Print completion message
print("[LOG] ✅ Model downloaded successfully")
except subprocess.CalledProcessError as e:
print(f"An error occurred while cloning the repository: {e}")
except Exception as e:
print(f"An error occurred: {e}")
batch_labels = []
for prediction in predictions:
labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
batch_labels.append(labels)
return spacy.load(model_folder)
return batch_labels
return _classifier
@lru_cache()
def load_nltk_punkt():
    """Ensure the NLTK ``punkt`` tokenizer data is available and return its path.

    Downloads the data on first use; result is cached per process.
    """
    import nltk

    locate = nltk.data.find
    try:
        locate('tokenizers/punkt')
    except LookupError:
        # Data is not on disk yet — fetch it once, then fall through to lookup.
        nltk.download('punkt')
    return locate('tokenizers/punkt')
def download_all_models(remove_existing=False):
"""Download all models required for Crawl4AI."""
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
load_bert_base_uncased()
print("[LOG] Downloading BGE Small EN v1.5...")
load_bge_small_en_v1_5()
print("[LOG] Downloading spaCy EN Core Web SM...")
load_spacy_en_core_web_sm()
print("[LOG] Downloading custom spaCy model...")
load_spacy_model()
print("[LOG] Downloading text classifier...")
load_text_multilabel_classifier
print("[LOG] Downloading custom NLTK Punkt model...")
load_nltk_punkt()
print("[LOG] ✅ All models downloaded successfully.")
def main():

View File

@@ -3,6 +3,33 @@ from spacy.training import Example
import random
import nltk
from nltk.corpus import reuters
import torch
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    """Persist the textcat_multilabel weights and vocab of *nlp* under *model_dir*.

    Writes ``model_weights.pth`` (a PyTorch state dict) and ``vocab.txt``
    (one ``word<TAB>index`` pair per line).
    """
    # Grab the multilabel text-categorizer pipe whose weights we persist.
    categorizer = nlp.get_pipe("textcat_multilabel")

    # Re-materialize each parameter as a torch tensor and save as a state dict.
    weights = {}
    for name, param in categorizer.model.named_parameters():
        weights[name] = torch.tensor(param.data)
    torch.save(weights, f"{model_dir}/model_weights.pth")

    # Persist the vocabulary as tab-separated word/index lines.
    vocabulary = extract_vocab(nlp)
    with open(f"{model_dir}/vocab.txt", "w") as out:
        out.writelines(f"{word}\t{idx}\n" for word, idx in vocabulary.items())

    print(f"Model weights and vocabulary saved to: {model_dir}")
def extract_vocab(nlp):
    """Return a ``word -> index`` mapping over the model's string store."""
    mapping = {}
    for index, word in enumerate(nlp.vocab.strings):
        mapping[word] = index
    return mapping
# NOTE(review): these two statements execute at import time — every import of
# this module loads "models/reuters" from disk and rewrites the torch weights
# under "models". Consider moving them under the `if __name__ == "__main__":`
# guard so importing the module stays side-effect free.
nlp = spacy.load("models/reuters")
save_spacy_model_as_torch(nlp, model_dir="models")
def train_and_save_reuters_model(model_dir="models/reuters"):
# Ensure the Reuters corpus is downloaded
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
nlp.to_disk(model_dir)
print(f"Model saved to: {model_dir}")
def load_model_and_predict(model_dir, text, tok_k = 3):
# Load the trained model from the specified directory
nlp = spacy.load(model_dir)
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
return top_categories
if __name__ == "__main__":
train_and_save_reuters_model()
train_model("models/reuters", additional_epochs=5)
@@ -119,4 +143,4 @@ if __name__ == "__main__":
print(reuters.categories())
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
r =load_model_and_predict(model_directory, example_text)
print(r)
print(r)

View File

@@ -11,7 +11,6 @@ from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
class WebCrawler:

View File

@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
print_result(result)
# Retrieve raw HTML content
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
print_result(result)
def add_chunking_strategy(crawler):
# Adding a chunking strategy: RegexChunking
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
@@ -177,9 +171,19 @@ def main():
crawler = create_crawler()
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
crawler.always_by_pass_cache = True
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
result = crawler.run(
url="https://www.nbcnews.com/business",
chunking_strategy=NlpSentenceChunking()
)
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
print_result(result)
basic_usage(crawler)
understanding_parameters(crawler)
add_chunking_strategy(crawler)

View File

@@ -1,144 +0,0 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = "en"
pipeline = ["textcat_multilabel"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}
[components]
[components.textcat_multilabel]
factory = "textcat_multilabel"
scorer = {"@scorers":"spacy.textcat_multilabel_scorer.v2"}
threshold = 0.5
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000,2000,500,1000,500]
attrs = ["NORM","LOWER","PREFIX","SUFFIX","SHAPE"]
include_static_vectors = false
[components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]

View File

@@ -1,122 +0,0 @@
{
"lang":"en",
"name":"pipeline",
"version":"0.0.0",
"spacy_version":">=3.7.4,<3.8.0",
"description":"",
"author":"",
"email":"",
"url":"",
"license":"",
"spacy_git_version":"bff8725f4",
"vectors":{
"width":0,
"vectors":0,
"keys":0,
"name":null,
"mode":"default"
},
"labels":{
"textcat_multilabel":[
"acq",
"alum",
"barley",
"bop",
"carcass",
"castor-oil",
"cocoa",
"coconut",
"coconut-oil",
"coffee",
"copper",
"copra-cake",
"corn",
"cotton",
"cotton-oil",
"cpi",
"cpu",
"crude",
"dfl",
"dlr",
"dmk",
"earn",
"fuel",
"gas",
"gnp",
"gold",
"grain",
"groundnut",
"groundnut-oil",
"heat",
"hog",
"housing",
"income",
"instal-debt",
"interest",
"ipi",
"iron-steel",
"jet",
"jobs",
"l-cattle",
"lead",
"lei",
"lin-oil",
"livestock",
"lumber",
"meal-feed",
"money-fx",
"money-supply",
"naphtha",
"nat-gas",
"nickel",
"nkr",
"nzdlr",
"oat",
"oilseed",
"orange",
"palladium",
"palm-oil",
"palmkernel",
"pet-chem",
"platinum",
"potato",
"propane",
"rand",
"rape-oil",
"rapeseed",
"reserves",
"retail",
"rice",
"rubber",
"rye",
"ship",
"silver",
"sorghum",
"soy-meal",
"soy-oil",
"soybean",
"strategic-metal",
"sugar",
"sun-meal",
"sun-oil",
"sunseed",
"tea",
"tin",
"trade",
"veg-oil",
"wheat",
"wpi",
"yen",
"zinc"
]
},
"pipeline":[
"textcat_multilabel"
],
"components":[
"textcat_multilabel"
],
"disabled":[
]
}

View File

@@ -1,95 +0,0 @@
{
"labels":[
"acq",
"alum",
"barley",
"bop",
"carcass",
"castor-oil",
"cocoa",
"coconut",
"coconut-oil",
"coffee",
"copper",
"copra-cake",
"corn",
"cotton",
"cotton-oil",
"cpi",
"cpu",
"crude",
"dfl",
"dlr",
"dmk",
"earn",
"fuel",
"gas",
"gnp",
"gold",
"grain",
"groundnut",
"groundnut-oil",
"heat",
"hog",
"housing",
"income",
"instal-debt",
"interest",
"ipi",
"iron-steel",
"jet",
"jobs",
"l-cattle",
"lead",
"lei",
"lin-oil",
"livestock",
"lumber",
"meal-feed",
"money-fx",
"money-supply",
"naphtha",
"nat-gas",
"nickel",
"nkr",
"nzdlr",
"oat",
"oilseed",
"orange",
"palladium",
"palm-oil",
"palmkernel",
"pet-chem",
"platinum",
"potato",
"propane",
"rand",
"rape-oil",
"rapeseed",
"reserves",
"retail",
"rice",
"rubber",
"rye",
"ship",
"silver",
"sorghum",
"soy-meal",
"soy-oil",
"soybean",
"strategic-metal",
"sugar",
"sun-meal",
"sun-oil",
"sunseed",
"tea",
"tin",
"trade",
"veg-oil",
"wheat",
"wpi",
"yen",
"zinc"
],
"threshold":0.5
}

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
<EFBFBD>

View File

@@ -1 +0,0 @@
<EFBFBD>

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,3 +0,0 @@
{
"mode":"default"
}

View File

@@ -14,7 +14,6 @@ requests==2.31.0
rich==13.7.1
scikit-learn==1.4.2
selenium==4.20.0
spacy==3.7.4
uvicorn==0.29.0
transformers==4.40.2
chromedriver-autoinstaller==0.6.4

View File

@@ -1,35 +0,0 @@
import os
# NOTE(review): this script targets an IPython/Colab environment — the
# leading-"!" lines are IPython shell magics and are NOT valid plain Python;
# it will only run inside a notebook cell.
def install_crawl4ai():
    """Install Crawl4AI with its system dependencies and fetch the reuters model.

    Intended for a fresh Colab session: installs spaCy with CUDA 12 support,
    Chromium + chromedriver for Selenium, the Crawl4AI package from the
    ``new-release-0.0.2`` branch, then clones the repo to copy its
    ``models/reuters`` folder into a local ``models`` directory.
    """
    print("Installing Crawl4AI and its dependencies...")
    # Install dependencies
    !pip install -U 'spacy[cuda12x]'
    !apt-get update -y
    !apt install chromium-chromedriver -y
    !pip install chromedriver_autoinstaller
    !pip install git+https://github.com/unclecode/crawl4ai.git@new-release-0.0.2
    # Install ChromeDriver matching the installed Chromium version
    import chromedriver_autoinstaller
    chromedriver_autoinstaller.install()
    # Download the reuters model by cloning the release branch of the repo
    repo_url = "https://github.com/unclecode/crawl4ai.git"
    branch = "new-release-0.0.2"
    folder_path = "models/reuters"
    !git clone -b {branch} {repo_url}
    !mkdir -p models
    repo_folder = "crawl4ai"
    source_folder = os.path.join(repo_folder, folder_path)
    destination_folder = "models"
    # Move the model folder out of the clone, then discard the clone
    !mv "{source_folder}" "{destination_folder}"
    !rm -rf "{repo_folder}"
    print("Installation and model download completed successfully!")
# Run the installer
install_crawl4ai()