Remove dependency on Spacy model.

2024-05-17 15:08:03 +08:00
parent f85df91ca6
commit a5f9d07dbf
18 changed files with 123 additions and 83955 deletions
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -1,12 +1,8 @@
 from abc import ABC, abstractmethod
 import re
 # spacy = lazy_import.lazy_module('spacy')
 # nl = lazy_import.lazy_module('nltk')
 # from nltk.corpus import stopwords
 # from nltk.tokenize import word_tokenize, TextTilingTokenizer
 from collections import Counter
 import string
-from .model_loader import load_spacy_en_core_web_sm
+from .model_loader import load_nltk_punkt
 # Define the abstract base class for chunking strategies
 class ChunkingStrategy(ABC):
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
            paragraphs = new_paragraphs
        return paragraphs
-# NLP-based sentence chunking using spaCy
+# NLP-based sentence chunking 
 class NlpSentenceChunking(ChunkingStrategy):
-    def __init__(self, model='en_core_web_sm'):
+    def __init__(self):
-        self.nlp = load_spacy_en_core_web_sm()
+        load_nltk_punkt()
        pass
    def chunk(self, text: str) -> list:
-        doc = self.nlp(text)
+        # Improved regex for sentence splitting
-        return [sent.text.strip() for sent in doc.sents]
+        # sentence_endings = re.compile(
        #     r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
        # )
        # sentences = sentence_endings.split(text)
        # sens =  [sent.strip() for sent in sentences if sent]
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        sens =  [sent.strip() for sent in sentences]
        # De-duplicate while keeping first-occurrence order: set() handed the
        # sentences back in arbitrary order, scrambling the text's reading order.
        return list(dict.fromkeys(sens))
 # Topic-based segmentation using TextTiling
 class TopicSegmentationChunking(ChunkingStrategy):
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
 from .config import *
 from .utils import *
 from functools import partial
-from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
+from .model_loader import *
 import numpy as np
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
            for future in as_completed(futures):
                extracted_content.extend(future.result())
        return extracted_content    
 class NoExtractionStrategy(ExtractionStrategy):
     """Pass-through strategy: returns the given content as blocks without extracting anything."""

     def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
         """Wrap ``html`` unchanged in a single block whose index is 0."""
         block = {"index": 0, "content": html}
         return [block]

     def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
         """Return one block per section, in input order, each with an empty tag list."""
         blocks = []
         for position, section in enumerate(sections):
             blocks.append({"index": position, "tags": [], "content": section})
         return blocks
-    
+   
 class LLMExtractionStrategy(ExtractionStrategy):
    def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
        """
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
        """
        super().__init__()
        from transformers import BertTokenizer, BertModel, pipeline
        from transformers import AutoTokenizer, AutoModel     
        import spacy  
        self.semantic_filter = semantic_filter
        self.word_count_threshold = word_count_threshold
        self.max_dist = max_dist
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
        elif model_name == "BAAI/bge-small-en-v1.5":
            self.tokenizer, self.model = load_bge_small_en_v1_5()
-        self.nlp = load_spacy_model()
+        self.nlp = load_text_multilabel_classifier()
        print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
        """
        Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
        # Convert filtered clusters to a sorted list of dictionaries
        cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
        labels = self.nlp([cluster['content'] for cluster in cluster_list])
        for cluster, label in zip(cluster_list, labels):
            cluster['tags'] = label
        # Process the text with the loaded model
-        for cluster in  cluster_list:
+        # for cluster in  cluster_list:
-            doc = self.nlp(cluster['content'])
+        #     cluster['tags'] = self.nlp(cluster['content'])[0]['label']
-            tok_k = self.top_k
+            # doc = self.nlp(cluster['content'])
-            top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+            # tok_k = self.top_k
-            cluster['tags'] = [cat for cat, _ in top_categories]
+            # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
            # cluster['tags'] = [cat for cat, _ in top_categories]
        # print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
    return tokenizer, model
@lru_cache()
-def load_spacy_en_core_web_sm():
+def load_text_classifier():
-    import spacy
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    try:
+    from transformers import pipeline
-        print("[LOG] Loading spaCy model")
+
-        nlp = spacy.load("en_core_web_sm")
+    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
-    except IOError:
+    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
-        print("[LOG] ⏬ Downloading spaCy model for the first time")
+    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
-        spacy.cli.download("en_core_web_sm")
+
-        nlp = spacy.load("en_core_web_sm")    
+    return pipe
    print("[LOG] ✅ spaCy model loaded successfully")
    return nlp
@lru_cache()
-def load_spacy_model():
+def load_text_multilabel_classifier():
-    import spacy
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer
-    name = "models/reuters"
+    import numpy as np
-    home_folder = get_home_folder()
+    from scipy.special import expit
-    model_folder = os.path.join(home_folder, name)
+    import torch
    # Check if the model directory already exists
    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
        # branch = "main"
        branch = MODEL_REPO_BRANCH 
        repo_folder = os.path.join(home_folder, "crawl4ai")
        model_folder = os.path.join(home_folder, name)
-        print("[LOG] ⏬ Downloading model for the first time...")
+    MODEL = "cardiffnlp/tweet-topic-21-multi"
    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
    class_mapping = model.config.id2label
-        # Remove existing repo folder if it exists
+    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
-        if Path(repo_folder).exists():
+    if torch.cuda.is_available():
-            shutil.rmtree(repo_folder)
+        device = torch.device("cuda")
-            shutil.rmtree(model_folder)
+    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
-        try:
+    model.to(device)
            # Clone the repository
            subprocess.run(
                ["git", "clone", "-b", branch, repo_url, repo_folder],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True
            )
-            # Create the models directory if it doesn't exist
+    def _classifier(texts, threshold=0.5, max_length=64):
-            models_folder = os.path.join(home_folder, "models")
+        tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
-            os.makedirs(models_folder, exist_ok=True)
+        tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tokens to the selected device
-            # Copy the reuters model folder to the models directory
+        with torch.no_grad():
-            source_folder = os.path.join(repo_folder, "models/reuters")
+            output = model(**tokens)
            shutil.copytree(source_folder, model_folder)
-            # Remove the cloned repository
+        scores = output.logits.detach().cpu().numpy()
-            shutil.rmtree(repo_folder)
+        scores = expit(scores)
        predictions = (scores >= threshold) * 1
-            # Print completion message
+        batch_labels = []
-            print("[LOG] ✅ Model downloaded successfully")
+        for prediction in predictions:
-        except subprocess.CalledProcessError as e:
+            labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
-            print(f"An error occurred while cloning the repository: {e}")
+            batch_labels.append(labels)
        except Exception as e:
            print(f"An error occurred: {e}")
-    return spacy.load(model_folder)
+        return batch_labels
    return _classifier
@lru_cache()
def load_nltk_punkt():
    """Ensure the NLTK Punkt tokenizer data is present and return its lookup result.

    The resource is looked up first; ``nltk.download`` is only invoked when that
    lookup raises ``LookupError``. The value returned is whatever
    ``nltk.data.find`` yields for the resource, and ``lru_cache`` memoizes it so
    later calls do not repeat the lookup.
    """
    import nltk

    resource = 'tokenizers/punkt'
    try:
        nltk.data.find(resource)
    except LookupError:
        # Data not installed yet: fetch it before the final lookup below.
        nltk.download('punkt')
    return nltk.data.find(resource)
 def download_all_models(remove_existing=False):
    """Download all models required for Crawl4AI."""
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
    load_bert_base_uncased()
    print("[LOG] Downloading BGE Small EN v1.5...")
    load_bge_small_en_v1_5()
-    print("[LOG] Downloading spaCy EN Core Web SM...")
+    print("[LOG] Downloading text classifier...")
-    load_spacy_en_core_web_sm()
+    load_text_multilabel_classifier()  # must be called: the bare name was a no-op, so nothing was downloaded
-    print("[LOG] Downloading custom spaCy model...")
+    print("[LOG] Downloading custom NLTK Punkt model...")
-    load_spacy_model()
+    load_nltk_punkt()
    print("[LOG] ✅ All models downloaded successfully.")
 def main():
--- a/crawl4ai/train.py
+++ b/crawl4ai/train.py
@@ -3,6 +3,33 @@ from spacy.training import Example
 import random
 import nltk
 from nltk.corpus import reuters
 import torch
 def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
     """Export the multilabel text categorizer's weights and the vocabulary to ``model_dir``.

     Two files are written: ``model_weights.pth`` (a state dictionary saved with
     ``torch.save``) and ``vocab.txt`` (one ``word<TAB>index`` pair per line).

     Args:
         nlp: Loaded pipeline that exposes a ``textcat_multilabel`` pipe.
         model_dir: Directory receiving both files; it is not created here, so it
             must already exist.
     """
     # Extract the TextCategorizer component
     textcat = nlp.get_pipe("textcat_multilabel")
     # Convert the weights to a PyTorch state dictionary
     state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
     # Save the state dictionary
     torch.save(state_dict, f"{model_dir}/model_weights.pth")
     # Extract and save the vocabulary
     vocab = extract_vocab(nlp)
     # Explicit UTF-8: vocabulary strings may contain non-ASCII characters, and the
     # platform default encoding is not guaranteed to be able to represent them.
     with open(f"{model_dir}/vocab.txt", "w", encoding="utf-8") as vocab_file:
         for word, idx in vocab.items():
             vocab_file.write(f"{word}\t{idx}\n")
     print(f"Model weights and vocabulary saved to: {model_dir}")
 def extract_vocab(nlp):
     """Map every string in ``nlp.vocab.strings`` to its position in that sequence.

     When a string occurs more than once, the position of its last occurrence is kept.
     """
     vocab = {}
     for position, token in enumerate(nlp.vocab.strings):
         vocab[token] = position
     return vocab
 # NOTE(review): these two statements appear to sit at module level, so they would run
 # whenever this module is imported, loading "models/reuters" and writing the export
 # into "models" (not "models/reuters") — confirm both are intended.
 nlp = spacy.load("models/reuters")
 save_spacy_model_as_torch(nlp, model_dir="models")
 def train_and_save_reuters_model(model_dir="models/reuters"):
    # Ensure the Reuters corpus is downloaded
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
    nlp.to_disk(model_dir)
    print(f"Model saved to: {model_dir}")
 def load_model_and_predict(model_dir, text, tok_k = 3):
    # Load the trained model from the specified directory
    nlp = spacy.load(model_dir)
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
    return top_categories    
 if __name__ == "__main__":
    train_and_save_reuters_model()
    train_model("models/reuters", additional_epochs=5)
@@ -119,4 +143,4 @@ if __name__ == "__main__":
    print(reuters.categories())
    example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
    r =load_model_and_predict(model_directory, example_text)
-    print(r)
+    print(r)
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -11,7 +11,6 @@ from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
 from .config import *
 # from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
 class WebCrawler:
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
    print_result(result)
    # Retrieve raw HTML content
    cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
    result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
    cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
    print_result(result)
 def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
@@ -177,9 +171,19 @@ def main():
    crawler = create_crawler()
-    cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
+    cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
    crawler.always_by_pass_cache = True
    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
    basic_usage(crawler)
    understanding_parameters(crawler)
    add_chunking_strategy(crawler)
--- a/models/reuters/config.cfg
+++ b/models/reuters/config.cfg
@@ -1,144 +0,0 @@
 [paths]
 train = null
 dev = null
 vectors = null
 init_tok2vec = null
 [system]
 seed = 0
 gpu_allocator = null
 [nlp]
 lang = "en"
 pipeline = ["textcat_multilabel"]
 disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
 batch_size = 1000
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 [components]
 [components.textcat_multilabel]
 factory = "textcat_multilabel"
 scorer = {"@scorers":"spacy.textcat_multilabel_scorer.v2"}
 threshold = 0.5
 [components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 [components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 nO = null
 [components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
 [components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000,2000,500,1000,500]
 attrs = ["NORM","LOWER","PREFIX","SUFFIX","SHAPE"]
 include_static_vectors = false
 [components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 64
 window_size = 1
 maxout_pieces = 3
 depth = 2
 [corpora]
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 gold_preproc = false
 max_length = 0
 limit = 0
 augmenter = null
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 gold_preproc = false
 max_length = 0
 limit = 0
 augmenter = null
 [training]
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 patience = 1600
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
 frozen_components = []
 annotating_components = []
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 before_to_disk = null
 before_update = null
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [training.logger]
@loggers = "spacy.ConsoleLogger.v1"
 progress_bar = false
 [training.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [training.score_weights]
 cats_score = 1.0
 cats_score_desc = null
 cats_micro_p = null
 cats_micro_r = null
 cats_micro_f = null
 cats_macro_p = null
 cats_macro_r = null
 cats_macro_f = null
 cats_macro_auc = null
 cats_f_per_type = null
 [pretraining]
 [initialize]
 vectors = ${paths.vectors}
 init_tok2vec = ${paths.init_tok2vec}
 vocab_data = null
 lookups = null
 before_init = null
 after_init = null
 [initialize.components]
 [initialize.tokenizer]
--- a/models/reuters/meta.json
+++ b/models/reuters/meta.json
@@ -1,122 +0,0 @@
 {
  "lang":"en",
  "name":"pipeline",
  "version":"0.0.0",
  "spacy_version":">=3.7.4,<3.8.0",
  "description":"",
  "author":"",
  "email":"",
  "url":"",
  "license":"",
  "spacy_git_version":"bff8725f4",
  "vectors":{
    "width":0,
    "vectors":0,
    "keys":0,
    "name":null,
    "mode":"default"
  },
  "labels":{
    "textcat_multilabel":[
      "acq",
      "alum",
      "barley",
      "bop",
      "carcass",
      "castor-oil",
      "cocoa",
      "coconut",
      "coconut-oil",
      "coffee",
      "copper",
      "copra-cake",
      "corn",
      "cotton",
      "cotton-oil",
      "cpi",
      "cpu",
      "crude",
      "dfl",
      "dlr",
      "dmk",
      "earn",
      "fuel",
      "gas",
      "gnp",
      "gold",
      "grain",
      "groundnut",
      "groundnut-oil",
      "heat",
      "hog",
      "housing",
      "income",
      "instal-debt",
      "interest",
      "ipi",
      "iron-steel",
      "jet",
      "jobs",
      "l-cattle",
      "lead",
      "lei",
      "lin-oil",
      "livestock",
      "lumber",
      "meal-feed",
      "money-fx",
      "money-supply",
      "naphtha",
      "nat-gas",
      "nickel",
      "nkr",
      "nzdlr",
      "oat",
      "oilseed",
      "orange",
      "palladium",
      "palm-oil",
      "palmkernel",
      "pet-chem",
      "platinum",
      "potato",
      "propane",
      "rand",
      "rape-oil",
      "rapeseed",
      "reserves",
      "retail",
      "rice",
      "rubber",
      "rye",
      "ship",
      "silver",
      "sorghum",
      "soy-meal",
      "soy-oil",
      "soybean",
      "strategic-metal",
      "sugar",
      "sun-meal",
      "sun-oil",
      "sunseed",
      "tea",
      "tin",
      "trade",
      "veg-oil",
      "wheat",
      "wpi",
      "yen",
      "zinc"
    ]
  },
  "pipeline":[
    "textcat_multilabel"
  ],
  "components":[
    "textcat_multilabel"
  ],
  "disabled":[
  ]
 }
--- a/models/reuters/textcat_multilabel/cfg
+++ b/models/reuters/textcat_multilabel/cfg
@@ -1,95 +0,0 @@
 {
  "labels":[
    "acq",
    "alum",
    "barley",
    "bop",
    "carcass",
    "castor-oil",
    "cocoa",
    "coconut",
    "coconut-oil",
    "coffee",
    "copper",
    "copra-cake",
    "corn",
    "cotton",
    "cotton-oil",
    "cpi",
    "cpu",
    "crude",
    "dfl",
    "dlr",
    "dmk",
    "earn",
    "fuel",
    "gas",
    "gnp",
    "gold",
    "grain",
    "groundnut",
    "groundnut-oil",
    "heat",
    "hog",
    "housing",
    "income",
    "instal-debt",
    "interest",
    "ipi",
    "iron-steel",
    "jet",
    "jobs",
    "l-cattle",
    "lead",
    "lei",
    "lin-oil",
    "livestock",
    "lumber",
    "meal-feed",
    "money-fx",
    "money-supply",
    "naphtha",
    "nat-gas",
    "nickel",
    "nkr",
    "nzdlr",
    "oat",
    "oilseed",
    "orange",
    "palladium",
    "palm-oil",
    "palmkernel",
    "pet-chem",
    "platinum",
    "potato",
    "propane",
    "rand",
    "rape-oil",
    "rapeseed",
    "reserves",
    "retail",
    "rice",
    "rubber",
    "rye",
    "ship",
    "silver",
    "sorghum",
    "soy-meal",
    "soy-oil",
    "soybean",
    "strategic-metal",
    "sugar",
    "sun-meal",
    "sun-oil",
    "sunseed",
    "tea",
    "tin",
    "trade",
    "veg-oil",
    "wheat",
    "wpi",
    "yen",
    "zinc"
  ],
  "threshold":0.5
 }
--- a/models/reuters/textcat_multilabel/model
+++ b/models/reuters/textcat_multilabel/model
--- a/models/reuters/tokenizer
+++ b/models/reuters/tokenizer
--- a/models/reuters/vocab/key2row
+++ b/models/reuters/vocab/key2row
@@ -1 +0,0 @@
 <EFBFBD>
--- a/models/reuters/vocab/lookups.bin
+++ b/models/reuters/vocab/lookups.bin
@@ -1 +0,0 @@
 <EFBFBD>
--- a/models/reuters/vocab/strings.json
+++ b/models/reuters/vocab/strings.json
--- a/models/reuters/vocab/vectors
+++ b/models/reuters/vocab/vectors
--- a/models/reuters/vocab/vectors.cfg
+++ b/models/reuters/vocab/vectors.cfg
@@ -1,3 +0,0 @@
 {
  "mode":"default"
 }
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,6 @@ requests==2.31.0
 rich==13.7.1
 scikit-learn==1.4.2
 selenium==4.20.0
 spacy==3.7.4
 uvicorn==0.29.0
 transformers==4.40.2
 chromedriver-autoinstaller==0.6.4
--- a/setup_colab.py
+++ b/setup_colab.py
@@ -1,35 +0,0 @@
 import os
 def install_crawl4ai():
    print("Installing Crawl4AI and its dependencies...")
    # Install dependencies
    !pip install -U 'spacy[cuda12x]'
    !apt-get update -y
    !apt install chromium-chromedriver -y
    !pip install chromedriver_autoinstaller
    !pip install git+https://github.com/unclecode/crawl4ai.git@new-release-0.0.2
    # Install ChromeDriver
    import chromedriver_autoinstaller
    chromedriver_autoinstaller.install()
    # Download the reuters model
    repo_url = "https://github.com/unclecode/crawl4ai.git"
    branch = "new-release-0.0.2"
    folder_path = "models/reuters"
    !git clone -b {branch} {repo_url}
    !mkdir -p models
    repo_folder = "crawl4ai"
    source_folder = os.path.join(repo_folder, folder_path)
    destination_folder = "models"
    !mv "{source_folder}" "{destination_folder}"
    !rm -rf "{repo_folder}"
    print("Installation and model download completed successfully!")
 # Run the installer
 install_crawl4ai()