- Fix spaCy model loading issue
- Update Readme and requirements.txt
This commit is contained in:
unclecode
2024-05-16 19:50:20 +08:00
parent 6a6365ae0a
commit c8589f8da3
8 changed files with 137 additions and 70 deletions

View File

@@ -6,6 +6,7 @@ import re
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
from collections import Counter
import string
from .model_loader import load_spacy_en_core_web_sm
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
@@ -37,13 +38,7 @@ class RegexChunking(ChunkingStrategy):
class NlpSentenceChunking(ChunkingStrategy):
def __init__(self, model='en_core_web_sm'):
    """Initialize the sentence chunker with a spaCy pipeline.

    Args:
        model: Kept for backward compatibility. NOTE(review): the loader
            below always loads 'en_core_web_sm'; this argument is currently
            ignored — confirm whether other models should be supported.
    """
    # Delegates loading (and first-time download) to the shared,
    # cached loader instead of duplicating spacy.load/download logic here.
    self.nlp = load_spacy_en_core_web_sm()
def chunk(self, text: str) -> list:
doc = self.nlp(text)

View File

@@ -5,7 +5,7 @@ load_dotenv() # Load environment variables from .env file
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token

View File

@@ -1,20 +1,86 @@
from functools import lru_cache
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import spacy
from .utils import get_home_folder
from pathlib import Path
import subprocess, os
import shutil
from .config import MODEL_REPO_BRANCH
@lru_cache()
def load_bert_base_uncased():
    """Load and cache the 'bert-base-uncased' tokenizer and model.

    Cached with lru_cache so the (expensive) HuggingFace load happens
    only once per process.

    Returns:
        tuple: (BertTokenizer, BertModel) for 'bert-base-uncased'.
    """
    # transformers is already imported at module level; the redundant
    # function-level import was removed.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
    return tokenizer, model
@lru_cache()
def load_bge_small_en_v1_5():
    """Load and cache the 'BAAI/bge-small-en-v1.5' tokenizer and model.

    Cached with lru_cache so the (expensive) HuggingFace load happens
    only once per process.

    Returns:
        tuple: (AutoTokenizer, AutoModel) with the model set to eval
        (inference) mode.
    """
    # transformers is already imported at module level; the redundant
    # function-level import was removed.
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model.eval()
    return tokenizer, model
@lru_cache()
def load_spacy_en_core_web_sm():
    """Load spaCy's 'en_core_web_sm' pipeline, downloading it on first use.

    Cached with lru_cache so repeated callers share one pipeline instance.

    Returns:
        The loaded spaCy Language pipeline.
    """
    import spacy  # deferred: spaCy is heavy and only needed by this loader
    try:
        print("[LOG] Loading spaCy model")
        nlp = spacy.load("en_core_web_sm")
    except IOError:
        # Model not installed yet — fetch it once, then load.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
    # Success log fires on both the cached-on-disk and freshly-downloaded paths.
    print("[LOG] ✅ spaCy model loaded successfully")
    return nlp
@lru_cache()
def load_spacy_model():
    """Load the project's custom 'reuters' spaCy model.

    On first use the model is fetched by cloning the crawl4ai repository
    (branch MODEL_REPO_BRANCH) and copying models/reuters into the user's
    home model folder; subsequent calls load it straight from disk.

    Returns:
        The spaCy Language pipeline loaded from the local model folder.
    """
    import spacy  # deferred: spaCy is heavy and only needed by this loader
    name = "models/reuters"
    home_folder = get_home_folder()
    model_folder = os.path.join(home_folder, name)

    # Download ONLY when the model folder is absent or empty.
    # (Fixed: a leftover `if True or ...` debug guard forced a re-download
    # on every cold call.)
    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
        branch = MODEL_REPO_BRANCH
        repo_folder = os.path.join(home_folder, "crawl4ai")
        print("[LOG] ⏬ Downloading model for the first time...")

        # Remove any stale clone / partial model before re-fetching.
        # (Fixed: rmtree(model_folder) previously ran unguarded and raised
        # when the folder did not exist.)
        if Path(repo_folder).exists():
            shutil.rmtree(repo_folder)
        if Path(model_folder).exists():
            shutil.rmtree(model_folder)

        try:
            # Clone the repository quietly; check=True surfaces git failures.
            subprocess.run(
                ["git", "clone", "-b", branch, repo_url, repo_folder],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
            # Ensure the parent models directory exists.
            models_folder = os.path.join(home_folder, "models")
            os.makedirs(models_folder, exist_ok=True)
            # Copy the reuters model folder out of the clone, then drop the clone.
            source_folder = os.path.join(repo_folder, "models/reuters")
            shutil.copytree(source_folder, model_folder)
            shutil.rmtree(repo_folder)
            print("[LOG] ✅ Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")

    return spacy.load(model_folder)

View File

@@ -9,10 +9,19 @@ import os
from html2text import HTML2Text
from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
from pathlib import Path
class InvalidCSSSelectorError(Exception):
    # Raised when an invalid CSS selector is encountered.
    pass
def get_home_folder():
    """Return the per-user ~/.crawl4ai folder, creating it on first use.

    Also ensures the 'cache' and 'models' subfolders exist.
    """
    base = os.path.join(Path.home(), ".crawl4ai")
    for sub in ("", "cache", "models"):
        os.makedirs(os.path.join(base, sub), exist_ok=True)
    return base
def beautify_html(escaped_html):
"""
Beautifies an escaped HTML string.

View File

@@ -34,13 +34,16 @@ class WebCrawler:
# if not db_path:
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
flush_db()
# flush_db()
init_db()
self.ready = False
def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler")
print("[LOG] 🌤️ Warming up the WebCrawler")
result = self.run(
url='https://crawl4ai.uccode.io/',
word_count_threshold=5,