Update:
- Fix Spacy model issue - Update Readme and requirements.txt
This commit is contained in:
@@ -6,6 +6,7 @@ import re
|
||||
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_spacy_en_core_web_sm
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
@@ -37,13 +38,7 @@ class RegexChunking(ChunkingStrategy):
|
||||
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
def __init__(self, model='en_core_web_sm'):
|
||||
import spacy
|
||||
try:
|
||||
self.nlp = spacy.load(model)
|
||||
except IOError:
|
||||
spacy.cli.download("en_core_web_sm")
|
||||
self.nlp = spacy.load(model)
|
||||
# raise ImportError(f"Spacy model '{model}' not found. Please download the model using 'python -m spacy download {model}'")
|
||||
self.nlp = load_spacy_en_core_web_sm()
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
doc = self.nlp(text)
|
||||
|
||||
@@ -5,7 +5,7 @@ load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
|
||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
|
||||
@@ -1,20 +1,86 @@
|
||||
from functools import lru_cache
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
import spacy
|
||||
from .utils import get_home_folder
|
||||
from pathlib import Path
|
||||
import subprocess, os
|
||||
import shutil
|
||||
from .config import MODEL_REPO_BRANCH
|
||||
|
||||
@lru_cache()
|
||||
def load_bert_base_uncased():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_bge_small_en_v1_5():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model.eval()
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_en_core_web_sm():
|
||||
import spacy
|
||||
try:
|
||||
print("[LOG] Loading spaCy model")
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
except IOError:
|
||||
print("[LOG] ⏬ Downloading spaCy model for the first time")
|
||||
spacy.cli.download("en_core_web_sm")
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
print("[LOG] ✅ spaCy model loaded successfully")
|
||||
return nlp
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
return spacy.load("models/reuters")
|
||||
import spacy
|
||||
name = "models/reuters"
|
||||
home_folder = get_home_folder()
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# Check if the model directory already exists
|
||||
if True or not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
# branch = "main"
|
||||
branch = MODEL_REPO_BRANCH
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
print("[LOG] ⏬ Downloading model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
shutil.rmtree(repo_folder)
|
||||
shutil.rmtree(model_folder)
|
||||
|
||||
try:
|
||||
# Clone the repository
|
||||
subprocess.run(
|
||||
["git", "clone", "-b", branch, repo_url, repo_folder],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
check=True
|
||||
)
|
||||
|
||||
# Create the models directory if it doesn't exist
|
||||
models_folder = os.path.join(home_folder, "models")
|
||||
os.makedirs(models_folder, exist_ok=True)
|
||||
|
||||
# Copy the reuters model folder to the models directory
|
||||
source_folder = os.path.join(repo_folder, "models/reuters")
|
||||
shutil.copytree(source_folder, model_folder)
|
||||
|
||||
# Remove the cloned repository
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
# Print completion message
|
||||
print("[LOG] ✅ Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
return spacy.load(model_folder)
|
||||
@@ -9,10 +9,19 @@ import os
|
||||
from html2text import HTML2Text
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
from pathlib import Path
|
||||
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
return home_folder
|
||||
|
||||
def beautify_html(escaped_html):
|
||||
"""
|
||||
Beautifies an escaped HTML string.
|
||||
|
||||
@@ -34,13 +34,16 @@ class WebCrawler:
|
||||
# if not db_path:
|
||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
||||
|
||||
flush_db()
|
||||
# flush_db()
|
||||
init_db()
|
||||
|
||||
self.ready = False
|
||||
|
||||
def warmup(self):
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
|
||||
|
||||
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
result = self.run(
|
||||
url='https://crawl4ai.uccode.io/',
|
||||
word_count_threshold=5,
|
||||
|
||||
Reference in New Issue
Block a user