Remove dependency on spaCy model.
This commit is contained in:
@@ -1,12 +1,8 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
import re
|
import re
|
||||||
# spacy = lazy_import.lazy_module('spacy')
|
|
||||||
# nl = lazy_import.lazy_module('nltk')
|
|
||||||
# from nltk.corpus import stopwords
|
|
||||||
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import string
|
import string
|
||||||
from .model_loader import load_spacy_en_core_web_sm
|
from .model_loader import load_nltk_punkt
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
|
|||||||
paragraphs = new_paragraphs
|
paragraphs = new_paragraphs
|
||||||
return paragraphs
|
return paragraphs
|
||||||
|
|
||||||
# NLP-based sentence chunking using spaCy
|
# NLP-based sentence chunking
|
||||||
|
|
||||||
class NlpSentenceChunking(ChunkingStrategy):
|
class NlpSentenceChunking(ChunkingStrategy):
|
||||||
def __init__(self, model='en_core_web_sm'):
|
def __init__(self):
|
||||||
self.nlp = load_spacy_en_core_web_sm()
|
load_nltk_punkt()
|
||||||
|
pass
|
||||||
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
doc = self.nlp(text)
|
# Improved regex for sentence splitting
|
||||||
return [sent.text.strip() for sent in doc.sents]
|
# sentence_endings = re.compile(
|
||||||
|
# r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][A-Z]\.)(?<![A-Za-z]\.)(?<=\.|\?|\!|\n)\s'
|
||||||
|
# )
|
||||||
|
# sentences = sentence_endings.split(text)
|
||||||
|
# sens = [sent.strip() for sent in sentences if sent]
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
sentences = sent_tokenize(text)
|
||||||
|
sens = [sent.strip() for sent in sentences]
|
||||||
|
|
||||||
|
return list(set(sens))
|
||||||
|
|
||||||
# Topic-based segmentation using TextTiling
|
# Topic-based segmentation using TextTiling
|
||||||
class TopicSegmentationChunking(ChunkingStrategy):
|
class TopicSegmentationChunking(ChunkingStrategy):
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
|
|||||||
from .config import *
|
from .config import *
|
||||||
from .utils import *
|
from .utils import *
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
from .model_loader import *
|
||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
|
|||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
extracted_content.extend(future.result())
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|
||||||
class NoExtractionStrategy(ExtractionStrategy):
|
class NoExtractionStrategy(ExtractionStrategy):
|
||||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||||
return [{"index": 0, "content": html}]
|
return [{"index": 0, "content": html}]
|
||||||
|
|
||||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||||
return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
|
return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
|
||||||
|
|
||||||
class LLMExtractionStrategy(ExtractionStrategy):
|
class LLMExtractionStrategy(ExtractionStrategy):
|
||||||
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
|
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
from transformers import BertTokenizer, BertModel, pipeline
|
|
||||||
from transformers import AutoTokenizer, AutoModel
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
self.semantic_filter = semantic_filter
|
self.semantic_filter = semantic_filter
|
||||||
self.word_count_threshold = word_count_threshold
|
self.word_count_threshold = word_count_threshold
|
||||||
self.max_dist = max_dist
|
self.max_dist = max_dist
|
||||||
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||||
|
|
||||||
self.nlp = load_spacy_model()
|
self.nlp = load_text_multilabel_classifier()
|
||||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||||
|
|
||||||
|
|
||||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||||
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
# Convert filtered clusters to a sorted list of dictionaries
|
# Convert filtered clusters to a sorted list of dictionaries
|
||||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||||
|
|
||||||
|
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||||
|
|
||||||
|
for cluster, label in zip(cluster_list, labels):
|
||||||
|
cluster['tags'] = label
|
||||||
|
|
||||||
# Process the text with the loaded model
|
# Process the text with the loaded model
|
||||||
for cluster in cluster_list:
|
# for cluster in cluster_list:
|
||||||
doc = self.nlp(cluster['content'])
|
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
|
||||||
tok_k = self.top_k
|
# doc = self.nlp(cluster['content'])
|
||||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
# tok_k = self.top_k
|
||||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||||
|
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||||
|
|
||||||
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||||
|
|
||||||
|
|||||||
@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
|
|||||||
return tokenizer, model
|
return tokenizer, model
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def load_spacy_en_core_web_sm():
|
def load_text_classifier():
|
||||||
import spacy
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||||
try:
|
from transformers import pipeline
|
||||||
print("[LOG] Loading spaCy model")
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||||
except IOError:
|
model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||||
print("[LOG] ⏬ Downloading spaCy model for the first time")
|
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||||
spacy.cli.download("en_core_web_sm")
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
return pipe
|
||||||
print("[LOG] ✅ spaCy model loaded successfully")
|
|
||||||
return nlp
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def load_spacy_model():
|
def load_text_multilabel_classifier():
|
||||||
import spacy
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||||
name = "models/reuters"
|
import numpy as np
|
||||||
home_folder = get_home_folder()
|
from scipy.special import expit
|
||||||
model_folder = os.path.join(home_folder, name)
|
import torch
|
||||||
|
|
||||||
# Check if the model directory already exists
|
|
||||||
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
|
|
||||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
|
||||||
# branch = "main"
|
|
||||||
branch = MODEL_REPO_BRANCH
|
|
||||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
|
||||||
model_folder = os.path.join(home_folder, name)
|
|
||||||
|
|
||||||
print("[LOG] ⏬ Downloading model for the first time...")
|
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||||
|
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||||
|
class_mapping = model.config.id2label
|
||||||
|
|
||||||
# Remove existing repo folder if it exists
|
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||||
if Path(repo_folder).exists():
|
if torch.cuda.is_available():
|
||||||
shutil.rmtree(repo_folder)
|
device = torch.device("cuda")
|
||||||
shutil.rmtree(model_folder)
|
elif torch.backends.mps.is_available():
|
||||||
|
device = torch.device("mps")
|
||||||
|
else:
|
||||||
|
device = torch.device("cpu")
|
||||||
|
|
||||||
try:
|
model.to(device)
|
||||||
# Clone the repository
|
|
||||||
subprocess.run(
|
|
||||||
["git", "clone", "-b", branch, repo_url, repo_folder],
|
|
||||||
stdout=subprocess.DEVNULL,
|
|
||||||
stderr=subprocess.DEVNULL,
|
|
||||||
check=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create the models directory if it doesn't exist
|
def _classifier(texts, threshold=0.5, max_length=64):
|
||||||
models_folder = os.path.join(home_folder, "models")
|
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
|
||||||
os.makedirs(models_folder, exist_ok=True)
|
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
|
||||||
|
|
||||||
# Copy the reuters model folder to the models directory
|
with torch.no_grad():
|
||||||
source_folder = os.path.join(repo_folder, "models/reuters")
|
output = model(**tokens)
|
||||||
shutil.copytree(source_folder, model_folder)
|
|
||||||
|
|
||||||
# Remove the cloned repository
|
scores = output.logits.detach().cpu().numpy()
|
||||||
shutil.rmtree(repo_folder)
|
scores = expit(scores)
|
||||||
|
predictions = (scores >= threshold) * 1
|
||||||
|
|
||||||
# Print completion message
|
batch_labels = []
|
||||||
print("[LOG] ✅ Model downloaded successfully")
|
for prediction in predictions:
|
||||||
except subprocess.CalledProcessError as e:
|
labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
|
||||||
print(f"An error occurred while cloning the repository: {e}")
|
batch_labels.append(labels)
|
||||||
except Exception as e:
|
|
||||||
print(f"An error occurred: {e}")
|
|
||||||
|
|
||||||
return spacy.load(model_folder)
|
return batch_labels
|
||||||
|
|
||||||
|
return _classifier
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def load_nltk_punkt():
|
||||||
|
import nltk
|
||||||
|
try:
|
||||||
|
nltk.data.find('tokenizers/punkt')
|
||||||
|
except LookupError:
|
||||||
|
nltk.download('punkt')
|
||||||
|
return nltk.data.find('tokenizers/punkt')
|
||||||
|
|
||||||
def download_all_models(remove_existing=False):
|
def download_all_models(remove_existing=False):
|
||||||
"""Download all models required for Crawl4AI."""
|
"""Download all models required for Crawl4AI."""
|
||||||
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
|
|||||||
load_bert_base_uncased()
|
load_bert_base_uncased()
|
||||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||||
load_bge_small_en_v1_5()
|
load_bge_small_en_v1_5()
|
||||||
print("[LOG] Downloading spaCy EN Core Web SM...")
|
print("[LOG] Downloading text classifier...")
|
||||||
load_spacy_en_core_web_sm()
|
load_text_multilabel_classifier
|
||||||
print("[LOG] Downloading custom spaCy model...")
|
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||||
load_spacy_model()
|
load_nltk_punkt()
|
||||||
print("[LOG] ✅ All models downloaded successfully.")
|
print("[LOG] ✅ All models downloaded successfully.")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -3,6 +3,33 @@ from spacy.training import Example
|
|||||||
import random
|
import random
|
||||||
import nltk
|
import nltk
|
||||||
from nltk.corpus import reuters
|
from nltk.corpus import reuters
|
||||||
|
import torch
|
||||||
|
|
||||||
|
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
|
||||||
|
# Extract the TextCategorizer component
|
||||||
|
textcat = nlp.get_pipe("textcat_multilabel")
|
||||||
|
|
||||||
|
# Convert the weights to a PyTorch state dictionary
|
||||||
|
state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}
|
||||||
|
|
||||||
|
# Save the state dictionary
|
||||||
|
torch.save(state_dict, f"{model_dir}/model_weights.pth")
|
||||||
|
|
||||||
|
# Extract and save the vocabulary
|
||||||
|
vocab = extract_vocab(nlp)
|
||||||
|
with open(f"{model_dir}/vocab.txt", "w") as vocab_file:
|
||||||
|
for word, idx in vocab.items():
|
||||||
|
vocab_file.write(f"{word}\t{idx}\n")
|
||||||
|
|
||||||
|
print(f"Model weights and vocabulary saved to: {model_dir}")
|
||||||
|
|
||||||
|
def extract_vocab(nlp):
|
||||||
|
# Extract vocabulary from the SpaCy model
|
||||||
|
vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
|
||||||
|
return vocab
|
||||||
|
|
||||||
|
nlp = spacy.load("models/reuters")
|
||||||
|
save_spacy_model_as_torch(nlp, model_dir="models")
|
||||||
|
|
||||||
def train_and_save_reuters_model(model_dir="models/reuters"):
|
def train_and_save_reuters_model(model_dir="models/reuters"):
|
||||||
# Ensure the Reuters corpus is downloaded
|
# Ensure the Reuters corpus is downloaded
|
||||||
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
|
|||||||
nlp.to_disk(model_dir)
|
nlp.to_disk(model_dir)
|
||||||
print(f"Model saved to: {model_dir}")
|
print(f"Model saved to: {model_dir}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_model_and_predict(model_dir, text, tok_k = 3):
|
def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||||
# Load the trained model from the specified directory
|
# Load the trained model from the specified directory
|
||||||
nlp = spacy.load(model_dir)
|
nlp = spacy.load(model_dir)
|
||||||
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
|
|||||||
|
|
||||||
return top_categories
|
return top_categories
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
train_and_save_reuters_model()
|
train_and_save_reuters_model()
|
||||||
train_model("models/reuters", additional_epochs=5)
|
train_model("models/reuters", additional_epochs=5)
|
||||||
@@ -119,4 +143,4 @@ if __name__ == "__main__":
|
|||||||
print(reuters.categories())
|
print(reuters.categories())
|
||||||
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
|
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
|
||||||
r =load_model_and_predict(model_directory, example_text)
|
r =load_model_and_predict(model_directory, example_text)
|
||||||
print(r)
|
print(r)
|
||||||
@@ -11,7 +11,6 @@ from .crawler_strategy import *
|
|||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from .config import *
|
from .config import *
|
||||||
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
|
||||||
|
|
||||||
|
|
||||||
class WebCrawler:
|
class WebCrawler:
|
||||||
|
|||||||
@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
|
|||||||
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
|
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
# Retrieve raw HTML content
|
|
||||||
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
|
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
|
||||||
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
|
|
||||||
print_result(result)
|
|
||||||
|
|
||||||
def add_chunking_strategy(crawler):
|
def add_chunking_strategy(crawler):
|
||||||
# Adding a chunking strategy: RegexChunking
|
# Adding a chunking strategy: RegexChunking
|
||||||
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
||||||
@@ -177,9 +171,19 @@ def main():
|
|||||||
|
|
||||||
crawler = create_crawler()
|
crawler = create_crawler()
|
||||||
|
|
||||||
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
|
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
|
||||||
crawler.always_by_pass_cache = True
|
crawler.always_by_pass_cache = True
|
||||||
|
|
||||||
|
|
||||||
|
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
|
||||||
|
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
|
||||||
|
result = crawler.run(
|
||||||
|
url="https://www.nbcnews.com/business",
|
||||||
|
chunking_strategy=NlpSentenceChunking()
|
||||||
|
)
|
||||||
|
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
|
||||||
|
print_result(result)
|
||||||
|
|
||||||
basic_usage(crawler)
|
basic_usage(crawler)
|
||||||
understanding_parameters(crawler)
|
understanding_parameters(crawler)
|
||||||
add_chunking_strategy(crawler)
|
add_chunking_strategy(crawler)
|
||||||
|
|||||||
@@ -1,144 +0,0 @@
|
|||||||
[paths]
|
|
||||||
train = null
|
|
||||||
dev = null
|
|
||||||
vectors = null
|
|
||||||
init_tok2vec = null
|
|
||||||
|
|
||||||
[system]
|
|
||||||
seed = 0
|
|
||||||
gpu_allocator = null
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "en"
|
|
||||||
pipeline = ["textcat_multilabel"]
|
|
||||||
disabled = []
|
|
||||||
before_creation = null
|
|
||||||
after_creation = null
|
|
||||||
after_pipeline_creation = null
|
|
||||||
batch_size = 1000
|
|
||||||
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
|
||||||
vectors = {"@vectors":"spacy.Vectors.v1"}
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.textcat_multilabel]
|
|
||||||
factory = "textcat_multilabel"
|
|
||||||
scorer = {"@scorers":"spacy.textcat_multilabel_scorer.v2"}
|
|
||||||
threshold = 0.5
|
|
||||||
|
|
||||||
[components.textcat_multilabel.model]
|
|
||||||
@architectures = "spacy.TextCatEnsemble.v2"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.textcat_multilabel.model.linear_model]
|
|
||||||
@architectures = "spacy.TextCatBOW.v3"
|
|
||||||
exclusive_classes = false
|
|
||||||
length = 262144
|
|
||||||
ngram_size = 1
|
|
||||||
no_output_layer = false
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.textcat_multilabel.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2Vec.v2"
|
|
||||||
|
|
||||||
[components.textcat_multilabel.model.tok2vec.embed]
|
|
||||||
@architectures = "spacy.MultiHashEmbed.v2"
|
|
||||||
width = 64
|
|
||||||
rows = [2000,2000,500,1000,500]
|
|
||||||
attrs = ["NORM","LOWER","PREFIX","SUFFIX","SHAPE"]
|
|
||||||
include_static_vectors = false
|
|
||||||
|
|
||||||
[components.textcat_multilabel.model.tok2vec.encode]
|
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
|
||||||
width = 64
|
|
||||||
window_size = 1
|
|
||||||
maxout_pieces = 3
|
|
||||||
depth = 2
|
|
||||||
|
|
||||||
[corpora]
|
|
||||||
|
|
||||||
[corpora.dev]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.dev}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 0
|
|
||||||
limit = 0
|
|
||||||
augmenter = null
|
|
||||||
|
|
||||||
[corpora.train]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.train}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 0
|
|
||||||
limit = 0
|
|
||||||
augmenter = null
|
|
||||||
|
|
||||||
[training]
|
|
||||||
seed = ${system.seed}
|
|
||||||
gpu_allocator = ${system.gpu_allocator}
|
|
||||||
dropout = 0.1
|
|
||||||
accumulate_gradient = 1
|
|
||||||
patience = 1600
|
|
||||||
max_epochs = 0
|
|
||||||
max_steps = 20000
|
|
||||||
eval_frequency = 200
|
|
||||||
frozen_components = []
|
|
||||||
annotating_components = []
|
|
||||||
dev_corpus = "corpora.dev"
|
|
||||||
train_corpus = "corpora.train"
|
|
||||||
before_to_disk = null
|
|
||||||
before_update = null
|
|
||||||
|
|
||||||
[training.batcher]
|
|
||||||
@batchers = "spacy.batch_by_words.v1"
|
|
||||||
discard_oversize = false
|
|
||||||
tolerance = 0.2
|
|
||||||
get_length = null
|
|
||||||
|
|
||||||
[training.batcher.size]
|
|
||||||
@schedules = "compounding.v1"
|
|
||||||
start = 100
|
|
||||||
stop = 1000
|
|
||||||
compound = 1.001
|
|
||||||
t = 0.0
|
|
||||||
|
|
||||||
[training.logger]
|
|
||||||
@loggers = "spacy.ConsoleLogger.v1"
|
|
||||||
progress_bar = false
|
|
||||||
|
|
||||||
[training.optimizer]
|
|
||||||
@optimizers = "Adam.v1"
|
|
||||||
beta1 = 0.9
|
|
||||||
beta2 = 0.999
|
|
||||||
L2_is_weight_decay = true
|
|
||||||
L2 = 0.01
|
|
||||||
grad_clip = 1.0
|
|
||||||
use_averages = false
|
|
||||||
eps = 0.00000001
|
|
||||||
learn_rate = 0.001
|
|
||||||
|
|
||||||
[training.score_weights]
|
|
||||||
cats_score = 1.0
|
|
||||||
cats_score_desc = null
|
|
||||||
cats_micro_p = null
|
|
||||||
cats_micro_r = null
|
|
||||||
cats_micro_f = null
|
|
||||||
cats_macro_p = null
|
|
||||||
cats_macro_r = null
|
|
||||||
cats_macro_f = null
|
|
||||||
cats_macro_auc = null
|
|
||||||
cats_f_per_type = null
|
|
||||||
|
|
||||||
[pretraining]
|
|
||||||
|
|
||||||
[initialize]
|
|
||||||
vectors = ${paths.vectors}
|
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
|
||||||
vocab_data = null
|
|
||||||
lookups = null
|
|
||||||
before_init = null
|
|
||||||
after_init = null
|
|
||||||
|
|
||||||
[initialize.components]
|
|
||||||
|
|
||||||
[initialize.tokenizer]
|
|
||||||
@@ -1,122 +0,0 @@
|
|||||||
{
|
|
||||||
"lang":"en",
|
|
||||||
"name":"pipeline",
|
|
||||||
"version":"0.0.0",
|
|
||||||
"spacy_version":">=3.7.4,<3.8.0",
|
|
||||||
"description":"",
|
|
||||||
"author":"",
|
|
||||||
"email":"",
|
|
||||||
"url":"",
|
|
||||||
"license":"",
|
|
||||||
"spacy_git_version":"bff8725f4",
|
|
||||||
"vectors":{
|
|
||||||
"width":0,
|
|
||||||
"vectors":0,
|
|
||||||
"keys":0,
|
|
||||||
"name":null,
|
|
||||||
"mode":"default"
|
|
||||||
},
|
|
||||||
"labels":{
|
|
||||||
"textcat_multilabel":[
|
|
||||||
"acq",
|
|
||||||
"alum",
|
|
||||||
"barley",
|
|
||||||
"bop",
|
|
||||||
"carcass",
|
|
||||||
"castor-oil",
|
|
||||||
"cocoa",
|
|
||||||
"coconut",
|
|
||||||
"coconut-oil",
|
|
||||||
"coffee",
|
|
||||||
"copper",
|
|
||||||
"copra-cake",
|
|
||||||
"corn",
|
|
||||||
"cotton",
|
|
||||||
"cotton-oil",
|
|
||||||
"cpi",
|
|
||||||
"cpu",
|
|
||||||
"crude",
|
|
||||||
"dfl",
|
|
||||||
"dlr",
|
|
||||||
"dmk",
|
|
||||||
"earn",
|
|
||||||
"fuel",
|
|
||||||
"gas",
|
|
||||||
"gnp",
|
|
||||||
"gold",
|
|
||||||
"grain",
|
|
||||||
"groundnut",
|
|
||||||
"groundnut-oil",
|
|
||||||
"heat",
|
|
||||||
"hog",
|
|
||||||
"housing",
|
|
||||||
"income",
|
|
||||||
"instal-debt",
|
|
||||||
"interest",
|
|
||||||
"ipi",
|
|
||||||
"iron-steel",
|
|
||||||
"jet",
|
|
||||||
"jobs",
|
|
||||||
"l-cattle",
|
|
||||||
"lead",
|
|
||||||
"lei",
|
|
||||||
"lin-oil",
|
|
||||||
"livestock",
|
|
||||||
"lumber",
|
|
||||||
"meal-feed",
|
|
||||||
"money-fx",
|
|
||||||
"money-supply",
|
|
||||||
"naphtha",
|
|
||||||
"nat-gas",
|
|
||||||
"nickel",
|
|
||||||
"nkr",
|
|
||||||
"nzdlr",
|
|
||||||
"oat",
|
|
||||||
"oilseed",
|
|
||||||
"orange",
|
|
||||||
"palladium",
|
|
||||||
"palm-oil",
|
|
||||||
"palmkernel",
|
|
||||||
"pet-chem",
|
|
||||||
"platinum",
|
|
||||||
"potato",
|
|
||||||
"propane",
|
|
||||||
"rand",
|
|
||||||
"rape-oil",
|
|
||||||
"rapeseed",
|
|
||||||
"reserves",
|
|
||||||
"retail",
|
|
||||||
"rice",
|
|
||||||
"rubber",
|
|
||||||
"rye",
|
|
||||||
"ship",
|
|
||||||
"silver",
|
|
||||||
"sorghum",
|
|
||||||
"soy-meal",
|
|
||||||
"soy-oil",
|
|
||||||
"soybean",
|
|
||||||
"strategic-metal",
|
|
||||||
"sugar",
|
|
||||||
"sun-meal",
|
|
||||||
"sun-oil",
|
|
||||||
"sunseed",
|
|
||||||
"tea",
|
|
||||||
"tin",
|
|
||||||
"trade",
|
|
||||||
"veg-oil",
|
|
||||||
"wheat",
|
|
||||||
"wpi",
|
|
||||||
"yen",
|
|
||||||
"zinc"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"pipeline":[
|
|
||||||
"textcat_multilabel"
|
|
||||||
],
|
|
||||||
"components":[
|
|
||||||
"textcat_multilabel"
|
|
||||||
],
|
|
||||||
"disabled":[
|
|
||||||
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -1,95 +0,0 @@
|
|||||||
{
|
|
||||||
"labels":[
|
|
||||||
"acq",
|
|
||||||
"alum",
|
|
||||||
"barley",
|
|
||||||
"bop",
|
|
||||||
"carcass",
|
|
||||||
"castor-oil",
|
|
||||||
"cocoa",
|
|
||||||
"coconut",
|
|
||||||
"coconut-oil",
|
|
||||||
"coffee",
|
|
||||||
"copper",
|
|
||||||
"copra-cake",
|
|
||||||
"corn",
|
|
||||||
"cotton",
|
|
||||||
"cotton-oil",
|
|
||||||
"cpi",
|
|
||||||
"cpu",
|
|
||||||
"crude",
|
|
||||||
"dfl",
|
|
||||||
"dlr",
|
|
||||||
"dmk",
|
|
||||||
"earn",
|
|
||||||
"fuel",
|
|
||||||
"gas",
|
|
||||||
"gnp",
|
|
||||||
"gold",
|
|
||||||
"grain",
|
|
||||||
"groundnut",
|
|
||||||
"groundnut-oil",
|
|
||||||
"heat",
|
|
||||||
"hog",
|
|
||||||
"housing",
|
|
||||||
"income",
|
|
||||||
"instal-debt",
|
|
||||||
"interest",
|
|
||||||
"ipi",
|
|
||||||
"iron-steel",
|
|
||||||
"jet",
|
|
||||||
"jobs",
|
|
||||||
"l-cattle",
|
|
||||||
"lead",
|
|
||||||
"lei",
|
|
||||||
"lin-oil",
|
|
||||||
"livestock",
|
|
||||||
"lumber",
|
|
||||||
"meal-feed",
|
|
||||||
"money-fx",
|
|
||||||
"money-supply",
|
|
||||||
"naphtha",
|
|
||||||
"nat-gas",
|
|
||||||
"nickel",
|
|
||||||
"nkr",
|
|
||||||
"nzdlr",
|
|
||||||
"oat",
|
|
||||||
"oilseed",
|
|
||||||
"orange",
|
|
||||||
"palladium",
|
|
||||||
"palm-oil",
|
|
||||||
"palmkernel",
|
|
||||||
"pet-chem",
|
|
||||||
"platinum",
|
|
||||||
"potato",
|
|
||||||
"propane",
|
|
||||||
"rand",
|
|
||||||
"rape-oil",
|
|
||||||
"rapeseed",
|
|
||||||
"reserves",
|
|
||||||
"retail",
|
|
||||||
"rice",
|
|
||||||
"rubber",
|
|
||||||
"rye",
|
|
||||||
"ship",
|
|
||||||
"silver",
|
|
||||||
"sorghum",
|
|
||||||
"soy-meal",
|
|
||||||
"soy-oil",
|
|
||||||
"soybean",
|
|
||||||
"strategic-metal",
|
|
||||||
"sugar",
|
|
||||||
"sun-meal",
|
|
||||||
"sun-oil",
|
|
||||||
"sunseed",
|
|
||||||
"tea",
|
|
||||||
"tin",
|
|
||||||
"trade",
|
|
||||||
"veg-oil",
|
|
||||||
"wheat",
|
|
||||||
"wpi",
|
|
||||||
"yen",
|
|
||||||
"zinc"
|
|
||||||
],
|
|
||||||
"threshold":0.5
|
|
||||||
}
|
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
|
|||||||
<EFBFBD>
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
<EFBFBD>
|
|
||||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -1,3 +0,0 @@
|
|||||||
{
|
|
||||||
"mode":"default"
|
|
||||||
}
|
|
||||||
@@ -14,7 +14,6 @@ requests==2.31.0
|
|||||||
rich==13.7.1
|
rich==13.7.1
|
||||||
scikit-learn==1.4.2
|
scikit-learn==1.4.2
|
||||||
selenium==4.20.0
|
selenium==4.20.0
|
||||||
spacy==3.7.4
|
|
||||||
uvicorn==0.29.0
|
uvicorn==0.29.0
|
||||||
transformers==4.40.2
|
transformers==4.40.2
|
||||||
chromedriver-autoinstaller==0.6.4
|
chromedriver-autoinstaller==0.6.4
|
||||||
|
|||||||
@@ -1,35 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
def install_crawl4ai():
|
|
||||||
print("Installing Crawl4AI and its dependencies...")
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
!pip install -U 'spacy[cuda12x]'
|
|
||||||
!apt-get update -y
|
|
||||||
!apt install chromium-chromedriver -y
|
|
||||||
!pip install chromedriver_autoinstaller
|
|
||||||
!pip install git+https://github.com/unclecode/crawl4ai.git@new-release-0.0.2
|
|
||||||
|
|
||||||
# Install ChromeDriver
|
|
||||||
import chromedriver_autoinstaller
|
|
||||||
chromedriver_autoinstaller.install()
|
|
||||||
|
|
||||||
# Download the reuters model
|
|
||||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
|
||||||
branch = "new-release-0.0.2"
|
|
||||||
folder_path = "models/reuters"
|
|
||||||
|
|
||||||
!git clone -b {branch} {repo_url}
|
|
||||||
!mkdir -p models
|
|
||||||
|
|
||||||
repo_folder = "crawl4ai"
|
|
||||||
source_folder = os.path.join(repo_folder, folder_path)
|
|
||||||
destination_folder = "models"
|
|
||||||
|
|
||||||
!mv "{source_folder}" "{destination_folder}"
|
|
||||||
!rm -rf "{repo_folder}"
|
|
||||||
|
|
||||||
print("Installation and model download completed successfully!")
|
|
||||||
|
|
||||||
# Run the installer
|
|
||||||
install_crawl4ai()
|
|
||||||
Reference in New Issue
Block a user