Remove dependency on Spacy model.
This commit is contained in:
@@ -1,12 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import re
|
||||
# spacy = lazy_import.lazy_module('spacy')
|
||||
# nl = lazy_import.lazy_module('nltk')
|
||||
# from nltk.corpus import stopwords
|
||||
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_spacy_en_core_web_sm
|
||||
from .model_loader import load_nltk_punkt
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
@@ -34,15 +30,24 @@ class RegexChunking(ChunkingStrategy):
|
||||
paragraphs = new_paragraphs
|
||||
return paragraphs
|
||||
|
||||
# NLP-based sentence chunking using spaCy
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
    """Chunking strategy that splits text into sentences with NLTK.

    Sentence boundaries come from ``nltk.tokenize.sent_tokenize``; no spaCy
    pipeline is loaded or required.
    """

    def __init__(self):
        # Ensure the Punkt tokenizer data is available before the first
        # chunk() call, so a missing resource surfaces at construction time.
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        """Return the distinct sentences found in ``text``.

        Args:
            text: Raw text to split.

        Returns:
            A list of whitespace-stripped sentences. Repeated sentences are
            reported once, at the position of their first occurrence.
        """
        # Imported lazily so that importing this module does not require NLTK.
        from nltk.tokenize import sent_tokenize

        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        # dict.fromkeys removes duplicates while keeping first-seen order;
        # a plain set() would return the chunks in arbitrary order.
        return list(dict.fromkeys(stripped))
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
@@ -7,7 +7,7 @@ from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTI
|
||||
from .config import *
|
||||
from .utils import *
|
||||
from functools import partial
|
||||
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
from .model_loader import *
|
||||
|
||||
|
||||
import numpy as np
|
||||
@@ -45,14 +45,13 @@ class ExtractionStrategy(ABC):
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
    """Pass-through strategy: content is wrapped in blocks but not analysed."""

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Return ``html`` unchanged as a single block with index 0."""
        whole_page = {"index": 0, "content": html}
        return [whole_page]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Return one untagged block per section, indexed in input order."""
        blocks = []
        for position, text in enumerate(sections):
            blocks.append({"index": position, "tags": [], "content": text})
        return blocks
|
||||
|
||||
|
||||
class LLMExtractionStrategy(ExtractionStrategy):
|
||||
def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
|
||||
"""
|
||||
@@ -166,10 +165,6 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
from transformers import BertTokenizer, BertModel, pipeline
|
||||
from transformers import AutoTokenizer, AutoModel
|
||||
import spacy
|
||||
|
||||
self.semantic_filter = semantic_filter
|
||||
self.word_count_threshold = word_count_threshold
|
||||
self.max_dist = max_dist
|
||||
@@ -184,10 +179,9 @@ class CosineStrategy(ExtractionStrategy):
|
||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
|
||||
self.nlp = load_spacy_model()
|
||||
self.nlp = load_text_multilabel_classifier()
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
||||
"""
|
||||
Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
@@ -310,13 +304,19 @@ class CosineStrategy(ExtractionStrategy):
|
||||
|
||||
# Convert filtered clusters to a sorted list of dictionaries
|
||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
|
||||
for cluster, label in zip(cluster_list, labels):
|
||||
cluster['tags'] = label
|
||||
|
||||
# Process the text with the loaded model
|
||||
for cluster in cluster_list:
|
||||
doc = self.nlp(cluster['content'])
|
||||
tok_k = self.top_k
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
# for cluster in cluster_list:
|
||||
# cluster['tags'] = self.nlp(cluster['content'])[0]['label']
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
|
||||
|
||||
@@ -28,68 +28,66 @@ def load_bge_small_en_v1_5():
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
def load_spacy_en_core_web_sm():
    """Load spaCy's ``en_core_web_sm`` pipeline, downloading it if loading fails.

    The result is memoised by ``lru_cache``, so repeated calls return the
    same pipeline object.
    """
    import spacy

    model_name = "en_core_web_sm"
    try:
        print("[LOG] Loading spaCy model")
        pipeline = spacy.load(model_name)
    except IOError:
        # The first load failed: fetch the package, then retry once.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download(model_name)
        pipeline = spacy.load(model_name)

    print("[LOG] ✅ spaCy model loaded successfully")
    return pipeline
|
||||
def load_text_classifier():
    """Build a ``text-classification`` pipeline for the NYT-news topic checkpoint.

    Returns:
        The ``transformers`` pipeline object wrapping the loaded tokenizer
        and sequence-classification model.
    """
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline

    # Tokenizer and model are loaded from the same checkpoint name.
    checkpoint = "dstefa/roberta-base_topic_classification_nyt_news"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    return pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
import spacy
|
||||
name = "models/reuters"
|
||||
home_folder = get_home_folder()
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
# Check if the model directory already exists
|
||||
if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
# branch = "main"
|
||||
branch = MODEL_REPO_BRANCH
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
def load_text_multilabel_classifier():
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
import numpy as np
|
||||
from scipy.special import expit
|
||||
import torch
|
||||
|
||||
print("[LOG] ⏬ Downloading model for the first time...")
|
||||
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||
class_mapping = model.config.id2label
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
shutil.rmtree(repo_folder)
|
||||
shutil.rmtree(model_folder)
|
||||
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda")
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device("mps")
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
|
||||
try:
|
||||
# Clone the repository
|
||||
subprocess.run(
|
||||
["git", "clone", "-b", branch, repo_url, repo_folder],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
check=True
|
||||
)
|
||||
model.to(device)
|
||||
|
||||
# Create the models directory if it doesn't exist
|
||||
models_folder = os.path.join(home_folder, "models")
|
||||
os.makedirs(models_folder, exist_ok=True)
|
||||
def _classifier(texts, threshold=0.5, max_length=64):
|
||||
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
|
||||
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
|
||||
|
||||
# Copy the reuters model folder to the models directory
|
||||
source_folder = os.path.join(repo_folder, "models/reuters")
|
||||
shutil.copytree(source_folder, model_folder)
|
||||
with torch.no_grad():
|
||||
output = model(**tokens)
|
||||
|
||||
# Remove the cloned repository
|
||||
shutil.rmtree(repo_folder)
|
||||
scores = output.logits.detach().cpu().numpy()
|
||||
scores = expit(scores)
|
||||
predictions = (scores >= threshold) * 1
|
||||
|
||||
# Print completion message
|
||||
print("[LOG] ✅ Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
batch_labels = []
|
||||
for prediction in predictions:
|
||||
labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
|
||||
batch_labels.append(labels)
|
||||
|
||||
return spacy.load(model_folder)
|
||||
return batch_labels
|
||||
|
||||
return _classifier
|
||||
|
||||
@lru_cache()
def load_nltk_punkt():
    """Locate NLTK's Punkt tokenizer data, downloading it when it is absent.

    Returns:
        Whatever ``nltk.data.find('tokenizers/punkt')`` returns for the
        installed resource. The result is memoised by ``lru_cache``.
    """
    import nltk

    resource = 'tokenizers/punkt'
    try:
        nltk.data.find(resource)
    except LookupError:
        # Resource not installed yet: fetch it before the final lookup.
        nltk.download('punkt')
    return nltk.data.find(resource)
|
||||
|
||||
def download_all_models(remove_existing=False):
|
||||
"""Download all models required for Crawl4AI."""
|
||||
@@ -110,10 +108,10 @@ def download_all_models(remove_existing=False):
|
||||
load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
print("[LOG] Downloading spaCy EN Core Web SM...")
|
||||
load_spacy_en_core_web_sm()
|
||||
print("[LOG] Downloading custom spaCy model...")
|
||||
load_spacy_model()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
# Invoke the loader; a bare function name is a no-op expression and loads nothing.
load_text_multilabel_classifier()
|
||||
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||
load_nltk_punkt()
|
||||
print("[LOG] ✅ All models downloaded successfully.")
|
||||
|
||||
def main():
|
||||
|
||||
@@ -3,6 +3,33 @@ from spacy.training import Example
|
||||
import random
|
||||
import nltk
|
||||
from nltk.corpus import reuters
|
||||
import torch
|
||||
|
||||
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    """Export a spaCy text categorizer's weights and vocabulary to ``model_dir``.

    Two files are written:

    * ``model_weights.pth`` - a ``torch.save`` dump of a dict mapping
      parameter names to tensors.
    * ``vocab.txt`` - one ``word<TAB>index`` pair per line.

    Args:
        nlp: Loaded spaCy pipeline that contains a ``textcat_multilabel``
            component.
        model_dir: Directory to write into. Nothing here creates it, so it
            must already exist.
    """
    # Extract the TextCategorizer component
    textcat = nlp.get_pipe("textcat_multilabel")

    # Convert the weights to a PyTorch state dictionary
    # NOTE(review): named_parameters() is a PyTorch-style API; confirm the
    # model object behind this spaCy component actually exposes it.
    state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}

    # Save the state dictionary
    torch.save(state_dict, f"{model_dir}/model_weights.pth")

    # Extract and save the vocabulary
    vocab = extract_vocab(nlp)
    # Explicit UTF-8: vocabulary strings are arbitrary text, and the platform
    # default encoding may be unable to represent some of them.
    with open(f"{model_dir}/vocab.txt", "w", encoding="utf-8") as vocab_file:
        for word, idx in vocab.items():
            vocab_file.write(f"{word}\t{idx}\n")

    print(f"Model weights and vocabulary saved to: {model_dir}")
|
||||
|
||||
def extract_vocab(nlp):
    """Map each string in ``nlp.vocab.strings`` to its position in iteration order."""
    word_to_index = {}
    for position, word in enumerate(nlp.vocab.strings):
        word_to_index[word] = position
    return word_to_index
|
||||
|
||||
# Load the pipeline stored under "models/reuters" and export its weights and
# vocabulary into the "models" directory via save_spacy_model_as_torch.
# NOTE(review): these statements appear to sit at module level, outside any
# __main__ guard, so they would also run on import - confirm that is intended.
nlp = spacy.load("models/reuters")
|
||||
save_spacy_model_as_torch(nlp, model_dir="models")
|
||||
|
||||
def train_and_save_reuters_model(model_dir="models/reuters"):
|
||||
# Ensure the Reuters corpus is downloaded
|
||||
@@ -96,8 +123,6 @@ def train_model(model_dir, additional_epochs=0):
|
||||
nlp.to_disk(model_dir)
|
||||
print(f"Model saved to: {model_dir}")
|
||||
|
||||
|
||||
|
||||
def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||
# Load the trained model from the specified directory
|
||||
nlp = spacy.load(model_dir)
|
||||
@@ -111,7 +136,6 @@ def load_model_and_predict(model_dir, text, tok_k = 3):
|
||||
|
||||
return top_categories
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train_and_save_reuters_model()
|
||||
train_model("models/reuters", additional_epochs=5)
|
||||
@@ -119,4 +143,4 @@ if __name__ == "__main__":
|
||||
print(reuters.categories())
|
||||
example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
|
||||
r =load_model_and_predict(model_directory, example_text)
|
||||
print(r)
|
||||
print(r)
|
||||
@@ -11,7 +11,6 @@ from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .config import *
|
||||
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
|
||||
@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
|
||||
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
# Retrieve raw HTML content
|
||||
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def add_chunking_strategy(crawler):
|
||||
# Adding a chunking strategy: RegexChunking
|
||||
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
||||
@@ -177,9 +171,19 @@ def main():
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
|
||||
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
|
||||
crawler.always_by_pass_cache = True
|
||||
|
||||
|
||||
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
|
||||
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
basic_usage(crawler)
|
||||
understanding_parameters(crawler)
|
||||
add_chunking_strategy(crawler)
|
||||
|
||||
@@ -1,144 +0,0 @@
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
vectors = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
seed = 0
|
||||
gpu_allocator = null
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["textcat_multilabel"]
|
||||
disabled = []
|
||||
before_creation = null
|
||||
after_creation = null
|
||||
after_pipeline_creation = null
|
||||
batch_size = 1000
|
||||
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||
vectors = {"@vectors":"spacy.Vectors.v1"}
|
||||
|
||||
[components]
|
||||
|
||||
[components.textcat_multilabel]
|
||||
factory = "textcat_multilabel"
|
||||
scorer = {"@scorers":"spacy.textcat_multilabel_scorer.v2"}
|
||||
threshold = 0.5
|
||||
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v2"
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = 64
|
||||
rows = [2000,2000,500,1000,500]
|
||||
attrs = ["NORM","LOWER","PREFIX","SUFFIX","SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = 64
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 2
|
||||
|
||||
[corpora]
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
gold_preproc = false
|
||||
max_length = 0
|
||||
limit = 0
|
||||
augmenter = null
|
||||
|
||||
[corpora.train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
gold_preproc = false
|
||||
max_length = 0
|
||||
limit = 0
|
||||
augmenter = null
|
||||
|
||||
[training]
|
||||
seed = ${system.seed}
|
||||
gpu_allocator = ${system.gpu_allocator}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
frozen_components = []
|
||||
annotating_components = []
|
||||
dev_corpus = "corpora.dev"
|
||||
train_corpus = "corpora.train"
|
||||
before_to_disk = null
|
||||
before_update = null
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
get_length = null
|
||||
|
||||
[training.batcher.size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
t = 0.0
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
progress_bar = false
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 0.00000001
|
||||
learn_rate = 0.001
|
||||
|
||||
[training.score_weights]
|
||||
cats_score = 1.0
|
||||
cats_score_desc = null
|
||||
cats_micro_p = null
|
||||
cats_micro_r = null
|
||||
cats_micro_f = null
|
||||
cats_macro_p = null
|
||||
cats_macro_r = null
|
||||
cats_macro_f = null
|
||||
cats_macro_auc = null
|
||||
cats_f_per_type = null
|
||||
|
||||
[pretraining]
|
||||
|
||||
[initialize]
|
||||
vectors = ${paths.vectors}
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
vocab_data = null
|
||||
lookups = null
|
||||
before_init = null
|
||||
after_init = null
|
||||
|
||||
[initialize.components]
|
||||
|
||||
[initialize.tokenizer]
|
||||
@@ -1,122 +0,0 @@
|
||||
{
|
||||
"lang":"en",
|
||||
"name":"pipeline",
|
||||
"version":"0.0.0",
|
||||
"spacy_version":">=3.7.4,<3.8.0",
|
||||
"description":"",
|
||||
"author":"",
|
||||
"email":"",
|
||||
"url":"",
|
||||
"license":"",
|
||||
"spacy_git_version":"bff8725f4",
|
||||
"vectors":{
|
||||
"width":0,
|
||||
"vectors":0,
|
||||
"keys":0,
|
||||
"name":null,
|
||||
"mode":"default"
|
||||
},
|
||||
"labels":{
|
||||
"textcat_multilabel":[
|
||||
"acq",
|
||||
"alum",
|
||||
"barley",
|
||||
"bop",
|
||||
"carcass",
|
||||
"castor-oil",
|
||||
"cocoa",
|
||||
"coconut",
|
||||
"coconut-oil",
|
||||
"coffee",
|
||||
"copper",
|
||||
"copra-cake",
|
||||
"corn",
|
||||
"cotton",
|
||||
"cotton-oil",
|
||||
"cpi",
|
||||
"cpu",
|
||||
"crude",
|
||||
"dfl",
|
||||
"dlr",
|
||||
"dmk",
|
||||
"earn",
|
||||
"fuel",
|
||||
"gas",
|
||||
"gnp",
|
||||
"gold",
|
||||
"grain",
|
||||
"groundnut",
|
||||
"groundnut-oil",
|
||||
"heat",
|
||||
"hog",
|
||||
"housing",
|
||||
"income",
|
||||
"instal-debt",
|
||||
"interest",
|
||||
"ipi",
|
||||
"iron-steel",
|
||||
"jet",
|
||||
"jobs",
|
||||
"l-cattle",
|
||||
"lead",
|
||||
"lei",
|
||||
"lin-oil",
|
||||
"livestock",
|
||||
"lumber",
|
||||
"meal-feed",
|
||||
"money-fx",
|
||||
"money-supply",
|
||||
"naphtha",
|
||||
"nat-gas",
|
||||
"nickel",
|
||||
"nkr",
|
||||
"nzdlr",
|
||||
"oat",
|
||||
"oilseed",
|
||||
"orange",
|
||||
"palladium",
|
||||
"palm-oil",
|
||||
"palmkernel",
|
||||
"pet-chem",
|
||||
"platinum",
|
||||
"potato",
|
||||
"propane",
|
||||
"rand",
|
||||
"rape-oil",
|
||||
"rapeseed",
|
||||
"reserves",
|
||||
"retail",
|
||||
"rice",
|
||||
"rubber",
|
||||
"rye",
|
||||
"ship",
|
||||
"silver",
|
||||
"sorghum",
|
||||
"soy-meal",
|
||||
"soy-oil",
|
||||
"soybean",
|
||||
"strategic-metal",
|
||||
"sugar",
|
||||
"sun-meal",
|
||||
"sun-oil",
|
||||
"sunseed",
|
||||
"tea",
|
||||
"tin",
|
||||
"trade",
|
||||
"veg-oil",
|
||||
"wheat",
|
||||
"wpi",
|
||||
"yen",
|
||||
"zinc"
|
||||
]
|
||||
},
|
||||
"pipeline":[
|
||||
"textcat_multilabel"
|
||||
],
|
||||
"components":[
|
||||
"textcat_multilabel"
|
||||
],
|
||||
"disabled":[
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,95 +0,0 @@
|
||||
{
|
||||
"labels":[
|
||||
"acq",
|
||||
"alum",
|
||||
"barley",
|
||||
"bop",
|
||||
"carcass",
|
||||
"castor-oil",
|
||||
"cocoa",
|
||||
"coconut",
|
||||
"coconut-oil",
|
||||
"coffee",
|
||||
"copper",
|
||||
"copra-cake",
|
||||
"corn",
|
||||
"cotton",
|
||||
"cotton-oil",
|
||||
"cpi",
|
||||
"cpu",
|
||||
"crude",
|
||||
"dfl",
|
||||
"dlr",
|
||||
"dmk",
|
||||
"earn",
|
||||
"fuel",
|
||||
"gas",
|
||||
"gnp",
|
||||
"gold",
|
||||
"grain",
|
||||
"groundnut",
|
||||
"groundnut-oil",
|
||||
"heat",
|
||||
"hog",
|
||||
"housing",
|
||||
"income",
|
||||
"instal-debt",
|
||||
"interest",
|
||||
"ipi",
|
||||
"iron-steel",
|
||||
"jet",
|
||||
"jobs",
|
||||
"l-cattle",
|
||||
"lead",
|
||||
"lei",
|
||||
"lin-oil",
|
||||
"livestock",
|
||||
"lumber",
|
||||
"meal-feed",
|
||||
"money-fx",
|
||||
"money-supply",
|
||||
"naphtha",
|
||||
"nat-gas",
|
||||
"nickel",
|
||||
"nkr",
|
||||
"nzdlr",
|
||||
"oat",
|
||||
"oilseed",
|
||||
"orange",
|
||||
"palladium",
|
||||
"palm-oil",
|
||||
"palmkernel",
|
||||
"pet-chem",
|
||||
"platinum",
|
||||
"potato",
|
||||
"propane",
|
||||
"rand",
|
||||
"rape-oil",
|
||||
"rapeseed",
|
||||
"reserves",
|
||||
"retail",
|
||||
"rice",
|
||||
"rubber",
|
||||
"rye",
|
||||
"ship",
|
||||
"silver",
|
||||
"sorghum",
|
||||
"soy-meal",
|
||||
"soy-oil",
|
||||
"soybean",
|
||||
"strategic-metal",
|
||||
"sugar",
|
||||
"sun-meal",
|
||||
"sun-oil",
|
||||
"sunseed",
|
||||
"tea",
|
||||
"tin",
|
||||
"trade",
|
||||
"veg-oil",
|
||||
"wheat",
|
||||
"wpi",
|
||||
"yen",
|
||||
"zinc"
|
||||
],
|
||||
"threshold":0.5
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
|
||||
<EFBFBD>
|
||||
@@ -1 +0,0 @@
|
||||
<EFBFBD>
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -1,3 +0,0 @@
|
||||
{
|
||||
"mode":"default"
|
||||
}
|
||||
@@ -14,7 +14,6 @@ requests==2.31.0
|
||||
rich==13.7.1
|
||||
scikit-learn==1.4.2
|
||||
selenium==4.20.0
|
||||
spacy==3.7.4
|
||||
uvicorn==0.29.0
|
||||
transformers==4.40.2
|
||||
chromedriver-autoinstaller==0.6.4
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
import os
|
||||
|
||||
# Install Crawl4AI, its system/Python dependencies, and the reuters model.
# The lines starting with "!" are shell escapes, so this function is only
# valid inside an IPython/Jupyter-style notebook, not in plain Python.
# It takes no arguments and returns nothing; progress is reported via print().
def install_crawl4ai():
|
||||
print("Installing Crawl4AI and its dependencies...")
|
||||
|
||||
# Install dependencies
|
||||
!pip install -U 'spacy[cuda12x]'
|
||||
!apt-get update -y
|
||||
!apt install chromium-chromedriver -y
|
||||
!pip install chromedriver_autoinstaller
|
||||
!pip install git+https://github.com/unclecode/crawl4ai.git@new-release-0.0.2
|
||||
|
||||
# Install ChromeDriver
|
||||
# Imported here, after the pip install above has made the package available.
|
||||
import chromedriver_autoinstaller
|
||||
chromedriver_autoinstaller.install()
|
||||
|
||||
# Download the reuters model
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
branch = "new-release-0.0.2"
|
||||
folder_path = "models/reuters"
|
||||
|
||||
# Clone the repository at the chosen branch and make sure ./models exists.
|
||||
!git clone -b {branch} {repo_url}
|
||||
!mkdir -p models
|
||||
|
||||
# "crawl4ai" is the directory name git derives from repo_url when cloning.
|
||||
repo_folder = "crawl4ai"
|
||||
source_folder = os.path.join(repo_folder, folder_path)
|
||||
destination_folder = "models"
|
||||
|
||||
# Move the model folder out of the clone into ./models, then delete the clone.
|
||||
!mv "{source_folder}" "{destination_folder}"
|
||||
!rm -rf "{repo_folder}"
|
||||
|
||||
print("Installation and model download completed successfully!")
|
||||
|
||||
# Run the installer
|
||||
install_crawl4ai()
|
||||
Reference in New Issue
Block a user