Improve libraries import

Author: unclecode
Date:   2024-05-13 02:46:35 +08:00
Parent: 11393183f7
Commit: 5fea6c064b
5 changed files with 231 additions and 125 deletions

View File

@@ -1,9 +1,9 @@
 from abc import ABC, abstractmethod
 import re
-import spacy
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, TextTilingTokenizer
+# spacy = lazy_import.lazy_module('spacy')
+# nl = lazy_import.lazy_module('nltk')
+# from nltk.corpus import stopwords
+# from nltk.tokenize import word_tokenize, TextTilingTokenizer
 from collections import Counter
 import string

@@ -34,8 +34,10 @@ class RegexChunking(ChunkingStrategy):
         return paragraphs

 # NLP-based sentence chunking using spaCy
 class NlpSentenceChunking(ChunkingStrategy):
     def __init__(self, model='en_core_web_sm'):
+        import spacy
         self.nlp = spacy.load(model)

     def chunk(self, text: str) -> list:

@@ -44,8 +46,10 @@ class NlpSentenceChunking(ChunkingStrategy):
 # Topic-based segmentation using TextTiling
 class TopicSegmentationChunking(ChunkingStrategy):
     def __init__(self, num_keywords=3):
-        self.tokenizer = TextTilingTokenizer()
+        import nltk as nl
+        self.tokenizer = nl.toknize.TextTilingTokenizer()
         self.num_keywords = num_keywords

     def chunk(self, text: str) -> list:

@@ -55,8 +59,9 @@ class TopicSegmentationChunking(ChunkingStrategy):
     def extract_keywords(self, text: str) -> list:
         # Tokenize and remove stopwords and punctuation
-        tokens = word_tokenize(text)
-        tokens = [token.lower() for token in tokens if token not in stopwords.words('english') and token not in string.punctuation]
+        import nltk as nl
+        tokens = nl.toknize.word_tokenize(text)
+        tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation]

         # Calculate frequency distribution
         freq_dist = Counter(tokens)
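The change above removes the module-level spaCy/NLTK imports (the lazy_module lines are left commented out for reference) and defers each heavy import into the constructor or method of the strategy that needs it, so importing the package stays cheap for callers that never use those strategies. A minimal sketch of the deferred-import pattern, assuming spaCy and the en_core_web_sm model are installed; the chunk body here is illustrative, the diff does not show it:

from abc import ABC, abstractmethod

class ChunkingStrategy(ABC):
    @abstractmethod
    def chunk(self, text: str) -> list:
        ...

class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, model='en_core_web_sm'):
        # Deferred import: spaCy is loaded only when this strategy is built,
        # not when the package itself is imported.
        import spacy
        self.nlp = spacy.load(model)

    def chunk(self, text: str) -> list:
        # Illustrative body: return individual sentence strings.
        doc = self.nlp(text)
        return [sent.text.strip() for sent in doc.sents]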

View File

@@ -1,20 +1,12 @@
 from abc import ABC, abstractmethod
 from typing import Any, List, Dict, Optional, Union
-from scipy.cluster.hierarchy import linkage, fcluster
-from scipy.spatial.distance import pdist
-from transformers import BertTokenizer, BertModel, pipeline
-from transformers import AutoTokenizer, AutoModel
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import nltk
-from nltk.tokenize import TextTilingTokenizer
 import json, time
-import torch
-import spacy
 # from optimum.intel import IPEXModel
 from .prompts import PROMPT_EXTRACT_BLOCKS
 from .config import *
 from .utils import *
+from functools import partial

 class ExtractionStrategy(ABC):
     """
@@ -50,23 +42,32 @@ class ExtractionStrategy(ABC):
                 parsed_json.extend(future.result())

         return parsed_json

+class NoExtractionStrategy(ExtractionStrategy):
+    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        return [{"index": 0, "content": html}]
+
+    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
+
 class LLMExtractionStrategy(ExtractionStrategy):
     def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None):
         """
         Initialize the strategy with clustering parameters.

         :param word_count_threshold: Minimum number of words per cluster.
         :param max_dist: The maximum cophenetic distance on the dendrogram to form clusters.
         :param linkage_method: The linkage method for hierarchical clustering.
         """
         super().__init__()
         self.provider = provider
-        self.api_token = api_token
+        self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
+        if not self.api_token:
+            raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")

-    def extract(self, url: str, html: str, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
-        api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
+    def extract(self, url: str, html: str) -> List[Dict[str, Any]]:
+        print("Extracting blocks ...")
         variable_values = {
             "URL": url,
             "HTML": escape_json_string(sanitize_html(html)),

@@ -78,7 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
                 "{" + variable + "}", variable_values[variable]
             )

-        response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
+        response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token)
         try:
             blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
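With the token resolved once in __init__, the lookup order is: explicit argument, then the provider's entry in PROVIDER_MODELS from config.py, then the OPENAI_API_KEY environment variable, and a missing token now fails fast with a ValueError instead of surfacing later inside the completion call. A standalone sketch of that resolution logic; PROVIDER_MODELS here is a stand-in dict, not the real config:

import os
from typing import Optional

# Stand-in for the PROVIDER_MODELS mapping defined in config.py.
PROVIDER_MODELS = {"openai/gpt-4o": None}

def resolve_api_token(provider: str, api_token: Optional[str]) -> str:
    # Explicit argument wins, then the config mapping, then the environment.
    token = api_token or PROVIDER_MODELS.get(provider) or os.getenv("OPENAI_API_KEY")
    if not token:
        raise ValueError("API token must be provided. Update config.py or set OPENAI_API_KEY.")
    return token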
@@ -96,28 +97,54 @@ class LLMExtractionStrategy(ExtractionStrategy):
                     "tags": ["error"],
                     "content": unparsed
                 })
+        print("Extracted", len(blocks), "blocks.")
         return blocks

-    def run(self, url: str, sections: List[str], provider: str, api_token: Optional[str]) -> List[Dict[str, Any]]:
+    def _merge(self, documents):
+        chunks = []
+        sections = []
+        total_token_so_far = 0
+
+        for document in documents:
+            if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
+                chunk = document.split(' ')
+                total_token_so_far += len(chunk) * 1.3
+                chunks.append(document)
+            else:
+                sections.append('\n\n'.join(chunks))
+                chunks = [document]
+                total_token_so_far = len(document.split(' ')) * 1.3
+
+        if chunks:
+            sections.append('\n\n'.join(chunks))
+
+        return sections
+
+    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
         """
         Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
         """
+        merged_sections = self._merge(sections)
         parsed_json = []
-        if provider.startswith("groq/"):
+        if self.provider.startswith("groq/"):
             # Sequential processing with a delay
-            for section in sections:
-                parsed_json.extend(self.extract(url, section, provider, api_token))
+            for section in merged_sections:
+                parsed_json.extend(self.extract(url, section))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
-            with ThreadPoolExecutor() as executor:
-                futures = [executor.submit(self.extract, url, section, provider, api_token) for section in sections]
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                extract_func = partial(self.extract, url)
+                futures = [executor.submit(extract_func, section) for section in merged_sections]
                 for future in as_completed(futures):
                     parsed_json.extend(future.result())

         return parsed_json
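The new _merge helper packs consecutive sections together until a rough token budget (word count × 1.3) crosses CHUNK_TOKEN_THRESHOLD, which cuts the number of LLM calls, and run then fans the merged sections out either sequentially for groq/ providers or through a bounded thread pool via functools.partial. A self-contained sketch of the merging heuristic, with an illustrative threshold value:

CHUNK_TOKEN_THRESHOLD = 1000  # illustrative; the real value lives in config.py

def merge_sections(documents, token_threshold=CHUNK_TOKEN_THRESHOLD):
    # Group consecutive documents until the estimated token count passes the threshold.
    sections, chunks, total_tokens = [], [], 0
    for document in documents:
        if total_tokens < token_threshold:
            # ~1.3 tokens per whitespace-separated word is used as a cheap estimate.
            total_tokens += len(document.split(' ')) * 1.3
            chunks.append(document)
        else:
            sections.append('\n\n'.join(chunks))
            chunks = [document]
            total_tokens = len(document.split(' ')) * 1.3
    if chunks:
        sections.append('\n\n'.join(chunks))
    return sections

# Example: three short pieces stay under the threshold and merge into one section.
print(merge_sections(["first part", "second part", "third part"]))

Because the check happens before a document is added, a merged section can overshoot the threshold by roughly one document; that is the same trade-off the committed code makes.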
-class HierarchicalClusteringStrategy(ExtractionStrategy):
+class CosinegStrategy(ExtractionStrategy):
     def __init__(self, word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5'):
         """
         Initialize the strategy with clustering parameters.

@@ -128,6 +155,10 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
         :param top_k: Number of top categories to extract.
         """
         super().__init__()
+        from transformers import BertTokenizer, BertModel, pipeline
+        from transformers import AutoTokenizer, AutoModel
+        import spacy
+
         self.word_count_threshold = word_count_threshold
         self.max_dist = max_dist
         self.linkage_method = linkage_method
@@ -156,6 +187,7 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
         :param sentences: List of text chunks (sentences).
         :return: NumPy array of embeddings.
         """
+        import torch
         # Tokenize sentences and convert to tensor
         encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

         # Compute token embeddings

@@ -174,9 +206,11 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
         :return: NumPy array of cluster labels.
         """
         # Get embeddings
+        from scipy.cluster.hierarchy import linkage, fcluster
+        from scipy.spatial.distance import pdist
         self.timer = time.time()
         embeddings = self.get_embeddings(sentences)
-        print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
+        # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")

         # Compute pairwise cosine distances
         distance_matrix = pdist(embeddings, 'cosine')

         # Perform agglomerative clustering respecting order

@@ -219,7 +253,7 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
         # Perform clustering
         labels = self.hierarchical_clustering(text_chunks)
-        print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
+        # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")

         # Organize texts by their cluster labels, retaining order
         t = time.time()

@@ -240,7 +274,7 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
             top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
             cluster['tags'] = [cat for cat, _ in top_categories]
-        print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
+        # print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

         return cluster_list
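The clustering pipeline itself is unchanged: pairwise cosine distances over the sentence embeddings, agglomerative linkage, then flat clusters cut at max_dist; only the SciPy import moved inside the method and the per-step log lines were commented out. A small sketch of that pipeline on random embeddings, assuming numpy and scipy are available:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

def cluster_embeddings(embeddings, max_dist=0.2, linkage_method='ward'):
    # Pairwise cosine distances between embedding vectors (condensed form).
    distance_matrix = pdist(embeddings, 'cosine')
    # Agglomerative clustering over the condensed distance matrix.
    linked = linkage(distance_matrix, method=linkage_method)
    # Cut the dendrogram at max_dist to get flat cluster labels.
    return fcluster(linked, max_dist, criterion='distance')

labels = cluster_embeddings(np.random.rand(10, 384))
print(labels)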
@@ -265,9 +299,10 @@ class TopicExtractionStrategy(ExtractionStrategy):
         :param num_keywords: Number of keywords to represent each topic segment.
         """
+        import nltk
         super().__init__()
         self.num_keywords = num_keywords
-        self.tokenizer = TextTilingTokenizer()
+        self.tokenizer = nltk.TextTilingTokenizer()

     def extract_keywords(self, text: str) -> List[str]:
         """

@@ -276,6 +311,7 @@ class TopicExtractionStrategy(ExtractionStrategy):
         :param text: The text segment from which to extract keywords.
         :return: A list of keyword strings.
         """
+        import nltk
         # Tokenize the text and compute word frequency
         words = nltk.word_tokenize(text)
         freq_dist = nltk.FreqDist(words)
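TopicExtractionStrategy keeps the TextTiling-plus-frequency approach; NLTK is simply imported where it is used. A short sketch of the keyword step, assuming the punkt tokenizer data has been downloaded; the isalpha filter is illustrative and not part of the committed code:

def extract_keywords(text, num_keywords=3):
    # Deferred import keeps NLTK off the import path until keywords are needed.
    import nltk
    words = nltk.word_tokenize(text)
    freq_dist = nltk.FreqDist(w.lower() for w in words if w.isalpha())
    # The most frequent tokens stand in for the segment's topic keywords.
    return [word for word, _ in freq_dist.most_common(num_keywords)]

print(extract_keywords("Crawlers crawl pages and crawlers index pages."))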

View File

@@ -3,15 +3,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
 import html2text
 import json
+import html
 import re
 import os
-import litellm
-from litellm import completion, batch_completion
+from html2text import HTML2Text
 from .prompts import PROMPT_EXTRACT_BLOCKS
 from .config import *
-import re
-import html
-from html2text import HTML2Text

 def beautify_html(escaped_html):
@@ -303,17 +300,16 @@ def extract_xml_data(tags, string):
     return data

-import time
-import litellm

 # Function to perform the completion with exponential backoff
 def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
+    from litellm import completion
+    from litellm.exceptions import RateLimitError
     max_attempts = 3
     base_delay = 2  # Base delay in seconds, you can adjust this based on your needs

     for attempt in range(max_attempts):
         try:
             response = completion(
                 model=provider,
                 messages=[
                     {"role": "user", "content": prompt_with_variables}

@@ -322,7 +318,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
                 api_key=api_token
             )
             return response  # Return the successful response
-        except litellm.exceptions.RateLimitError as e:
+        except RateLimitError as e:
             print("Rate limit error:", str(e))

             # Check if we have exhausted our max attempts
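litellm is now imported inside perform_completion_with_backoff instead of at module load, and the handler catches RateLimitError directly. A minimal sketch of the retry loop with the deferred import; the exact delay schedule is an assumption, the committed code only shows base_delay = 2:

import time

def perform_completion_with_backoff(provider, prompt, api_token,
                                    max_attempts=3, base_delay=2):
    # Deferred import: litellm is only pulled in when a completion is requested.
    from litellm import completion
    from litellm.exceptions import RateLimitError

    for attempt in range(max_attempts):
        try:
            return completion(
                model=provider,
                messages=[{"role": "user", "content": prompt}],
                api_key=api_token,
            )
        except RateLimitError:
            if attempt == max_attempts - 1:
                raise
            # Back off exponentially between attempts: 2s, then 4s, ...
            time.sleep(base_delay * (2 ** attempt))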
@@ -378,7 +374,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
 def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
     api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
+    from litellm import batch_completion
     messages = []

     for url, html in batch_data:

View File

@@ -9,49 +9,93 @@ from .extraction_strategy import *
 from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
 from .config import *

 class WebCrawler:
-    def __init__(self, db_path: str, crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy()):
+    def __init__(
+        self,
+        db_path: str = None,
+        crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(),
+    ):
         self.db_path = db_path
-        init_db(self.db_path)
         self.crawler_strategy = crawler_strategy

         # Create the .crawl4ai folder in the user's home directory if it doesn't exist
         self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

-    def fetch_page(self,
-                   url_model: UrlModel,
-                   provider: str = DEFAULT_PROVIDER,
-                   api_token: str = None,
-                   extract_blocks_flag: bool = True,
-                   word_count_threshold = MIN_WORD_THRESHOLD,
-                   use_cached_html: bool = False,
-                   extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
-                   chunking_strategy: ChunkingStrategy = RegexChunking(),
-                   **kwargs
-                   ) -> CrawlResult:
+        # If db_path is not provided, use the default path
+        if not db_path:
+            self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
+
+        init_db(self.db_path)
+
+        self.ready = False
+
+    def warmup(self):
+        print("[LOG] 🌤️ Warming up the WebCrawler")
+        single_url = UrlModel(url='https://crawl4ai.uccode.io/', forced=False)
+        result = self.run(
+            single_url,
+            word_count_threshold=5,
+            extraction_strategy= CosinegStrategy(),
+            verbose = False
+        )
+        self.ready = True
+        print("[LOG] 🌞 WebCrawler is ready to crawl")
+
+    def fetch_page(
+        self,
+        url_model: UrlModel,
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        use_cached_html: bool = False,
+        extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> CrawlResult:
+        return self.run(
+            url_model,
+            word_count_threshold,
+            extraction_strategy,
+            chunking_strategy,
+            **kwargs,
+        )
+        pass
+
+    def run(
+        self,
+        url_model: UrlModel,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = NoExtractionStrategy(),
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        verbose=True,
+        **kwargs,
+    ) -> CrawlResult:
         # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
         if word_count_threshold < MIN_WORD_THRESHOLD:
             word_count_threshold = MIN_WORD_THRESHOLD

         # Check cache first
         cached = get_cached_url(self.db_path, str(url_model.url))
         if cached and not url_model.forced:
-            return CrawlResult(**{
-                "url": cached[0],
-                "html": cached[1],
-                "cleaned_html": cached[2],
-                "markdown": cached[3],
-                "parsed_json": cached[4],
-                "success": cached[5],
-                "error_message": ""
-            })
+            return CrawlResult(
+                **{
+                    "url": cached[0],
+                    "html": cached[1],
+                    "cleaned_html": cached[2],
+                    "markdown": cached[3],
+                    "parsed_json": cached[4],
+                    "success": cached[5],
+                    "error_message": "",
+                }
+            )

         # Initialize WebDriver for crawling
         t = time.time()
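db_path is now optional (defaulting to a file under ~/.crawl4ai), warmup() pre-runs a throwaway crawl so model and driver loading does not land on the first real request, and fetch_page becomes a thin wrapper around run. A hypothetical usage sketch; the import paths and URL below are assumptions, not shown in the diff:

# Assumed module paths based on the package layout; adjust if they differ.
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.models import UrlModel
from crawl4ai.extraction_strategy import CosinegStrategy

crawler = WebCrawler()   # db_path defaults to ~/.crawl4ai/crawl4ai.db
crawler.warmup()         # one throwaway crawl; sets crawler.ready = True

result = crawler.run(
    UrlModel(url="https://example.com", forced=False),
    word_count_threshold=10,
    extraction_strategy=CosinegStrategy(),
    verbose=True,
)
print(result.markdown[:200])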
@@ -62,65 +106,89 @@ class WebCrawler:
         except Exception as e:
             html = ""
             success = False
             error_message = str(e)

         # Extract content from HTML
         result = get_content_of_website(html, word_count_threshold)
-        cleaned_html = result.get('cleaned_html', html)
-        markdown = result.get('markdown', "")
+        cleaned_html = result.get("cleaned_html", html)
+        markdown = result.get("markdown", "")

         # Print a profession LOG style message, show time taken and say crawling is done
-        print(f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds")
+        if verbose:
+            print(
+                f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds"
+            )

         parsed_json = []
-        if extract_blocks_flag:
+        if verbose:
             print(f"[LOG] 🔥 Extracting semantic blocks for {url_model.url}")
         t = time.time()
         # Split markdown into sections
         sections = chunking_strategy.chunk(markdown)
         # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
-            parsed_json = extraction_strategy.run(str(url_model.url), sections, provider, api_token)
-            parsed_json = json.dumps(parsed_json)
-            print(f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds.")
+        parsed_json = extraction_strategy.run(
+            str(url_model.url), sections,
+        )
+        parsed_json = json.dumps(parsed_json)

-        else:
-            parsed_json = "{}"
-            print(f"[LOG] 🚀 Skipping extraction for {url_model.url}")
+        if verbose:
+            print(
+                f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds."
+            )

         # Cache the result
         cleaned_html = beautify_html(cleaned_html)
-        cache_url(self.db_path, str(url_model.url), html, cleaned_html, markdown, parsed_json, success)
-
-        return CrawlResult(
-            url=str(url_model.url),
-            html=html,
-            cleaned_html=cleaned_html,
-            markdown=markdown,
-            parsed_json=parsed_json,
-            success=success,
-            error_message=error_message
-        )
+        cache_url(
+            self.db_path,
+            str(url_model.url),
+            html,
+            cleaned_html,
+            markdown,
+            parsed_json,
+            success,
+        )
+
+        return CrawlResult(
+            url=str(url_model.url),
+            html=html,
+            cleaned_html=cleaned_html,
+            markdown=markdown,
+            parsed_json=parsed_json,
+            success=success,
+            error_message=error_message,
+        )

-    def fetch_pages(self, url_models: List[UrlModel], provider: str = DEFAULT_PROVIDER, api_token: str = None,
-                    extract_blocks_flag: bool = True, word_count_threshold=MIN_WORD_THRESHOLD,
-                    use_cached_html: bool = False, extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
-                    chunking_strategy: ChunkingStrategy = RegexChunking(), **kwargs) -> List[CrawlResult]:
+    def fetch_pages(
+        self,
+        url_models: List[UrlModel],
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        use_cached_html: bool = False,
+        extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> List[CrawlResult]:
         def fetch_page_wrapper(url_model, *args, **kwargs):
             return self.fetch_page(url_model, *args, **kwargs)

         with ThreadPoolExecutor() as executor:
-            results = list(executor.map(fetch_page_wrapper, url_models,
-                                        [provider] * len(url_models),
-                                        [api_token] * len(url_models),
-                                        [extract_blocks_flag] * len(url_models),
-                                        [word_count_threshold] * len(url_models),
-                                        [use_cached_html] * len(url_models),
-                                        [extraction_strategy] * len(url_models),
-                                        [chunking_strategy] * len(url_models),
-                                        *[kwargs] * len(url_models)))
+            results = list(
+                executor.map(
+                    fetch_page_wrapper,
+                    url_models,
+                    [provider] * len(url_models),
+                    [api_token] * len(url_models),
+                    [extract_blocks_flag] * len(url_models),
+                    [word_count_threshold] * len(url_models),
+                    [use_cached_html] * len(url_models),
+                    [extraction_strategy] * len(url_models),
+                    [chunking_strategy] * len(url_models),
+                    *[kwargs] * len(url_models),
+                )
+            )

         return results
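fetch_pages keeps the executor.map fan-out, just reformatted: each shared argument is repeated once per URL so map can zip them together with the URL list. A compact illustration of that pattern with a toy worker; the names here are illustrative, not the crawler's API:

from concurrent.futures import ThreadPoolExecutor

def fetch(url, provider, api_token):
    # Toy worker standing in for fetch_page.
    return f"{url} via {provider}"

urls = ["https://a.example", "https://b.example", "https://c.example"]

with ThreadPoolExecutor() as executor:
    # map() consumes its iterables in lockstep, so each shared argument
    # is repeated len(urls) times to pair one copy with every URL.
    results = list(executor.map(
        fetch,
        urls,
        ["openai/gpt-4o"] * len(urls),
        ["sk-..."] * len(urls),
    ))

print(results)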

View File

@@ -12,4 +12,5 @@ html2text
 litellm
 python-dotenv
 nltk
+lazy_import
 # spacy
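lazy_import is added so heavy modules can be bound as proxies that import only on first attribute access, matching the commented-out lazy_module lines in the chunking strategies. A minimal sketch, assuming the lazy-import package from PyPI and that spaCy is installed by the time the proxy is first touched:

import lazy_import

# No import cost is paid here; the name is bound to a lazy module proxy.
spacy = lazy_import.lazy_module("spacy")

def load_pipeline(model="en_core_web_sm"):
    # The real import happens on this first attribute access.
    return spacy.load(model)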