chore: Add verbose option to ExtractionStrategy classes

This commit adds a `verbose` option to the `ExtractionStrategy` classes. When `verbose` is enabled, the strategies print log messages with extraction details, such as the number of extracted blocks and the URL being processed; otherwise these messages are suppressed. This improves the debugging and monitoring capabilities of the code.
This commit is contained in:
unclecode
2024-05-17 18:06:10 +08:00
parent 32c87f0388
commit 36e46be23d
3 changed files with 14 additions and 7 deletions

View File

@@ -19,6 +19,7 @@ class ExtractionStrategy(ABC):
def __init__(self, **kwargs):
self.DEL = "<|DEL|>"
self.name = self.__class__.__name__
self.verbose = kwargs.get("verbose", False)
@abstractmethod
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
@@ -61,10 +62,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
:param api_token: The API token for the provider.
:param instruction: The instruction to use for the LLM model.
"""
super().__init__()
super().__init__()
self.provider = provider
self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
self.instruction = instruction
self.verbose = kwargs.get("verbose", False)
if not self.api_token:
raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
@@ -105,7 +107,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
"content": unparsed
})
print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
if self.verbose:
print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
return blocks
def _merge(self, documents):
@@ -171,6 +174,7 @@ class CosineStrategy(ExtractionStrategy):
self.linkage_method = linkage_method
self.top_k = top_k
self.timer = time.time()
self.verbose = kwargs.get("verbose", False)
self.buffer_embeddings = np.array([])
@@ -180,7 +184,9 @@ class CosineStrategy(ExtractionStrategy):
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.nlp = load_text_multilabel_classifier()
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
if self.verbose:
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
"""

View File

@@ -15,15 +15,15 @@ def get_home_folder():
@lru_cache()
def load_bert_base_uncased():
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #, resume_download=None)
model = BertModel.from_pretrained('bert-base-uncased') #, resume_download=None)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
return tokenizer, model
@lru_cache()
def load_bge_small_en_v1_5():
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5') #, resume_download=None)
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5') #, resume_download=None)
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
model.eval()
return tokenizer, model

View File

@@ -86,6 +86,7 @@ class WebCrawler:
**kwargs,
) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")