chore: Add verbose option to ExtractionStrategy classes
This commit adds a new `verbose` option to the `ExtractionStrategy` classes. The `verbose` option allows for logging of extraction details, such as the number of extracted blocks and the URL being processed. This improves the debugging and monitoring capabilities of the code.
This commit is contained in:
@@ -19,6 +19,7 @@ class ExtractionStrategy(ABC):
|
||||
def __init__(self, **kwargs):
    """Initialize shared extraction state common to all strategies.

    Args:
        **kwargs: Optional settings. Recognized key: ``verbose`` (bool,
            default False) — when truthy, subclasses emit diagnostic logs.
    """
    # Diagnostic logging flag; defaults to off when not supplied.
    self.verbose = kwargs.get("verbose", False)
    # Human-readable strategy name, derived from the concrete subclass.
    self.name = type(self).__name__
    # Sentinel token used to mark deleted/boundary content in extracted blocks.
    self.DEL = "<|DEL|>"
|
||||
@abstractmethod
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
@@ -65,6 +66,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.provider = provider
|
||||
self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
|
||||
self.instruction = instruction
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
if not self.api_token:
|
||||
raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
|
||||
@@ -105,6 +107,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"content": unparsed
|
||||
})
|
||||
|
||||
if self.verbose:
|
||||
print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
|
||||
return blocks
|
||||
|
||||
@@ -171,6 +174,7 @@ class CosineStrategy(ExtractionStrategy):
|
||||
self.linkage_method = linkage_method
|
||||
self.top_k = top_k
|
||||
self.timer = time.time()
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
self.buffer_embeddings = np.array([])
|
||||
|
||||
@@ -180,6 +184,8 @@ class CosineStrategy(ExtractionStrategy):
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
|
||||
self.nlp = load_text_multilabel_classifier()
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
|
||||
|
||||
@@ -15,15 +15,15 @@ def get_home_folder():
|
||||
@lru_cache()
def load_bert_base_uncased():
    """Load and cache the ``bert-base-uncased`` tokenizer and model.

    Returns:
        tuple: ``(BertTokenizer, BertModel)``. The ``lru_cache`` decorator
        memoizes the pair, so the weights are loaded from disk/network only
        once per process.
    """
    # Local import keeps the heavy transformers dependency off module import.
    from transformers import BertTokenizer, BertModel

    # Removed dead duplicate assignments: the earlier tokenizer/model pair
    # (without resume_download) was immediately overwritten and never used.
    # NOTE(review): resume_download=None presumably defers to the library's
    # default resume behavior for partial downloads — confirm against the
    # transformers version pinned by this project.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
    return tokenizer, model
||||
@lru_cache()
def load_bge_small_en_v1_5():
    """Load and cache the ``BAAI/bge-small-en-v1.5`` embedding tokenizer/model.

    Returns:
        tuple: ``(AutoTokenizer, AutoModel)`` with the model switched to
        eval mode (inference only, no dropout). ``lru_cache`` memoizes the
        pair so the weights load once per process.
    """
    # Local import keeps the heavy transformers dependency off module import.
    from transformers import AutoTokenizer, AutoModel

    # Removed dead duplicate assignments: the earlier tokenizer/model pair
    # (without resume_download) was immediately overwritten and never used.
    # NOTE(review): resume_download=None presumably defers to the library's
    # default resume behavior for partial downloads — confirm against the
    # transformers version pinned by this project.
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model.eval()
    return tokenizer, model
|
||||
|
||||
@@ -86,6 +86,7 @@ class WebCrawler:
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
|
||||
Reference in New Issue
Block a user