fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter
This commit is contained in:
@@ -472,7 +472,9 @@ class AsyncWebCrawler:
|
||||
try:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
t1 = time.perf_counter()
|
||||
- scrapping_strategy = WebScrapingStrategy()
|
||||
+ scrapping_strategy = WebScrapingStrategy(
|
||||
+ logger=self.logger,
|
||||
+ )
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
|
||||
@@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
'link_text_len': link_text_len
|
||||
}
|
||||
|
||||
- score = self._compute_composite_score(metrics)
|
||||
+ score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
|
||||
|
||||
if self.threshold_type == 'fixed':
|
||||
should_remove = score < self.threshold
|
||||
|
||||
Reference in New Issue
Block a user