fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter
This commit is contained in:
@@ -472,7 +472,9 @@ class AsyncWebCrawler:
|
|||||||
try:
|
try:
|
||||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
scrapping_strategy = WebScrapingStrategy()
|
scrapping_strategy = WebScrapingStrategy(
|
||||||
|
logger=self.logger,
|
||||||
|
)
|
||||||
# result = await scrapping_strategy.ascrap(
|
# result = await scrapping_strategy.ascrap(
|
||||||
result = scrapping_strategy.scrap(
|
result = scrapping_strategy.scrap(
|
||||||
url,
|
url,
|
||||||
|
|||||||
@@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter):
|
|||||||
'link_text_len': link_text_len
|
'link_text_len': link_text_len
|
||||||
}
|
}
|
||||||
|
|
||||||
score = self._compute_composite_score(metrics)
|
score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
|
||||||
|
|
||||||
if self.threshold_type == 'fixed':
|
if self.threshold_type == 'fixed':
|
||||||
should_remove = score < self.threshold
|
should_remove = score < self.threshold
|
||||||
|
|||||||
Reference in New Issue
Block a user