fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter

This commit is contained in:
UncleCode
2024-12-02 20:37:28 +08:00
parent 293f299c08
commit 95a4f74d2a
2 changed files with 4 additions and 2 deletions

View File

@@ -472,7 +472,9 @@ class AsyncWebCrawler:
         try:
             _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
             t1 = time.perf_counter()
-            scrapping_strategy = WebScrapingStrategy()
+            scrapping_strategy = WebScrapingStrategy(
+                logger=self.logger,
+            )
             # result = await scrapping_strategy.ascrap(
             result = scrapping_strategy.scrap(
                 url,

View File

@@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter):
     'link_text_len': link_text_len
 }
-score = self._compute_composite_score(metrics)
+score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
 if self.threshold_type == 'fixed':
     should_remove = score < self.threshold