From 95a4f74d2a9c0ae8c6f727cce6f6d0c17694aeb4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 2 Dec 2024 20:37:28 +0800 Subject: [PATCH] fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter --- crawl4ai/async_webcrawler.py | 4 +++- crawl4ai/content_filter_strategy.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 66b4c21b..8db69333 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -472,7 +472,9 @@ class AsyncWebCrawler: try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy() + scrapping_strategy = WebScrapingStrategy( + logger=self.logger, + ) # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index ca3868bb..f05b92fa 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter): 'link_text_len': link_text_len } - score = self._compute_composite_score(metrics) + score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len) if self.threshold_type == 'fixed': should_remove = score < self.threshold