From d628bc403443bc389f59efb1ef97779faeebb4c9 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 12 Sep 2024 17:35:45 +0800 Subject: [PATCH] Refactor content_scrapping_strategy.py to remove excluded tags --- crawl4ai/content_scrapping_strategy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 56868354..126dbaed 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -43,6 +43,10 @@ class WebScrappingStrategy(ContentScrappingStrategy): image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + for tag in kwargs.get('excluded_tags', []) or []: + for el in body.select(tag): + el.decompose() + if css_selector: selected_elements = body.select(css_selector) if not selected_elements: