From 30807f5535dd8ed5eeabced9ce0555ba86d9c9a4 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 12 Sep 2024 16:11:20 +0800 Subject: [PATCH] Remove excluded tags from website content --- crawl4ai/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 61d1e333..b5736ad8 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -441,6 +441,10 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + for tag in kwargs.get('excluded_tags', []) or []: + for el in body.select(tag): + el.decompose() + if css_selector: selected_elements = body.select(css_selector) if not selected_elements: