From 6d3444ba17437062a75ccb99b316021c34e08209 Mon Sep 17 00:00:00 2001 From: lizhuxiong Date: Tue, 5 Aug 2025 16:18:34 +0800 Subject: [PATCH] When building cleaned_html, remove "script" tags separately so that the tail text following them is preserved. --- crawl4ai/content_scraping_strategy.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3751d52f..480d9ffe 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = body # Remove script and style tags - for tag in ["script", "style", "link", "meta", "noscript"]: + # Handle script separately + for element in body.xpath(f".//script"): + parent = element.getparent() + if parent is not None: + tail = element.tail # Get the tail text + if tail: + prev = element.getprevious() # Get the previous sibling node + if prev is not None: + if prev.tail: + prev.tail += tail + else: + prev.tail = tail + else: + if parent.text: + parent.text += tail + else: + parent.text = tail + parent.remove(element) # Delete the element + + for tag in ["style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): if element.getparent() is not None: element.getparent().remove(element)