From 6d3444ba17437062a75ccb99b316021c34e08209 Mon Sep 17 00:00:00 2001 From: lizhuxiong Date: Tue, 5 Aug 2025 16:18:34 +0800 Subject: [PATCH 1/2] Handle "script" tags separately when building cleaned_html so that their tail text is preserved. --- crawl4ai/content_scraping_strategy.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3751d52f..480d9ffe 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = body # Remove script and style tags - for tag in ["script", "style", "link", "meta", "noscript"]: + # Handle script separately + for element in body.xpath(f".//script"): + parent = element.getparent() + if parent is not None: + tail = element.tail # Get the tail text + if tail: + prev = element.getprevious() # Get the previous sibling node + if prev is not None: + if prev.tail: + prev.tail += tail + else: + prev.tail = tail + else: + if parent.text: + parent.text += tail + else: + parent.text = tail + parent.remove(element) # Delete the element + + for tag in ["style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): if element.getparent() is not None: element.getparent().remove(element) From 660d7011b9005cae20df1f4f4ee42bfc8a2908b1 Mon Sep 17 00:00:00 2001 From: lizhuxiong Date: Tue, 5 Aug 2025 16:27:03 +0800 Subject: [PATCH 2/2] Remove "style", "link", "meta" and "noscript" tags before handling "script" tags when building cleaned_html. 
--- crawl4ai/content_scraping_strategy.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 480d9ffe..04d06b4f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1668,6 +1668,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = body # Remove script and style tags + for tag in ["style", "link", "meta", "noscript"]: + for element in body.xpath(f".//{tag}"): + if element.getparent() is not None: + element.getparent().remove(element) + # Handle script separately for element in body.xpath(f".//script"): parent = element.getparent() @@ -1687,10 +1692,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): parent.text = tail parent.remove(element) # Delete the element - for tag in ["style", "link", "meta", "noscript"]: - for element in body.xpath(f".//{tag}"): - if element.getparent() is not None: - element.getparent().remove(element) # Handle social media and domain exclusions kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))