From 6d3444ba17437062a75ccb99b316021c34e08209 Mon Sep 17 00:00:00 2001
From: lizhuxiong <panda_xlz@163.com>
Date: Tue, 5 Aug 2025 16:18:34 +0800
Subject: [PATCH] When building cleaned_html, remove "script" tags separately
 so that their tail text is preserved.

---
 crawl4ai/content_scraping_strategy.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 3751d52f..480d9ffe 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                 content_element = body
 
             # Remove script and style tags
-            for tag in ["script", "style", "link", "meta", "noscript"]:
+            # Handle script separately: re-attach its tail text before removal so the text is not lost
+            for element in body.xpath(f".//script"):
+                parent = element.getparent()
+                if parent is not None:
+                    tail = element.tail  # Get the tail text
+                    if tail:
+                        prev = element.getprevious()  # Get the previous sibling node
+                        if prev is not None:
+                            if prev.tail:
+                                prev.tail += tail 
+                            else:
+                                prev.tail = tail
+                        else:
+                            if parent.text:
+                                parent.text += tail
+                            else:
+                                parent.text = tail
+                    parent.remove(element)  # Delete the element
+
+            for tag in ["style", "link", "meta", "noscript"]:
                 for element in body.xpath(f".//{tag}"):
                     if element.getparent() is not None:
                         element.getparent().remove(element)