When building cleaned_html, "script" tags must be removed separately so that the text following each tag (its tail) is preserved instead of being dropped with the element.

2025-08-05 16:18:34 +08:00
parent e3281935bc
commit 6d3444ba17
1 changed files with 20 additions and 1 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                content_element = body

            # Remove script and style tags
-            for tag in ["script", "style", "link", "meta", "noscript"]:
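+            # Handle script separately: re-attach each script's tail text to the previous sibling (or the parent) before removing it, so that text is not lost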
+            for element in body.xpath(f".//script"):
+                parent = element.getparent()
+                if parent is not None:
+                    tail = element.tail  # Get the tail text
+                    if tail:
+                        prev = element.getprevious()  # Get the previous sibling node
+                        if prev is not None:
+                            if prev.tail:
+                                prev.tail += tail 
+                            else:
+                                prev.tail = tail
+                        else:
+                            if parent.text:
+                                parent.text += tail
+                            else:
+                                parent.text = tail
+                    parent.remove(element)  # Delete the element
+
+            for tag in ["style", "link", "meta", "noscript"]:
                for element in body.xpath(f".//{tag}"):
                    if element.getparent() is not None:
                        element.getparent().remove(element)