diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 3751d52f..480d9ffe 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -1668,7 +1668,19 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                 content_element = body
 
             # Remove script and style tags
             for tag in ["script", "style", "link", "meta", "noscript"]:
                 for element in body.xpath(f".//{tag}"):
+                    # lxml's remove() drops the element together with its tail
+                    # (the text that follows its closing tag), so move that text
+                    # onto the previous sibling, or onto the parent's text when
+                    # the element is the first child, before removing it.
+                    parent = element.getparent()
+                    tail = element.tail
+                    if parent is not None and tail:
+                        prev = element.getprevious()
+                        if prev is not None:
+                            prev.tail = (prev.tail or "") + tail
+                        else:
+                            parent.text = (parent.text or "") + tail
                     if element.getparent() is not None:
                         element.getparent().remove(element)