From 660d7011b9005cae20df1f4f4ee42bfc8a2908b1 Mon Sep 17 00:00:00 2001 From: lizhuxiong Date: Tue, 5 Aug 2025 16:27:03 +0800 Subject: [PATCH] In obtaining cleaned_html, the tag "script" needs to be processed separately. --- crawl4ai/content_scraping_strategy.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 480d9ffe..04d06b4f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1668,6 +1668,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = body # Remove script and style tags + for tag in ["style", "link", "meta", "noscript"]: + for element in body.xpath(f".//{tag}"): + if element.getparent() is not None: + element.getparent().remove(element) + # Handle script separately for element in body.xpath(f".//script"): parent = element.getparent() @@ -1687,10 +1692,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): parent.text = tail parent.remove(element) # Delete the element - for tag in ["style", "link", "meta", "noscript"]: - for element in body.xpath(f".//{tag}"): - if element.getparent() is not None: - element.getparent().remove(element) # Handle social media and domain exclusions kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))