Merged next branch

2025-04-12 10:47:02 +05:30
parent 6f7ab9c927 18e8227dfb
commit 022f5c9e25
62 changed files with 3225 additions and 7085 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -866,6 +866,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        if body is None:
            raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
        base_domain = get_base_domain(url)
+        
+        # Early removal of all images if exclude_all_images is set
+        # This happens before any processing to minimize memory usage
+        if kwargs.get("exclude_all_images", False):
+            for img in body.find_all('img'):
+                img.decompose()

        try:
            meta = extract_metadata("", soup)
@@ -1487,6 +1493,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            body = doc

            base_domain = get_base_domain(url)
+            
+            # Early removal of all images if exclude_all_images is set, before any other processing.
+            # NOTE(review): lxml's remove() also discards the element's tail text, so text directly following an <img> is lost - confirm this is intended.
+            if kwargs.get("exclude_all_images", False):
+                for img in body.xpath('//img'):
+                    if img.getparent() is not None:
+                        img.getparent().remove(img)

            # Add comment removal
            if kwargs.get("remove_comments", False):