feat(crawler): add MHTML capture functionality

Add ability to capture web pages as MHTML format, which includes all page resources in a single file. This enables complete page archival and offline viewing. - Add capture_mhtml parameter to CrawlerRunConfig - Implement MHTML capture using CDP in AsyncPlaywrightCrawlerStrategy - Add mhtml field to CrawlResult and AsyncCrawlResponse models - Add comprehensive tests for MHTML capture functionality - Update documentation with MHTML capture details - Add exclude_all_images option for better memory management Breaking changes: None
2025-04-09 15:39:04 +08:00
parent 9038e9acbd
commit a2061bf31e
14 changed files with 467 additions and 24 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -860,6 +860,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        soup = BeautifulSoup(html, parser_type)
        body = soup.body
        base_domain = get_base_domain(url)
+        
+        # Early removal of all images if exclude_all_images is set
+        # This happens before any processing to minimize memory usage
+        if kwargs.get("exclude_all_images", False):
+            for img in body.find_all('img'):
+                img.decompose()

        try:
            meta = extract_metadata("", soup)
@@ -1491,6 +1497,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            body = doc

            base_domain = get_base_domain(url)
+            
+            # Early removal of all images if exclude_all_images is set
+            # This is more efficient in lxml as we remove elements before any processing
+            if kwargs.get("exclude_all_images", False):
+                for img in body.xpath('//img'):
+                    if img.getparent() is not None:
+                        img.getparent().remove(img)

            # Add comment removal
            if kwargs.get("remove_comments", False):