feat(crawler): add MHTML capture functionality

Add ability to capture web pages as MHTML format, which includes all page resources in a single file. This enables complete page archival and offline viewing. - Add capture_mhtml parameter to CrawlerRunConfig - Implement MHTML capture using CDP in AsyncPlaywrightCrawlerStrategy - Add mhtml field to CrawlResult and AsyncCrawlResponse models - Add comprehensive tests for MHTML capture functionality - Update documentation with MHTML capture details - Add exclude_all_images option for better memory management Breaking changes: None
2025-04-09 15:39:04 +08:00
parent 9038e9acbd
commit a2061bf31e
14 changed files with 467 additions and 24 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -772,10 +772,12 @@ class CrawlerRunConfig():
        screenshot_wait_for: float = None,
        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
        pdf: bool = False,
+        capture_mhtml: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        table_score_threshold: int = 7,
        exclude_external_images: bool = False,
+        exclude_all_images: bool = False,
        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
        exclude_external_links: bool = False,
@@ -860,9 +862,11 @@ class CrawlerRunConfig():
        self.screenshot_wait_for = screenshot_wait_for
        self.screenshot_height_threshold = screenshot_height_threshold
        self.pdf = pdf
+        self.capture_mhtml = capture_mhtml
        self.image_description_min_word_threshold = image_description_min_word_threshold
        self.image_score_threshold = image_score_threshold
        self.exclude_external_images = exclude_external_images
+        self.exclude_all_images = exclude_all_images
        self.table_score_threshold = table_score_threshold

        # Link and Domain Handling Parameters
@@ -991,6 +995,7 @@ class CrawlerRunConfig():
                "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
            ),
            pdf=kwargs.get("pdf", False),
+            capture_mhtml=kwargs.get("capture_mhtml", False),
            image_description_min_word_threshold=kwargs.get(
                "image_description_min_word_threshold",
                IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -999,6 +1004,7 @@ class CrawlerRunConfig():
                "image_score_threshold", IMAGE_SCORE_THRESHOLD
            ),
            table_score_threshold=kwargs.get("table_score_threshold", 7),
+            exclude_all_images=kwargs.get("exclude_all_images", False),
            exclude_external_images=kwargs.get("exclude_external_images", False),
            # Link and Domain Handling Parameters
            exclude_social_media_domains=kwargs.get(
@@ -1088,9 +1094,11 @@ class CrawlerRunConfig():
            "screenshot_wait_for": self.screenshot_wait_for,
            "screenshot_height_threshold": self.screenshot_height_threshold,
            "pdf": self.pdf,
+            "capture_mhtml": self.capture_mhtml,
            "image_description_min_word_threshold": self.image_description_min_word_threshold,
            "image_score_threshold": self.image_score_threshold,
            "table_score_threshold": self.table_score_threshold,
+            "exclude_all_images": self.exclude_all_images,
            "exclude_external_images": self.exclude_external_images,
            "exclude_social_media_domains": self.exclude_social_media_domains,
            "exclude_external_links": self.exclude_external_links,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -836,14 +836,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                "before_return_html", page=page, html=html, context=context, config=config
            )

-            # Handle PDF and screenshot generation
+            # Handle PDF, MHTML and screenshot generation
            start_export_time = time.perf_counter()
            pdf_data = None
            screenshot_data = None
+            mhtml_data = None

            if config.pdf:
                pdf_data = await self.export_pdf(page)

+            if config.capture_mhtml:
+                mhtml_data = await self.capture_mhtml(page)
+
            if config.screenshot:
                if config.screenshot_wait_for:
                    await asyncio.sleep(config.screenshot_wait_for)
@@ -851,9 +855,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    page, screenshot_height_threshold=config.screenshot_height_threshold
                )

-            if screenshot_data or pdf_data:
+            if screenshot_data or pdf_data or mhtml_data:
                self.logger.info(
-                    message="Exporting PDF and taking screenshot took {duration:.2f}s",
+                    message="Exporting media (PDF/MHTML/screenshot) took {duration:.2f}s",
                    tag="EXPORT",
                    params={"duration": time.perf_counter() - start_export_time},
                )
@@ -876,6 +880,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                status_code=status_code,
                screenshot=screenshot_data,
                pdf_data=pdf_data,
+                mhtml_data=mhtml_data,
                get_delayed_content=get_delayed_content,
                ssl_certificate=ssl_cert,
                downloaded_files=(
@@ -1052,6 +1057,70 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        """
        pdf_data = await page.pdf(print_background=True)
        return pdf_data
+        
+    async def capture_mhtml(self, page: Page) -> Optional[str]:
+        """
+        Captures the current page as MHTML using CDP.
+        
+        MHTML (MIME HTML) is a web page archive format that combines the HTML content 
+        with its resources (images, CSS, etc.) into a single MIME-encoded file.
+        
+        Args:
+            page (Page): The Playwright page object
+            
+        Returns:
+            Optional[str]: The MHTML content as a string, or None if there was an error
+        """
+        try:
+            # Ensure the page is fully loaded before capturing
+            try:
+                # Wait for DOM content and network to be idle
+                await page.wait_for_load_state("domcontentloaded", timeout=5000)
+                await page.wait_for_load_state("networkidle", timeout=5000)
+                
+                # Give a little extra time for JavaScript execution
+                await page.wait_for_timeout(1000)
+                
+                # Wait for any animations to complete
+                await page.evaluate("""
+                    () => new Promise(resolve => {
+                        // First requestAnimationFrame gets scheduled after the next repaint
+                        requestAnimationFrame(() => {
+                            // Second requestAnimationFrame gets called after all animations complete
+                            requestAnimationFrame(resolve);
+                        });
+                    })
+                """)
+            except Error as e:
+                if self.logger:
+                    self.logger.warning(
+                        message="Wait for load state timed out: {error}",
+                        tag="MHTML",
+                        params={"error": str(e)},
+                    )
+            
+            # Create a new CDP session
+            cdp_session = await page.context.new_cdp_session(page)
+            
+            # Call Page.captureSnapshot with format "mhtml"
+            result = await cdp_session.send("Page.captureSnapshot", {"format": "mhtml"})
+            
+            # The result contains a 'data' field with the MHTML content
+            mhtml_content = result.get("data")
+            
+            # Detach the CDP session to clean up resources
+            await cdp_session.detach()
+            
+            return mhtml_content
+        except Exception as e:
+            # Log the error but don't raise it - we'll just return None for the MHTML
+            if self.logger:
+                self.logger.error(
+                    message="Failed to capture MHTML: {error}",
+                    tag="MHTML",
+                    params={"error": str(e)},
+                )
+            return None

    async def take_screenshot(self, page, **kwargs) -> str:
        """
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -365,6 +365,7 @@ class AsyncWebCrawler:
                    crawl_result.response_headers = async_response.response_headers
                    crawl_result.downloaded_files = async_response.downloaded_files
                    crawl_result.js_execution_result = js_execution_result
+                    crawl_result.mhtml = async_response.mhtml_data
                    crawl_result.ssl_certificate = (
                        async_response.ssl_certificate
                    )  # Add SSL certificate
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -440,8 +440,7 @@ class BrowserManager:
    @classmethod
    async def get_playwright(cls):
        from playwright.async_api import async_playwright
-        if cls._playwright_instance is None:
-            cls._playwright_instance = await async_playwright().start()
+        cls._playwright_instance = await async_playwright().start()
        return cls._playwright_instance    

    def __init__(self, browser_config: BrowserConfig, logger=None):
@@ -492,7 +491,6 @@ class BrowserManager:

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        """
-        self.playwright  = await self.get_playwright()
        if self.playwright is None:
            from playwright.async_api import async_playwright

--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -860,6 +860,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        soup = BeautifulSoup(html, parser_type)
        body = soup.body
        base_domain = get_base_domain(url)
+        
+        # Early removal of all images if exclude_all_images is set
+        # This happens before any processing to minimize memory usage
+        if kwargs.get("exclude_all_images", False):
+            for img in body.find_all('img'):
+                img.decompose()

        try:
            meta = extract_metadata("", soup)
@@ -1491,6 +1497,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            body = doc

            base_domain = get_base_domain(url)
+            
+            # Early removal of all images if exclude_all_images is set
+            # This is more efficient in lxml as we remove elements before any processing
+            if kwargs.get("exclude_all_images", False):
+                for img in body.xpath('//img'):
+                    if img.getparent() is not None:
+                        img.getparent().remove(img)

            # Add comment removal
            if kwargs.get("remove_comments", False):
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -95,15 +95,7 @@ class UrlModel(BaseModel):
    url: HttpUrl
    forced: bool = False

-class MarkdownGenerationResult(BaseModel):
-    raw_markdown: str
-    markdown_with_citations: str
-    references_markdown: str
-    fit_markdown: Optional[str] = None
-    fit_html: Optional[str] = None

-    def __str__(self):
-        return self.raw_markdown

@dataclass
 class TraversalStats:
@@ -124,6 +116,16 @@ class DispatchResult(BaseModel):
    end_time: Union[datetime, float]
    error_message: str = ""

+class MarkdownGenerationResult(BaseModel):
+    raw_markdown: str
+    markdown_with_citations: str
+    references_markdown: str
+    fit_markdown: Optional[str] = None
+    fit_html: Optional[str] = None
+
+    def __str__(self):
+        return self.raw_markdown
+    
 class CrawlResult(BaseModel):
    url: str
    html: str
@@ -135,6 +137,7 @@ class CrawlResult(BaseModel):
    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
+    mhtml: Optional[str] = None
    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
@@ -307,6 +310,7 @@ class AsyncCrawlResponse(BaseModel):
    status_code: int
    screenshot: Optional[str] = None
    pdf_data: Optional[bytes] = None
+    mhtml_data: Optional[str] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
    downloaded_files: Optional[List[str]] = None
    ssl_certificate: Optional[SSLCertificate] = None