Add PDF and MHTML support for raw: and file:// URLs

- Replace _generate_screenshot_from_html with _generate_media_from_html - New method handles screenshot, PDF, and MHTML in one browser session - Update raw: and file:// URL handlers to use new method - Enables cached HTML to generate all media types
2025-12-22 01:24:51 +00:00
parent 444cb14f82
commit 67e03d64b8
1 changed files with 62 additions and 24 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -455,15 +455,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        elif url.startswith("file://"):
            # initialize empty lists for console messages
            captured_console = []
-            
+            pdf_data = None
+            mhtml_data = None
+
            # Process local file
            local_file_path = url[7:]  # Remove 'file://' prefix
            if not os.path.exists(local_file_path):
                raise FileNotFoundError(f"Local file not found: {local_file_path}")
            with open(local_file_path, "r", encoding="utf-8") as f:
                html = f.read()
-            if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html, config)
+
+            # Handle media generation - all require loading HTML into browser
+            if config.screenshot or config.pdf or config.capture_mhtml:
+                screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
+                    html, config
+                )
+
            if config.capture_console_messages:
                page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
                captured_console = await self._capture_console_messages(page, url)
@@ -473,6 +480,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
+                pdf_data=pdf_data,
+                mhtml_data=mhtml_data,
                get_delayed_content=None,
                console_messages=captured_console,
            )
@@ -487,13 +496,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
            raw_html = url[6:] if url.startswith("raw://") else url[4:]
            html = raw_html
-            if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html, config)
+            pdf_data = None
+            mhtml_data = None
+
+            # Handle media generation - all require loading HTML into browser
+            if config.screenshot or config.pdf or config.capture_mhtml:
+                screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
+                    html, config
+                )
+
            return AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
+                pdf_data=pdf_data,
+                mhtml_data=mhtml_data,
                get_delayed_content=None,
            )
        else:
@@ -1525,22 +1543,27 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

        return captured_console

-    async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
+    async def _generate_media_from_html(
+        self, html: str, config: CrawlerRunConfig = None
+    ) -> tuple:
        """
-        Generate a screenshot from raw HTML content by loading it into a browser page.
+        Generate media (screenshot, PDF, MHTML) from raw HTML content.

        This method is used for raw: and file:// URLs where we have HTML content
-        but need to render it in a browser to take a screenshot.
+        but need to render it in a browser to generate media outputs.

        Args:
            html (str): The raw HTML content to render
-            config (CrawlerRunConfig, optional): Configuration for screenshot options
+            config (CrawlerRunConfig, optional): Configuration for media options

        Returns:
-            str: The base64-encoded screenshot data
+            tuple: (screenshot_data, pdf_data, mhtml_data) - any can be None
        """
        page = None
-        context = None
+        screenshot_data = None
+        pdf_data = None
+        mhtml_data = None
+
        try:
            # Get a browser page
            config = config or CrawlerRunConfig()
@@ -1549,25 +1572,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # Load the HTML content into the page
            await page.set_content(html, wait_until="domcontentloaded")

-            # Take the screenshot using existing method
-            screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
-            return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold)
+            # Generate requested media
+            if config.pdf:
+                pdf_data = await self.export_pdf(page)
+
+            if config.capture_mhtml:
+                mhtml_data = await self.capture_mhtml(page)
+
+            if config.screenshot:
+                if config.screenshot_wait_for:
+                    await asyncio.sleep(config.screenshot_wait_for)
+                screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
+                screenshot_data = await self.take_screenshot(
+                    page, screenshot_height_threshold=screenshot_height_threshold
+                )
+
+            return screenshot_data, pdf_data, mhtml_data

        except Exception as e:
-            error_message = f"Failed to generate screenshot from HTML: {str(e)}"
+            error_message = f"Failed to generate media from HTML: {str(e)}"
            self.logger.error(
-                message="HTML Screenshot failed: {error}",
+                message="HTML media generation failed: {error}",
                tag="ERROR",
                params={"error": error_message},
            )
-            # Return error image as fallback
-            img = Image.new("RGB", (800, 600), color="black")
-            draw = ImageDraw.Draw(img)
-            font = ImageFont.load_default()
-            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
-            buffered = BytesIO()
-            img.save(buffered, format="JPEG")
-            return base64.b64encode(buffered.getvalue()).decode("utf-8")
+            # Return error image for screenshot if it was requested
+            if config and config.screenshot:
+                img = Image.new("RGB", (800, 600), color="black")
+                draw = ImageDraw.Draw(img)
+                font = ImageFont.load_default()
+                draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+                buffered = BytesIO()
+                img.save(buffered, format="JPEG")
+                screenshot_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            return screenshot_data, pdf_data, mhtml_data
        finally:
            # Clean up the page
            if page: