From 444cb14f82bc3e4bccc23cc818f87e3a6f49b860 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Mon, 22 Dec 2025 01:10:20 +0000
Subject: [PATCH] Add _generate_screenshot_from_html for raw: and file:// URLs

Implements the missing method that was being called but never defined.
Now raw: and file:// URLs can generate screenshots by:
1. Loading HTML into a browser page via page.set_content()
2. Taking screenshot using existing take_screenshot() method
3. Cleaning up the page afterward

This enables cached HTML to be rendered with screenshots in crawl4ai-cloud.
---
 crawl4ai/async_crawler_strategy.py | 57 ++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a..e1169964 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -463,7 +463,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             with open(local_file_path, "r", encoding="utf-8") as f:
                 html = f.read()
             if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html)
+                screenshot_data = await self._generate_screenshot_from_html(html, config)
             if config.capture_console_messages:
                 page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
                 captured_console = await self._capture_console_messages(page, url)
@@ -488,7 +488,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             raw_html = url[6:] if url.startswith("raw://") else url[4:]
             html = raw_html
             if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html)
+                screenshot_data = await self._generate_screenshot_from_html(html, config)
             return AsyncCrawlResponse(
                 html=html,
                 response_headers=response_headers,
@@ -1524,7 +1524,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         await page.goto(file_path)
 
         return captured_console
-        
+
+    async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
+        """
+        Render HTML in a browser page and return the screenshot produced by take_screenshot().
+
+        Used for raw: and file:// URLs, where the HTML is already in hand and only needs
+        rendering. Exceptions raised while rendering are logged and turned into an error image.
+
+        Args:
+            html (str): The HTML content loaded into the page via page.set_content()
+            config (CrawlerRunConfig, optional): Run config; a default one is built when falsy
+
+        Returns:
+            str: The base64-encoded screenshot data, or a base64 JPEG error image on failure
+        """
+        page = None
+        context = None
+        try:
+            # Fall back to a default run config, then ask the browser manager for a page
+            config = config or CrawlerRunConfig()
+            page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
+
+            # Load the HTML; returns at DOMContentLoaded, so slow subresources may not be rendered
+            await page.set_content(html, wait_until="domcontentloaded")
+
+            # Reuse take_screenshot(); the threshold is None when the config lacks the attribute
+            screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
+            return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold)
+
+        except Exception as e:
+            error_message = f"Failed to generate screenshot from HTML: {str(e)}"
+            self.logger.error(
+                message="HTML Screenshot failed: {error}",
+                tag="ERROR",
+                params={"error": error_message},
+            )
+            # Fallback: an 800x600 black JPEG carrying the error text, returned base64-encoded
+            img = Image.new("RGB", (800, 600), color="black")
+            draw = ImageDraw.Draw(img)
+            font = ImageFont.load_default()
+            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        finally:
+            # Best-effort close on every path; only the page is closed and close errors are ignored
+            if page:
+                try:
+                    await page.close()
+                except Exception:
+                    pass
+
     async def take_screenshot(self, page, **kwargs) -> str:
         """
         Take a screenshot of the current page.