Add _generate_screenshot_from_html for raw: and file:// URLs

Implements the missing method that was being called but never defined. raw: and file:// URLs can now generate screenshots by: (1) loading the HTML into a browser page via page.set_content(), (2) taking the screenshot with the existing take_screenshot() method, and (3) cleaning up the page afterward. This enables cached HTML to be rendered with screenshots in crawl4ai-cloud.
2025-12-22 01:10:20 +00:00
parent 48426f73f0
commit 444cb14f82
1 changed files with 54 additions and 3 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -463,7 +463,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            with open(local_file_path, "r", encoding="utf-8") as f:
                html = f.read()
            if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html)
+                screenshot_data = await self._generate_screenshot_from_html(html, config)
            if config.capture_console_messages:
                page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
                captured_console = await self._capture_console_messages(page, url)
@@ -488,7 +488,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            raw_html = url[6:] if url.startswith("raw://") else url[4:]
            html = raw_html
            if config.screenshot:
-                screenshot_data = await self._generate_screenshot_from_html(html)
+                screenshot_data = await self._generate_screenshot_from_html(html, config)
            return AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
@@ -1524,7 +1524,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        await page.goto(file_path)

        return captured_console
-        
+
+    async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
+        """
+        Render raw HTML in a browser page and return a screenshot of it.
+
+        Used for raw: and file:// URLs, where the HTML is already in hand and
+        must be loaded into a page before it can be captured. Exceptions are
+        logged and reported as a black JPEG carrying the error text.
+
+        Args:
+            html (str): The raw HTML content to render
+            config (CrawlerRunConfig, optional): Page and screenshot options;
+                a default CrawlerRunConfig is used when omitted
+
+        Returns:
+            str: Base64-encoded screenshot, or base64-encoded error image
+        """
+        page = None
+        try:
+            config = config or CrawlerRunConfig()
+            # The context returned alongside the page is not needed here
+            page, _ = await self.browser_manager.get_page(crawlerRunConfig=config)
+            # Load the HTML content into the page
+            await page.set_content(html, wait_until="domcontentloaded")
+
+            # Forward the threshold only when set, so an absent value does not
+            # reach take_screenshot() as an explicit None
+            threshold = getattr(config, "screenshot_height_threshold", None)
+            options = {} if threshold is None else {"screenshot_height_threshold": threshold}
+            return await self.take_screenshot(page, **options)
+        except Exception as e:
+            error_message = f"Failed to generate screenshot from HTML: {str(e)}"
+            self.logger.error(
+                message="HTML Screenshot failed: {error}",
+                tag="ERROR",
+                params={"error": error_message},
+            )
+            # Return error image as fallback
+            img = Image.new("RGB", (800, 600), color="black")
+            draw = ImageDraw.Draw(img)
+            draw.text((10, 10), error_message, fill=(255, 255, 255), font=ImageFont.load_default())
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        finally:
+            # Best-effort cleanup: a close failure must not mask the result
+            if page is not None:
+                try:
+                    await page.close()
+                except Exception:
+                    pass
+
    async def take_screenshot(self, page, **kwargs) -> str:
        """
        Take a screenshot of the current page.