From 444cb14f82bc3e4bccc23cc818f87e3a6f49b860 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 22 Dec 2025 01:10:20 +0000 Subject: [PATCH] Add _generate_screenshot_from_html for raw: and file:// URLs Implements the missing method that was being called but never defined. Now raw: and file:// URLs can generate screenshots by: 1. Loading HTML into a browser page via page.set_content() 2. Taking screenshot using existing take_screenshot() method 3. Cleaning up the page afterward This enables cached HTML to be rendered with screenshots in crawl4ai-cloud. --- crawl4ai/async_crawler_strategy.py | 57 ++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 2850b36a..e1169964 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -463,7 +463,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): with open(local_file_path, "r", encoding="utf-8") as f: html = f.read() if config.screenshot: - screenshot_data = await self._generate_screenshot_from_html(html) + screenshot_data = await self._generate_screenshot_from_html(html, config) if config.capture_console_messages: page, context = await self.browser_manager.get_page(crawlerRunConfig=config) captured_console = await self._capture_console_messages(page, url) @@ -488,7 +488,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): raw_html = url[6:] if url.startswith("raw://") else url[4:] html = raw_html if config.screenshot: - screenshot_data = await self._generate_screenshot_from_html(html) + screenshot_data = await self._generate_screenshot_from_html(html, config) return AsyncCrawlResponse( html=html, response_headers=response_headers, @@ -1524,7 +1524,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.goto(file_path) return captured_console - + + async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str: + """ + Generate a screenshot from raw HTML content by loading it into a browser page. + + This method is used for raw: and file:// URLs where we have HTML content + but need to render it in a browser to take a screenshot. + + Args: + html (str): The raw HTML content to render + config (CrawlerRunConfig, optional): Configuration for screenshot options + + Returns: + str: The base64-encoded screenshot data + """ + page = None + context = None + try: + # Get a browser page + config = config or CrawlerRunConfig() + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + + # Load the HTML content into the page + await page.set_content(html, wait_until="domcontentloaded") + + # Take the screenshot using existing method + screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None) + return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold) + + except Exception as e: + error_message = f"Failed to generate screenshot from HTML: {str(e)}" + self.logger.error( + message="HTML Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + # Return error image as fallback + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + finally: + # Clean up the page + if page: + try: + await page.close() + except Exception: + pass + async def take_screenshot(self, page, **kwargs) -> str: """ Take a screenshot of the current page.