Add _generate_screenshot_from_html for raw: and file:// URLs
Implements the missing method that was being called but never defined. Now raw: and file:// URLs can generate screenshots by: (1) loading the HTML into a browser page via page.set_content(), (2) taking the screenshot using the existing take_screenshot() method, and (3) cleaning up the page afterward. This enables cached HTML to be rendered with screenshots in crawl4ai-cloud.
This commit is contained in:
@@ -463,7 +463,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
if config.screenshot:
|
||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||
screenshot_data = await self._generate_screenshot_from_html(html, config)
|
||||
if config.capture_console_messages:
|
||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||
captured_console = await self._capture_console_messages(page, url)
|
||||
@@ -488,7 +488,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
html = raw_html
|
||||
if config.screenshot:
|
||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||
screenshot_data = await self._generate_screenshot_from_html(html, config)
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
response_headers=response_headers,
|
||||
@@ -1524,7 +1524,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await page.goto(file_path)
|
||||
|
||||
return captured_console
|
||||
|
||||
|
||||
async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
    """
    Render raw HTML in a browser page and return a screenshot of it.

    This method is used for raw: and file:// URLs, where the HTML content is
    already in hand but has to be loaded into a browser before it can be
    captured.

    Args:
        html (str): The raw HTML content to render.
        config (CrawlerRunConfig, optional): Run configuration used to obtain
            the page and to read ``screenshot_height_threshold``. A default
            ``CrawlerRunConfig()`` is created when this is omitted or falsy.

    Returns:
        str: The value returned by ``take_screenshot`` (base64-encoded
        screenshot data). If any step fails, the failure is logged and a
        base64-encoded 800x600 JPEG carrying the error text is returned
        instead; the exception is not propagated to the caller.
    """
    page = None
    try:
        config = config or CrawlerRunConfig()
        # get_page returns a (page, context) pair; only the page is needed here.
        page, _ = await self.browser_manager.get_page(crawlerRunConfig=config)

        # Load the HTML content into the page.
        await page.set_content(html, wait_until="domcontentloaded")

        # Forward the height threshold only when the config actually carries
        # one. take_screenshot accepts **kwargs, so an explicit None would
        # presumably override whatever default it applies to a missing key
        # -- NOTE(review): confirm against take_screenshot's implementation.
        screenshot_kwargs = {}
        height_threshold = getattr(config, "screenshot_height_threshold", None)
        if height_threshold is not None:
            screenshot_kwargs["screenshot_height_threshold"] = height_threshold
        return await self.take_screenshot(page, **screenshot_kwargs)

    except Exception as e:
        error_message = f"Failed to generate screenshot from HTML: {str(e)}"
        self.logger.error(
            message="HTML Screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message},
        )
        # Fall back to a placeholder image so callers still receive
        # base64 image data rather than an exception.
        img = Image.new("RGB", (800, 600), color="black")
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    finally:
        # Always release the page, on both the success and the error path.
        if page is not None:
            try:
                await page.close()
            except Exception:
                # Deliberate best-effort cleanup: a failure to close must not
                # replace the screenshot (or fallback image) being returned.
                pass
|
||||
|
||||
async def take_screenshot(self, page, **kwargs) -> str:
|
||||
"""
|
||||
Take a screenshot of the current page.
|
||||
|
||||
Reference in New Issue
Block a user