Add PDF and MHTML support for raw: and file:// URLs

- Replace _generate_screenshot_from_html with _generate_media_from_html
- New method handles screenshot, PDF, and MHTML in one browser session
- Update raw: and file:// URL handlers to use new method
- Enables generating all media types (screenshot, PDF, and MHTML) from cached HTML
This commit is contained in:
unclecode
2025-12-22 01:24:51 +00:00
parent 444cb14f82
commit 67e03d64b8

View File

@@ -455,15 +455,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
elif url.startswith("file://"):
# initialize empty lists for console messages
captured_console = []
pdf_data = None
mhtml_data = None
# Process local file
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
raise FileNotFoundError(f"Local file not found: {local_file_path}")
with open(local_file_path, "r", encoding="utf-8") as f:
html = f.read()
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html, config)
# Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url)
@@ -473,6 +480,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=None,
console_messages=captured_console,
)
@@ -487,13 +496,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html, config)
pdf_data = None
mhtml_data = None
# Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
return AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=None,
)
else:
@@ -1525,22 +1543,27 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return captured_console
async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
async def _generate_media_from_html(
self, html: str, config: CrawlerRunConfig = None
) -> tuple:
"""
Generate a screenshot from raw HTML content by loading it into a browser page.
Generate media (screenshot, PDF, MHTML) from raw HTML content.
This method is used for raw: and file:// URLs where we have HTML content
but need to render it in a browser to take a screenshot.
but need to render it in a browser to generate media outputs.
Args:
html (str): The raw HTML content to render
config (CrawlerRunConfig, optional): Configuration for screenshot options
config (CrawlerRunConfig, optional): Configuration for media options
Returns:
str: The base64-encoded screenshot data
tuple: (screenshot_data, pdf_data, mhtml_data) - any can be None
"""
page = None
context = None
screenshot_data = None
pdf_data = None
mhtml_data = None
try:
# Get a browser page
config = config or CrawlerRunConfig()
@@ -1549,25 +1572,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Load the HTML content into the page
await page.set_content(html, wait_until="domcontentloaded")
# Take the screenshot using existing method
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold)
# Generate requested media
if config.pdf:
pdf_data = await self.export_pdf(page)
if config.capture_mhtml:
mhtml_data = await self.capture_mhtml(page)
if config.screenshot:
if config.screenshot_wait_for:
await asyncio.sleep(config.screenshot_wait_for)
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
screenshot_data = await self.take_screenshot(
page, screenshot_height_threshold=screenshot_height_threshold
)
return screenshot_data, pdf_data, mhtml_data
except Exception as e:
error_message = f"Failed to generate screenshot from HTML: {str(e)}"
error_message = f"Failed to generate media from HTML: {str(e)}"
self.logger.error(
message="HTML Screenshot failed: {error}",
message="HTML media generation failed: {error}",
tag="ERROR",
params={"error": error_message},
)
# Return error image as fallback
img = Image.new("RGB", (800, 600), color="black")
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
# Return error image for screenshot if it was requested
if config and config.screenshot:
img = Image.new("RGB", (800, 600), color="black")
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
buffered = BytesIO()
img.save(buffered, format="JPEG")
screenshot_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
return screenshot_data, pdf_data, mhtml_data
finally:
# Clean up the page
if page: