Add PDF and MHTML support for raw: and file:// URLs

- Replace _generate_screenshot_from_html with _generate_media_from_html
- New method handles screenshot, PDF, and MHTML in one browser session
- Update raw: and file:// URL handlers to use new method
- Enables cached HTML to generate all media types
This commit is contained in:
unclecode
2025-12-22 01:24:51 +00:00
parent 444cb14f82
commit 67e03d64b8

View File

@@ -455,6 +455,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
elif url.startswith("file://"): elif url.startswith("file://"):
# initialize empty lists for console messages # initialize empty lists for console messages
captured_console = [] captured_console = []
pdf_data = None
mhtml_data = None
# Process local file # Process local file
local_file_path = url[7:] # Remove 'file://' prefix local_file_path = url[7:] # Remove 'file://' prefix
@@ -462,8 +464,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
raise FileNotFoundError(f"Local file not found: {local_file_path}") raise FileNotFoundError(f"Local file not found: {local_file_path}")
with open(local_file_path, "r", encoding="utf-8") as f: with open(local_file_path, "r", encoding="utf-8") as f:
html = f.read() html = f.read()
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html, config) # Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
if config.capture_console_messages: if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config) page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url) captured_console = await self._capture_console_messages(page, url)
@@ -473,6 +480,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers=response_headers, response_headers=response_headers,
status_code=status_code, status_code=status_code,
screenshot=screenshot_data, screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=None, get_delayed_content=None,
console_messages=captured_console, console_messages=captured_console,
) )
@@ -487,13 +496,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# raw_html = url[4:] if url[:4] == "raw:" else url[7:] # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
raw_html = url[6:] if url.startswith("raw://") else url[4:] raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html html = raw_html
if config.screenshot: pdf_data = None
screenshot_data = await self._generate_screenshot_from_html(html, config) mhtml_data = None
# Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
return AsyncCrawlResponse( return AsyncCrawlResponse(
html=html, html=html,
response_headers=response_headers, response_headers=response_headers,
status_code=status_code, status_code=status_code,
screenshot=screenshot_data, screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=None, get_delayed_content=None,
) )
else: else:
@@ -1525,22 +1543,27 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return captured_console return captured_console
async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str: async def _generate_media_from_html(
self, html: str, config: CrawlerRunConfig = None
) -> tuple:
""" """
Generate a screenshot from raw HTML content by loading it into a browser page. Generate media (screenshot, PDF, MHTML) from raw HTML content.
This method is used for raw: and file:// URLs where we have HTML content This method is used for raw: and file:// URLs where we have HTML content
but need to render it in a browser to take a screenshot. but need to render it in a browser to generate media outputs.
Args: Args:
html (str): The raw HTML content to render html (str): The raw HTML content to render
config (CrawlerRunConfig, optional): Configuration for screenshot options config (CrawlerRunConfig, optional): Configuration for media options
Returns: Returns:
str: The base64-encoded screenshot data tuple: (screenshot_data, pdf_data, mhtml_data) - any can be None
""" """
page = None page = None
context = None screenshot_data = None
pdf_data = None
mhtml_data = None
try: try:
# Get a browser page # Get a browser page
config = config or CrawlerRunConfig() config = config or CrawlerRunConfig()
@@ -1549,25 +1572,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Load the HTML content into the page # Load the HTML content into the page
await page.set_content(html, wait_until="domcontentloaded") await page.set_content(html, wait_until="domcontentloaded")
# Take the screenshot using existing method # Generate requested media
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None) if config.pdf:
return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold) pdf_data = await self.export_pdf(page)
if config.capture_mhtml:
mhtml_data = await self.capture_mhtml(page)
if config.screenshot:
if config.screenshot_wait_for:
await asyncio.sleep(config.screenshot_wait_for)
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
screenshot_data = await self.take_screenshot(
page, screenshot_height_threshold=screenshot_height_threshold
)
return screenshot_data, pdf_data, mhtml_data
except Exception as e: except Exception as e:
error_message = f"Failed to generate screenshot from HTML: {str(e)}" error_message = f"Failed to generate media from HTML: {str(e)}"
self.logger.error( self.logger.error(
message="HTML Screenshot failed: {error}", message="HTML media generation failed: {error}",
tag="ERROR", tag="ERROR",
params={"error": error_message}, params={"error": error_message},
) )
# Return error image as fallback # Return error image for screenshot if it was requested
img = Image.new("RGB", (800, 600), color="black") if config and config.screenshot:
draw = ImageDraw.Draw(img) img = Image.new("RGB", (800, 600), color="black")
font = ImageFont.load_default() draw = ImageDraw.Draw(img)
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) font = ImageFont.load_default()
buffered = BytesIO() draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
img.save(buffered, format="JPEG") buffered = BytesIO()
return base64.b64encode(buffered.getvalue()).decode("utf-8") img.save(buffered, format="JPEG")
screenshot_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
return screenshot_data, pdf_data, mhtml_data
finally: finally:
# Clean up the page # Clean up the page
if page: if page: