Add PDF and MHTML support for raw: and file:// URLs
- Replace _generate_screenshot_from_html with _generate_media_from_html
- New method handles screenshot, PDF, and MHTML in one browser session
- Update raw: and file:// URL handlers to use the new method
- Enables cached HTML to generate all media types
This commit is contained in:
@@ -455,6 +455,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
elif url.startswith("file://"):
|
elif url.startswith("file://"):
|
||||||
# initialize empty lists for console messages
|
# initialize empty lists for console messages
|
||||||
captured_console = []
|
captured_console = []
|
||||||
|
pdf_data = None
|
||||||
|
mhtml_data = None
|
||||||
|
|
||||||
# Process local file
|
# Process local file
|
||||||
local_file_path = url[7:] # Remove 'file://' prefix
|
local_file_path = url[7:] # Remove 'file://' prefix
|
||||||
@@ -462,8 +464,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||||
html = f.read()
|
html = f.read()
|
||||||
if config.screenshot:
|
|
||||||
screenshot_data = await self._generate_screenshot_from_html(html, config)
|
# Handle media generation - all require loading HTML into browser
|
||||||
|
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||||
|
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||||
|
html, config
|
||||||
|
)
|
||||||
|
|
||||||
if config.capture_console_messages:
|
if config.capture_console_messages:
|
||||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||||
captured_console = await self._capture_console_messages(page, url)
|
captured_console = await self._capture_console_messages(page, url)
|
||||||
@@ -473,6 +480,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response_headers=response_headers,
|
response_headers=response_headers,
|
||||||
status_code=status_code,
|
status_code=status_code,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
|
pdf_data=pdf_data,
|
||||||
|
mhtml_data=mhtml_data,
|
||||||
get_delayed_content=None,
|
get_delayed_content=None,
|
||||||
console_messages=captured_console,
|
console_messages=captured_console,
|
||||||
)
|
)
|
||||||
@@ -487,13 +496,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||||
html = raw_html
|
html = raw_html
|
||||||
if config.screenshot:
|
pdf_data = None
|
||||||
screenshot_data = await self._generate_screenshot_from_html(html, config)
|
mhtml_data = None
|
||||||
|
|
||||||
|
# Handle media generation - all require loading HTML into browser
|
||||||
|
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||||
|
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||||
|
html, config
|
||||||
|
)
|
||||||
|
|
||||||
return AsyncCrawlResponse(
|
return AsyncCrawlResponse(
|
||||||
html=html,
|
html=html,
|
||||||
response_headers=response_headers,
|
response_headers=response_headers,
|
||||||
status_code=status_code,
|
status_code=status_code,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
|
pdf_data=pdf_data,
|
||||||
|
mhtml_data=mhtml_data,
|
||||||
get_delayed_content=None,
|
get_delayed_content=None,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -1525,22 +1543,27 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
return captured_console
|
return captured_console
|
||||||
|
|
||||||
async def _generate_screenshot_from_html(self, html: str, config: CrawlerRunConfig = None) -> str:
|
async def _generate_media_from_html(
|
||||||
|
self, html: str, config: CrawlerRunConfig = None
|
||||||
|
) -> tuple:
|
||||||
"""
|
"""
|
||||||
Generate a screenshot from raw HTML content by loading it into a browser page.
|
Generate media (screenshot, PDF, MHTML) from raw HTML content.
|
||||||
|
|
||||||
This method is used for raw: and file:// URLs where we have HTML content
|
This method is used for raw: and file:// URLs where we have HTML content
|
||||||
but need to render it in a browser to take a screenshot.
|
but need to render it in a browser to generate media outputs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
html (str): The raw HTML content to render
|
html (str): The raw HTML content to render
|
||||||
config (CrawlerRunConfig, optional): Configuration for screenshot options
|
config (CrawlerRunConfig, optional): Configuration for media options
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: The base64-encoded screenshot data
|
tuple: (screenshot_data, pdf_data, mhtml_data) - any can be None
|
||||||
"""
|
"""
|
||||||
page = None
|
page = None
|
||||||
context = None
|
screenshot_data = None
|
||||||
|
pdf_data = None
|
||||||
|
mhtml_data = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get a browser page
|
# Get a browser page
|
||||||
config = config or CrawlerRunConfig()
|
config = config or CrawlerRunConfig()
|
||||||
@@ -1549,25 +1572,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# Load the HTML content into the page
|
# Load the HTML content into the page
|
||||||
await page.set_content(html, wait_until="domcontentloaded")
|
await page.set_content(html, wait_until="domcontentloaded")
|
||||||
|
|
||||||
# Take the screenshot using existing method
|
# Generate requested media
|
||||||
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
|
if config.pdf:
|
||||||
return await self.take_screenshot(page, screenshot_height_threshold=screenshot_height_threshold)
|
pdf_data = await self.export_pdf(page)
|
||||||
|
|
||||||
|
if config.capture_mhtml:
|
||||||
|
mhtml_data = await self.capture_mhtml(page)
|
||||||
|
|
||||||
|
if config.screenshot:
|
||||||
|
if config.screenshot_wait_for:
|
||||||
|
await asyncio.sleep(config.screenshot_wait_for)
|
||||||
|
screenshot_height_threshold = getattr(config, 'screenshot_height_threshold', None)
|
||||||
|
screenshot_data = await self.take_screenshot(
|
||||||
|
page, screenshot_height_threshold=screenshot_height_threshold
|
||||||
|
)
|
||||||
|
|
||||||
|
return screenshot_data, pdf_data, mhtml_data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = f"Failed to generate screenshot from HTML: {str(e)}"
|
error_message = f"Failed to generate media from HTML: {str(e)}"
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
message="HTML Screenshot failed: {error}",
|
message="HTML media generation failed: {error}",
|
||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
params={"error": error_message},
|
params={"error": error_message},
|
||||||
)
|
)
|
||||||
# Return error image as fallback
|
# Return error image for screenshot if it was requested
|
||||||
img = Image.new("RGB", (800, 600), color="black")
|
if config and config.screenshot:
|
||||||
draw = ImageDraw.Draw(img)
|
img = Image.new("RGB", (800, 600), color="black")
|
||||||
font = ImageFont.load_default()
|
draw = ImageDraw.Draw(img)
|
||||||
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
font = ImageFont.load_default()
|
||||||
buffered = BytesIO()
|
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
||||||
img.save(buffered, format="JPEG")
|
buffered = BytesIO()
|
||||||
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
img.save(buffered, format="JPEG")
|
||||||
|
screenshot_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||||
|
return screenshot_data, pdf_data, mhtml_data
|
||||||
finally:
|
finally:
|
||||||
# Clean up the page
|
# Clean up the page
|
||||||
if page:
|
if page:
|
||||||
|
|||||||
Reference in New Issue
Block a user