Compare commits
27 Commits
pdf_proces
...
release/v0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
61be862ab0 | ||
|
|
9672afded2 | ||
|
|
60d6173914 | ||
|
|
48c31c4cb9 | ||
|
|
48b6283e71 | ||
|
|
5a8fb57795 | ||
|
|
df4d87ed78 | ||
|
|
f32cfc6db0 | ||
|
|
d06c39e8ab | ||
|
|
afc31e144a | ||
|
|
07ccf13be6 | ||
|
|
6893094f58 | ||
|
|
3a8f8298d3 | ||
|
|
e95e8e1a97 | ||
|
|
eb76df2c0d | ||
|
|
6ec6bc4d8a | ||
|
|
33a3cc3933 | ||
|
|
7a133e22cc | ||
|
|
dcb77c94bf | ||
|
|
a0c5f0f79a | ||
|
|
b36c6daa5c | ||
|
|
94c8a833bf | ||
|
|
84bfea8bd1 | ||
|
|
7771ed3894 | ||
|
|
eca04b0368 | ||
|
|
c2c4d42be4 | ||
|
|
edd0b576b1 |
@@ -989,53 +989,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
mhtml_data = None
|
mhtml_data = None
|
||||||
|
|
||||||
if config.pdf:
|
if config.pdf:
|
||||||
if config.css_selector:
|
pdf_data = await self.export_pdf(page)
|
||||||
# Extract content with styles and fixed image URLs
|
|
||||||
content_with_styles = await page.evaluate(f"""
|
|
||||||
() => {{
|
|
||||||
const element = document.querySelector("{config.css_selector}");
|
|
||||||
const clone = element.cloneNode(true);
|
|
||||||
|
|
||||||
// Fix all image URLs to absolute
|
|
||||||
clone.querySelectorAll('img').forEach(img => {{
|
|
||||||
if (img.src) img.src = img.src; // This converts to absolute URL
|
|
||||||
}});
|
|
||||||
|
|
||||||
// Get all styles
|
|
||||||
const styles = Array.from(document.styleSheets)
|
|
||||||
.map(sheet => {{
|
|
||||||
try {{
|
|
||||||
return Array.from(sheet.cssRules).map(rule => rule.cssText).join('\\n');
|
|
||||||
}} catch(e) {{
|
|
||||||
return '';
|
|
||||||
}}
|
|
||||||
}}).join('\\n');
|
|
||||||
|
|
||||||
return {{
|
|
||||||
html: clone.outerHTML,
|
|
||||||
styles: styles,
|
|
||||||
baseUrl: window.location.origin
|
|
||||||
}};
|
|
||||||
}}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# Create page with base URL for relative resources
|
|
||||||
temp_page = await context.new_page()
|
|
||||||
await temp_page.goto(content_with_styles['baseUrl']) # Set the base URL
|
|
||||||
await temp_page.set_content(f"""
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<base href="{content_with_styles['baseUrl']}">
|
|
||||||
<style>{content_with_styles['styles']}</style>
|
|
||||||
</head>
|
|
||||||
<body>{content_with_styles['html']}</body>
|
|
||||||
</html>
|
|
||||||
""")
|
|
||||||
|
|
||||||
pdf_data = await self.export_pdf(temp_page)
|
|
||||||
await temp_page.close()
|
|
||||||
else:
|
|
||||||
pdf_data = await self.export_pdf(page)
|
|
||||||
if config.capture_mhtml:
|
if config.capture_mhtml:
|
||||||
mhtml_data = await self.capture_mhtml(page)
|
mhtml_data = await self.capture_mhtml(page)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user