Compare commits
3 Commits
v0.7.8 ... pdf_proces

| Author | SHA1 | Date |
|---|---|---|
| | d5a0866e03 | |
| | a87e8c1c9e | |
| | 835e3c56fe | |
.github/workflows/docker-release.yml (vendored, 19 changes)
@@ -11,6 +11,25 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
+      - name: Free up disk space
+        run: |
+          echo "=== Disk space before cleanup ==="
+          df -h
+
+          # Remove unnecessary tools and libraries (frees ~25GB)
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/share/boost
+          sudo rm -rf /usr/share/swift
+
+          # Clean apt cache
+          sudo apt-get clean
+
+          echo "=== Disk space after cleanup ==="
+          df -h
+
       - name: Checkout code
         uses: actions/checkout@v4
 
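For context: Docker image builds can exhaust the limited free disk (roughly 14 GB) on a standard `ubuntu-latest` runner, so this new step deletes large preinstalled toolchains before the build starts. Below is a minimal Python sketch that mirrors the step, useful for measuring the effect on a disposable Ubuntu VM; the script is illustrative only, not part of this repo, and it really deletes the listed directories.

```python
# Hypothetical helper mirroring the "Free up disk space" workflow step.
# WARNING: destructive; intended for a throwaway Ubuntu VM only.
import shutil
import subprocess

# Paths the workflow step removes on the GitHub-hosted runner
PATHS = [
    "/usr/share/dotnet",
    "/usr/local/lib/android",
    "/opt/ghc",
    "/opt/hostedtoolcache/CodeQL",
    "/usr/local/share/boost",
    "/usr/share/swift",
]

def free_gib() -> float:
    """Free space on the root filesystem, in GiB (what `df -h` reports)."""
    return shutil.disk_usage("/").free / 2**30

before = free_gib()
for path in PATHS:
    # check=False mirrors `rm -rf`: a missing path is not an error
    subprocess.run(["sudo", "rm", "-rf", path], check=False)
subprocess.run(["sudo", "apt-get", "clean"], check=False)

print(f"Reclaimed about {free_gib() - before:.1f} GiB")
```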
@@ -989,8 +989,53 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         mhtml_data = None
 
         if config.pdf:
-            pdf_data = await self.export_pdf(page)
+            if config.css_selector:
+                # Extract content with styles and fixed image URLs
+                content_with_styles = await page.evaluate(f"""
+                    () => {{
+                        const element = document.querySelector("{config.css_selector}");
+                        const clone = element.cloneNode(true);
+
+                        // Fix all image URLs to absolute
+                        clone.querySelectorAll('img').forEach(img => {{
+                            if (img.src) img.src = img.src; // This converts to absolute URL
+                        }});
+
+                        // Get all styles
+                        const styles = Array.from(document.styleSheets)
+                            .map(sheet => {{
+                                try {{
+                                    return Array.from(sheet.cssRules).map(rule => rule.cssText).join('\\n');
+                                }} catch(e) {{
+                                    return '';
+                                }}
+                            }}).join('\\n');
+
+                        return {{
+                            html: clone.outerHTML,
+                            styles: styles,
+                            baseUrl: window.location.origin
+                        }};
+                    }}
+                """)
+
+                # Create page with base URL for relative resources
+                temp_page = await context.new_page()
+                await temp_page.goto(content_with_styles['baseUrl'])  # Set the base URL
+                await temp_page.set_content(f"""
+                    <html>
+                    <head>
+                        <base href="{content_with_styles['baseUrl']}">
+                        <style>{content_with_styles['styles']}</style>
+                    </head>
+                    <body>{content_with_styles['html']}</body>
+                    </html>
+                """)
+
+                pdf_data = await self.export_pdf(temp_page)
+                await temp_page.close()
+            else:
+                pdf_data = await self.export_pdf(page)
         if config.capture_mhtml:
             mhtml_data = await self.capture_mhtml(page)
 
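From the caller's side, the new branch means a crawl configured with both `pdf=True` and a `css_selector` produces a PDF of just the selected element, re-rendered with the page's stylesheets and a `<base>` tag so relative image URLs still resolve. A minimal usage sketch, assuming the public `AsyncWebCrawler`/`CrawlerRunConfig` API; the URL and selector are placeholders:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # pdf=True routes through export_pdf(); per the diff above, css_selector
    # now scopes the PDF to one element instead of the whole page
    config = CrawlerRunConfig(pdf=True, css_selector="article.main-content")
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        if result.pdf:  # raw PDF bytes when export succeeded
            with open("element.pdf", "wb") as f:
                f.write(result.pdf)

asyncio.run(main())
```

Routing the snapshot through a temporary page rather than printing the live one keeps the original tab's state untouched and lets the `<base href>` resolve relative resources before `export_pdf` runs.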
@@ -55,6 +55,16 @@
 </div>
 
+---
+#### 🚀 Crawl4AI Cloud API — Closed Beta (Launching Soon)
+Reliable, large-scale web extraction, now built to be _**drastically more cost-effective**_ than any of the existing solutions.
+
+👉 **Apply [here](https://forms.gle/E9MyPaNXACnAMaqG7) for early access**
+_We’ll be onboarding in phases and working closely with early users.
+Limited slots._
+
+---
+
 Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for large language models, AI agents, and data pipelines. Fully open source, flexible, and built for real-time performance, **Crawl4AI** empowers developers with unmatched speed, precision, and deployment ease.
 
 > Enjoy using Crawl4AI? Consider **[becoming a sponsor](https://github.com/sponsors/unclecode)** to support ongoing development and community growth!