Compare commits
3 Commits
patch/gene
...
pdf_proces
| Author | SHA1 | Date | |
|---|---|---|---|
| | d5a0866e03 | | |
| | a87e8c1c9e | | |
| | 835e3c56fe | | |
19
.github/workflows/docker-release.yml
vendored
19
.github/workflows/docker-release.yml
vendored
@@ -11,6 +11,25 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Free up disk space
|
||||
run: |
|
||||
echo "=== Disk space before cleanup ==="
|
||||
df -h
|
||||
|
||||
# Remove unnecessary tools and libraries (frees ~25GB)
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo rm -rf /opt/ghc
|
||||
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
||||
sudo rm -rf /usr/local/share/boost
|
||||
sudo rm -rf /usr/share/swift
|
||||
|
||||
# Clean apt cache
|
||||
sudo apt-get clean
|
||||
|
||||
echo "=== Disk space after cleanup ==="
|
||||
df -h
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
|
||||
@@ -989,8 +989,53 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
mhtml_data = None
|
||||
|
||||
if config.pdf:
|
||||
pdf_data = await self.export_pdf(page)
|
||||
|
||||
if config.css_selector:
|
||||
# Extract content with styles and fixed image URLs
|
||||
content_with_styles = await page.evaluate(f"""
|
||||
() => {{
|
||||
const element = document.querySelector("{config.css_selector}");
|
||||
const clone = element.cloneNode(true);
|
||||
|
||||
// Fix all image URLs to absolute
|
||||
clone.querySelectorAll('img').forEach(img => {{
|
||||
if (img.src) img.src = img.src; // This converts to absolute URL
|
||||
}});
|
||||
|
||||
// Get all styles
|
||||
const styles = Array.from(document.styleSheets)
|
||||
.map(sheet => {{
|
||||
try {{
|
||||
return Array.from(sheet.cssRules).map(rule => rule.cssText).join('\\n');
|
||||
}} catch(e) {{
|
||||
return '';
|
||||
}}
|
||||
}}).join('\\n');
|
||||
|
||||
return {{
|
||||
html: clone.outerHTML,
|
||||
styles: styles,
|
||||
baseUrl: window.location.origin
|
||||
}};
|
||||
}}
|
||||
""")
|
||||
|
||||
# Create page with base URL for relative resources
|
||||
temp_page = await context.new_page()
|
||||
await temp_page.goto(content_with_styles['baseUrl']) # Set the base URL
|
||||
await temp_page.set_content(f"""
|
||||
<html>
|
||||
<head>
|
||||
<base href="{content_with_styles['baseUrl']}">
|
||||
<style>{content_with_styles['styles']}</style>
|
||||
</head>
|
||||
<body>{content_with_styles['html']}</body>
|
||||
</html>
|
||||
""")
|
||||
|
||||
pdf_data = await self.export_pdf(temp_page)
|
||||
await temp_page.close()
|
||||
else:
|
||||
pdf_data = await self.export_pdf(page)
|
||||
if config.capture_mhtml:
|
||||
mhtml_data = await self.capture_mhtml(page)
|
||||
|
||||
|
||||
@@ -1378,10 +1378,9 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
# Simply strip the markdown formatting
|
||||
raw_json = response.choices[0].message.content.replace('```json\n', '').replace('\n```', '')
|
||||
|
||||
# Extract and return schema
|
||||
return json.loads(raw_json)
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
|
||||
Reference in New Issue
Block a user