feat(crawler): enhance JavaScript execution and PDF processing

Add JavaScript execution result handling and improve PDF processing capabilities:
- Add js_execution_result to CrawlResult and AsyncCrawlResponse models
- Implement execution result capture in AsyncPlaywrightCrawlerStrategy
- Add batch processing for PDF pages with configurable batch size
- Enhance JsonElementExtractionStrategy with better schema generation
- Add HTML optimization utilities

BREAKING CHANGE: PDF processing now uses batch processing by default
This commit is contained in:
UncleCode
2025-01-29 21:03:39 +08:00
parent f8fd9d9eff
commit 31938fb922
7 changed files with 150 additions and 20 deletions

View File

@@ -22,7 +22,7 @@ import cProfile
import pstats
from functools import wraps
import asyncio
from lxml import html, etree
import sqlite3
import hashlib
from urllib.parse import urljoin, urlparse
@@ -2207,3 +2207,26 @@ def get_error_context(exc_info, context_lines: int = 5):
"function": func_name,
"code_context": code_context,
}
def truncate(value, threshold):
    """Cut ``value`` down to its first ``threshold`` items when it is longer.

    A value whose length does not exceed ``threshold`` is returned unchanged.
    Otherwise the leading slice ``value[:threshold]`` is returned with
    ``'...'`` appended to signal that something was removed.
    """
    # Guard clause: nothing to do when the value already fits.
    if len(value) <= threshold:
        return value
    shortened = value[:threshold]
    return shortened + '...'
def optimize_html(html_str, threshold=200):
    """Return ``html_str`` re-serialized with overly long values shortened.

    The markup is parsed with ``html.fromstring``. For every node yielded by
    the tree's ``iter()``, each attribute value is passed through
    ``truncate``, and the node's ``text`` and ``tail`` are truncated when
    they are longer than ``threshold``. The resulting tree is serialized
    back to a unicode string without pretty-printing.
    """
    tree = html.fromstring(html_str)

    for node in tree.iter():
        # Snapshot the attribute pairs first so values can be rewritten
        # while walking over them.
        attributes = node.attrib
        for name, current in list(attributes.items()):
            attributes[name] = truncate(current, threshold)

        # Text that directly follows the node's opening tag.
        leading = node.text
        if leading and len(leading) > threshold:
            node.text = truncate(leading, threshold)

        # Text that follows the node's closing tag.
        trailing = node.tail
        if trailing and len(trailing) > threshold:
            node.tail = truncate(trailing, threshold)

    return html.tostring(tree, encoding='unicode', pretty_print=False)