feat(crawler): enhance JavaScript execution and PDF processing

Add JavaScript execution result handling and improve PDF processing capabilities: - Add js_execution_result to CrawlResult and AsyncCrawlResponse models - Implement execution result capture in AsyncPlaywrightCrawlerStrategy - Add batch processing for PDF pages with configurable batch size - Enhance JsonElementExtractionStrategy with better schema generation - Add HTML optimization utilities BREAKING CHANGE: PDF processing now uses batch processing by default
2025-01-29 21:03:39 +08:00
parent f8fd9d9eff
commit 31938fb922
7 changed files with 150 additions and 20 deletions
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -103,6 +103,7 @@ class CrawlResult(BaseModel):
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
+    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
@@ -126,6 +127,7 @@ class CrawlResult(BaseModel):
 class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
+    js_execution_result: Optional[Dict[str, Any]] = None
    status_code: int
    screenshot: Optional[str] = None
    pdf_data: Optional[bytes] = None