feat(crawler): enhance JavaScript execution and PDF processing

Add JavaScript execution result handling and improve PDF processing capabilities:

- Add js_execution_result to CrawlResult and AsyncCrawlResponse models
- Implement execution result capture in AsyncPlaywrightCrawlerStrategy
- Add batch processing for PDF pages with configurable batch size
- Enhance JsonElementExtractionStrategy with better schema generation
- Add HTML optimization utilities

BREAKING CHANGE: PDF processing now uses batch processing by default
2025-01-29 21:03:39 +08:00
parent f8fd9d9eff
commit 31938fb922
7 changed files with 150 additions and 20 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -22,7 +22,7 @@ import cProfile
 import pstats
 from functools import wraps
 import asyncio
-
+from lxml import html, etree
 import sqlite3
 import hashlib
 from urllib.parse import urljoin, urlparse
@@ -2207,3 +2207,26 @@ def get_error_context(exc_info, context_lines: int = 5):
        "function": func_name,
        "code_context": code_context,
    }
+
+def truncate(value, threshold):
+    """Shorten ``value`` to at most ``threshold`` characters plus an ellipsis.
+
+    Args:
+        value: The string to shorten. ``None`` is returned unchanged.
+        threshold: Maximum number of leading characters to keep. Negative
+            values are treated as 0.
+
+    Returns:
+        ``value`` itself when it is ``None`` or no longer than the limit;
+        otherwise the first ``threshold`` characters followed by ``'...'``.
+    """
+    if value is None:
+        return value
+    # A negative threshold would make the slice count from the END of the
+    # string (and would even turn "" into "..."), so clamp it to zero.
+    limit = max(threshold, 0)
+    if len(value) > limit:
+        return value[:limit] + '...'  # Add ellipsis to indicate truncation
+    return value
+
+def optimize_html(html_str, threshold=200):
+    """Return ``html_str`` with over-long attribute values and text shortened.
+
+    The markup is parsed with ``lxml.html``; every attribute value, element
+    text and tail text longer than ``threshold`` characters is cut down with
+    ``truncate`` (which appends ``'...'``), and the tree is serialized back
+    to a string.
+
+    Args:
+        html_str: HTML markup to shrink. Empty or whitespace-only input is
+            returned unchanged without being parsed.
+        threshold: Maximum number of characters kept per attribute value or
+            text node before truncation. Defaults to 200.
+
+    Returns:
+        The serialized HTML as a unicode string (not pretty-printed).
+
+    NOTE(review): ``lxml.html.fromstring`` may normalize the markup or wrap
+    multi-element fragments in a container element, so the output is not
+    guaranteed to be byte-identical to the input even when nothing is
+    truncated -- confirm against the lxml documentation if that matters.
+    """
+    # lxml refuses to parse an empty document (it raises a parser error),
+    # and there is nothing to shorten anyway, so hand blank input back as-is.
+    if not html_str or not html_str.strip():
+        return html_str
+
+    root = html.fromstring(html_str)
+
+    for element in root.iter():
+        # Process attributes; snapshot the items so values can be rewritten
+        # while looping, and only write back the ones that actually changed.
+        for attr, attr_value in list(element.attrib.items()):
+            shortened = truncate(attr_value, threshold)
+            if shortened != attr_value:
+                element.attrib[attr] = shortened
+
+        # Process text content (text before the first child element)
+        if element.text and len(element.text) > threshold:
+            element.text = truncate(element.text, threshold)
+
+        # Process tail text (text following the element's closing tag)
+        if element.tail and len(element.tail) > threshold:
+            element.tail = truncate(element.tail, threshold)
+
+    return html.tostring(root, encoding='unicode', pretty_print=False)