refactor(crawler): improve HTML handling and cleanup codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed ssl_certificate.json file, which might affect existing certificate validations
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
    return value

 def optimize_html(html_str, threshold=200):
-    root = html.fromstring(html_str)
+    root = lxml.html.fromstring(html_str)
    
    for _element in root.iter():
        # Process attributes
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
        if _element.tail and len(_element.tail) > threshold:
            _element.tail = truncate(_element.tail, threshold)
    
-    return html.tostring(root, encoding='unicode', pretty_print=False)
+    return lxml.html.tostring(root, encoding='unicode', pretty_print=False)

 class HeadPeekr:
    @staticmethod