refactor(crawler): improve HTML handling and cleanup codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed ssl_certificate.json file, which might affect existing certificate validations
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -33,6 +33,8 @@ class GoogleSearchCrawler(BaseCrawler):
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+                keep_attrs=["id", "class"],
+                keep_data_attributes=True,
                delay_before_return_html=kwargs.get(
                    "delay", 2 if search_type == "image" else 1),
                js_code=self.js_script if search_type == "image" else None,
@@ -99,7 +101,6 @@ class GoogleSearchCrawler(BaseCrawler):
            "link": "...",
            "source": "Insider Monkey",
            "date": "1 hour ago",
-            "imageUrl": "..."
        }""",
                query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
            )
--- a/crawl4ai/hub.py
+++ b/crawl4ai/hub.py
@@ -1,17 +1,13 @@
-import importlib
-import pkgutil
-from pathlib import Path
-import logging
+# crawl4ai/hub.py
+from abc import ABC, abstractmethod
 from typing import Dict, Type
+import logging
+import importlib
+from pathlib import Path
 import inspect

 logger = logging.getLogger(__name__)

-# crawl4ai/base.py
-from abc import ABC, abstractmethod
-from typing import Optional, Dict, Any
-import json
-import logging

 class BaseCrawler(ABC):
    def __init__(self):
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
    return value

 def optimize_html(html_str, threshold=200):
-    root = html.fromstring(html_str)
+    root = lxml.html.fromstring(html_str)
    
    for _element in root.iter():
        # Process attributes
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
        if _element.tail and len(_element.tail) > threshold:
            _element.tail = truncate(_element.tail, threshold)
    
-    return html.tostring(root, encoding='unicode', pretty_print=False)
+    return lxml.html.tostring(root, encoding='unicode', pretty_print=False)

 class HeadPeekr:
    @staticmethod