refactor(crawler): improve HTML handling and cleanup codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed ssl_certificate.json file, which might affect existing certificate validations
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -33,6 +33,8 @@ class GoogleSearchCrawler(BaseCrawler):
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+                keep_attrs=["id", "class"],
+                keep_data_attributes=True,
                delay_before_return_html=kwargs.get(
                    "delay", 2 if search_type == "image" else 1),
                js_code=self.js_script if search_type == "image" else None,
@@ -99,7 +101,6 @@ class GoogleSearchCrawler(BaseCrawler):
            "link": "...",
            "source": "Insider Monkey",
            "date": "1 hour ago",
-            "imageUrl": "..."
        }""",
                query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
            )