refactor(crawler): improve HTML handling and cleanup codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed the ssl_certificate.json file, which might affect existing certificate validations
This commit is contained in:
UncleCode
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions

View File

@@ -33,6 +33,8 @@ class GoogleSearchCrawler(BaseCrawler):
async with AsyncWebCrawler(config=browser_config) as crawler:
config = CrawlerRunConfig(
cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
keep_attrs=["id", "class"],
keep_data_attributes=True,
delay_before_return_html=kwargs.get(
"delay", 2 if search_type == "image" else 1),
js_code=self.js_script if search_type == "image" else None,
@@ -99,7 +101,6 @@ class GoogleSearchCrawler(BaseCrawler):
"link": "...",
"source": "Insider Monkey",
"date": "1 hour ago",
"imageUrl": "..."
}""",
query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
)

View File

@@ -1,17 +1,13 @@
import importlib
import pkgutil
from pathlib import Path
import logging
# crawl4ai/hub.py
from abc import ABC, abstractmethod
from typing import Dict, Type
import logging
import importlib
from pathlib import Path
import inspect
logger = logging.getLogger(__name__)
# crawl4ai/base.py
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any
import json
import logging
class BaseCrawler(ABC):
def __init__(self):

View File

@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
return value
def optimize_html(html_str, threshold=200):
root = html.fromstring(html_str)
root = lxml.html.fromstring(html_str)
for _element in root.iter():
# Process attributes
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
if _element.tail and len(_element.tail) > threshold:
_element.tail = truncate(_element.tail, threshold)
return html.tostring(root, encoding='unicode', pretty_print=False)
return lxml.html.tostring(root, encoding='unicode', pretty_print=False)
class HeadPeekr:
@staticmethod