refactor(crawler): improve HTML handling and cleanup codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed the ssl_certificate.json file, which might affect existing certificate validations
This commit is contained in:
UncleCode
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions

View File

@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
return value
def optimize_html(html_str, threshold=200):
root = html.fromstring(html_str)
root = lxml.html.fromstring(html_str)
for _element in root.iter():
# Process attributes
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
if _element.tail and len(_element.tail) > threshold:
_element.tail = truncate(_element.tail, threshold)
return html.tostring(root, encoding='unicode', pretty_print=False)
return lxml.html.tostring(root, encoding='unicode', pretty_print=False)
class HeadPeekr:
@staticmethod