refactor(crawler): improve HTML handling and cleanup codebase
- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed ssl_certificate.json file, which might affect existing certificate validations
This commit is contained in:
@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
|
||||
return value
|
||||
|
||||
def optimize_html(html_str, threshold=200):
|
||||
root = html.fromstring(html_str)
|
||||
root = lxml.html.fromstring(html_str)
|
||||
|
||||
for _element in root.iter():
|
||||
# Process attributes
|
||||
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
|
||||
if _element.tail and len(_element.tail) > threshold:
|
||||
_element.tail = truncate(_element.tail, threshold)
|
||||
|
||||
return html.tostring(root, encoding='unicode', pretty_print=False)
|
||||
return lxml.html.tostring(root, encoding='unicode', pretty_print=False)
|
||||
|
||||
class HeadPeekr:
|
||||
@staticmethod
|
||||
|
||||
Reference in New Issue
Block a user