refactor(crawler): improve HTML handling and cleanup codebase
- Add HTML attribute preservation in GoogleSearchCrawler - Fix lxml import references in utils.py - Remove unused ssl_certificate.json - Clean up imports and code organization in hub.py - Update test case formatting and remove unused image search test BREAKING CHANGE: Removed ssl_certificate.json file which might affect existing certificate validations
This commit is contained in:
@@ -33,6 +33,8 @@ class GoogleSearchCrawler(BaseCrawler):
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
|
||||
keep_attrs=["id", "class"],
|
||||
keep_data_attributes=True,
|
||||
delay_before_return_html=kwargs.get(
|
||||
"delay", 2 if search_type == "image" else 1),
|
||||
js_code=self.js_script if search_type == "image" else None,
|
||||
@@ -99,7 +101,6 @@ class GoogleSearchCrawler(BaseCrawler):
|
||||
"link": "...",
|
||||
"source": "Insider Monkey",
|
||||
"date": "1 hour ago",
|
||||
"imageUrl": "..."
|
||||
}""",
|
||||
query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
|
||||
)
|
||||
|
||||
@@ -1,17 +1,13 @@
|
||||
import importlib
|
||||
import pkgutil
|
||||
from pathlib import Path
|
||||
import logging
|
||||
# crawl4ai/hub.py
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Type
|
||||
import logging
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
import inspect
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# crawl4ai/base.py
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, Any
|
||||
import json
|
||||
import logging
|
||||
|
||||
class BaseCrawler(ABC):
|
||||
def __init__(self):
|
||||
|
||||
@@ -2467,7 +2467,7 @@ def truncate(value, threshold):
|
||||
return value
|
||||
|
||||
def optimize_html(html_str, threshold=200):
|
||||
root = html.fromstring(html_str)
|
||||
root = lxml.html.fromstring(html_str)
|
||||
|
||||
for _element in root.iter():
|
||||
# Process attributes
|
||||
@@ -2482,7 +2482,7 @@ def optimize_html(html_str, threshold=200):
|
||||
if _element.tail and len(_element.tail) > threshold:
|
||||
_element.tail = truncate(_element.tail, threshold)
|
||||
|
||||
return html.tostring(root, encoding='unicode', pretty_print=False)
|
||||
return lxml.html.tostring(root, encoding='unicode', pretty_print=False)
|
||||
|
||||
class HeadPeekr:
|
||||
@staticmethod
|
||||
|
||||
Reference in New Issue
Block a user