feat: Sanitize input and handle encoding issues in LLMExtractionStrategy

This commit is contained in:
unclecode
2024-07-05 17:37:25 +08:00
parent fb6ed5f000
commit b0ec54b9e9
2 changed files with 6 additions and 6 deletions

View File

@@ -96,7 +96,7 @@ def sanitize_html(html):
return sanitized_html
-def sanitize_input_encode(self, text: str) -> str:
+def sanitize_input_encode(text: str) -> str:
"""Sanitize input to handle potential encoding issues."""
try:
# Attempt to encode and decode as UTF-8 to handle potential encoding issues

View File

@@ -155,8 +155,8 @@ class WebCrawler:
return None
if cached:
-html = cached[1]
-extracted_content = cached[4]
+html = sanitize_input_encode(cached[1])
+extracted_content = sanitize_input_encode(cached[4])
if screenshot:
screenshot_data = cached[9]
if not screenshot_data:
@@ -166,7 +166,7 @@ class WebCrawler:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time()
-html = self.crawler_strategy.crawl(url, **kwargs)
+html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
t2 = time.time()
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
@@ -213,8 +213,8 @@ class WebCrawler:
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
-cleaned_html = result.get("cleaned_html", "")
-markdown = result.get("markdown", "")
+cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+markdown = sanitize_input_encode(result.get("markdown", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})