feat: Sanitize input and handle encoding issues in LLMExtractionStrategy

2024-07-05 17:37:25 +08:00
parent fb6ed5f000
commit b0ec54b9e9
2 changed files with 6 additions and 6 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -96,7 +96,7 @@ def sanitize_html(html):

    return sanitized_html

-def sanitize_input_encode(self, text: str) -> str:
+def sanitize_input_encode(text: str) -> str:
    """Sanitize input to handle potential encoding issues."""
    try:
        # Attempt to encode and decode as UTF-8 to handle potential encoding issues
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -155,8 +155,8 @@ class WebCrawler:
                    return None
                
                if cached:
-                    html = cached[1]
-                    extracted_content = cached[4]
+                    html = sanitize_input_encode(cached[1])
+                    extracted_content = sanitize_input_encode(cached[4])
                    if screenshot:
                        screenshot_data = cached[9]
                        if not screenshot_data:
@@ -166,7 +166,7 @@ class WebCrawler:
                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)
                    t1 = time.time()
-                    html = self.crawler_strategy.crawl(url, **kwargs)
+                    html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
                    t2 = time.time()
                    if verbose:
                        print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
@@ -213,8 +213,8 @@ class WebCrawler:
            except InvalidCSSSelectorError as e:
                raise ValueError(str(e))
            
-            cleaned_html = result.get("cleaned_html", "")
-            markdown = result.get("markdown", "")
+            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+            markdown = sanitize_input_encode(result.get("markdown", ""))
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})