From fb6ed5f000f38b0faaff79b91c4eb25a59542622 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Fri, 5 Jul 2024 17:30:58 +0800
Subject: [PATCH] feat: Sanitize input and handle encoding issues in
 LLMExtractionStrategy

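This commit modifies the LLMExtractionStrategy class in `extraction_strategy.py` to sanitize each section before extraction and to handle potential encoding issues. The `sanitize_input_encode` function is introduced in `utils.py`; it round-trips the input text through UTF-8 with `errors='ignore'`, dropping any characters that cannot be encoded. If an encoding error still occurs, the function prints a warning message and falls back to ASCII encoding, again ignoring characters that cannot be encoded. In the parallel path, exceptions raised by worker threads are now caught and recorded as error entries in the extracted content instead of aborting the run. These changes improve the robustness of the extraction process: problematic characters are dropped rather than causing the extraction to fail.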
---
 crawl4ai/extraction_strategy.py | 17 ++++++++++++++---
 crawl4ai/utils.py               | 10 ++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 5d5ac836..f889b45c 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
                 extract_func = partial(self.extract, url)
-                extracted_content.extend(extract_func(ix, section))
+                extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
@@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
             
             with ThreadPoolExecutor(max_workers=4) as executor:
                 extract_func = partial(self.extract, url)
-                futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
+                futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
                 
                 for future in as_completed(futures):
-                    extracted_content.extend(future.result())
+                    try:
+                        extracted_content.extend(future.result())
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error in thread execution: {e}")
+                        # Add error information to extracted_content
+                        extracted_content.append({
+                            "index": 0,
+                            "error": True,
+                            "tags": ["error"],
+                            "content": str(e)
+                        })
 
         
         return extracted_content        
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index c8d4b993..c85ab168 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -96,6 +96,16 @@ def sanitize_html(html):
 
     return sanitized_html
 
+def sanitize_input_encode(text: str) -> str:
+    """Return ``text`` with any characters that cannot be encoded as UTF-8 dropped."""
+    try:
+        # errors='ignore' silently discards unencodable code points (lone surrogates).
+        return text.encode('utf-8', errors='ignore').decode('utf-8')
+    except UnicodeEncodeError as e:
+        print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
+        # Defensive only: 'ignore' should prevent this; fall back to plain ASCII.
+        return text.encode('ascii', errors='ignore').decode('ascii')
+
 def escape_json_string(s):
     """
     Escapes characters in a string to be JSON safe.