feat: Sanitize input and handle encoding issues in LLMExtractionStrategy

This commit modifies the LLMExtractionStrategy class in `extraction_strategy.py` to sanitize input and handle potential encoding issues. The `sanitize_input_encode` function is introduced in `utils.py`; it round-trips the input text through UTF-8 (encode, then decode) and silently drops any characters that cannot be encoded. If an encoding error is nevertheless raised, the function prints a warning message and falls back to an ASCII round-trip, which likewise drops unencodable characters. This change improves the robustness of the extraction process: text with encoding problems no longer aborts extraction, at the cost of losing the characters that cannot be encoded.
2024-07-05 17:30:58 +08:00
parent 597fe8bdb7
commit fb6ed5f000
2 changed files with 24 additions and 3 deletions
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
            # Sequential processing with a delay
            for ix, section in enumerate(merged_sections):
                extract_func = partial(self.extract, url)
-                extracted_content.extend(extract_func(ix, section))
+                extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
                time.sleep(0.5)  # 500 ms delay between each processing
        else:
            # Parallel processing using ThreadPoolExecutor
@@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
            
            with ThreadPoolExecutor(max_workers=4) as executor:
                extract_func = partial(self.extract, url)
-                futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
+                futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
                
                for future in as_completed(futures):
-                    extracted_content.extend(future.result())
+                    try:
+                        extracted_content.extend(future.result())
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error in thread execution: {e}")
+                        # Add error information to extracted_content
+                        extracted_content.append({
+                            "index": 0,
+                            "error": True,
+                            "tags": ["error"],
+                            "content": str(e)
+                        })

        
        return extracted_content        
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -96,6 +96,16 @@ def sanitize_html(html):

    return sanitized_html

+def sanitize_input_encode(text: str) -> str:
+    """Return ``text`` round-tripped through UTF-8; unencodable characters (e.g. lone surrogates) are dropped."""
+    try:
+        # errors='ignore' discards characters UTF-8 cannot encode instead of raising
+        return text.encode('utf-8', errors='ignore').decode('utf-8')
+    except UnicodeEncodeError as e:
+        print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
+        # Defensive fallback to ASCII; not expected to run, since errors='ignore' suppresses encode errors
+        return text.encode('ascii', errors='ignore').decode('ascii')
+
 def escape_json_string(s):
    """
    Escapes characters in a string to be JSON safe.