feat: Sanitize input and handle encoding issues in LLMExtractionStrategy

This commit modifies the LLMExtractionStrategy class in `extraction_strategy.py` to sanitize input and handle potential encoding issues. The `sanitize_input_encode` function is introduced in `utils.py` to encode and decode the input text as UTF-8 or ASCII, depending on the encoding issues encountered. If an encoding error occurs, the function falls back to ASCII encoding and logs a warning message. This change improves the robustness of the extraction process and ensures that characters are not lost due to encoding issues.
This commit is contained in:
unclecode
2024-07-05 17:30:58 +08:00
parent 597fe8bdb7
commit fb6ed5f000
2 changed files with 24 additions and 3 deletions

View File

@@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
# Sequential processing with a delay
for ix, section in enumerate(merged_sections):
extract_func = partial(self.extract, url)
extracted_content.extend(extract_func(ix, section))
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
time.sleep(0.5) # 500 ms delay between each processing
else:
# Parallel processing using ThreadPoolExecutor
@@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
with ThreadPoolExecutor(max_workers=4) as executor:
extract_func = partial(self.extract, url)
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
for future in as_completed(futures):
extracted_content.extend(future.result())
try:
extracted_content.extend(future.result())
except Exception as e:
if self.verbose:
print(f"Error in thread execution: {e}")
# Add error information to extracted_content
extracted_content.append({
"index": 0,
"error": True,
"tags": ["error"],
"content": str(e)
})
return extracted_content

View File

@@ -96,6 +96,16 @@ def sanitize_html(html):
return sanitized_html
def sanitize_input_encode(self, text: str) -> str:
"""Sanitize input to handle potential encoding issues."""
try:
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
return text.encode('utf-8', errors='ignore').decode('utf-8')
except UnicodeEncodeError as e:
print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
# Fall back to ASCII if UTF-8 fails
return text.encode('ascii', errors='ignore').decode('ascii')
def escape_json_string(s):
"""
Escapes characters in a string to be JSON safe.