feat: Sanitize input and handle encoding issues in LLMExtractionStrategy
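This commit modifies the LLMExtractionStrategy class in `extraction_strategy.py` to sanitize each section before extraction and to handle potential encoding issues. The `sanitize_input_encode` function is introduced in `utils.py`; it round-trips the input text through UTF-8 with `errors='ignore'`, so any characters that cannot be encoded are silently dropped. If a `UnicodeEncodeError` is still raised, the function prints a warning message and falls back to an ASCII round-trip, which drops all non-ASCII characters. The parallel code path now also catches exceptions raised by worker futures and appends an error entry to the extracted content instead of aborting. This change improves the robustness of the extraction process: malformed characters no longer cause the extraction to fail, at the cost of removing the characters that cannot be encoded.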
This commit is contained in:
@@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for ix, section in enumerate(merged_sections):
|
for ix, section in enumerate(merged_sections):
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
extracted_content.extend(extract_func(ix, section))
|
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
@@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
|
try:
|
||||||
extracted_content.extend(future.result())
|
extracted_content.extend(future.result())
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Error in thread execution: {e}")
|
||||||
|
# Add error information to extracted_content
|
||||||
|
extracted_content.append({
|
||||||
|
"index": 0,
|
||||||
|
"error": True,
|
||||||
|
"tags": ["error"],
|
||||||
|
"content": str(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|||||||
@@ -96,6 +96,16 @@ def sanitize_html(html):
|
|||||||
|
|
||||||
return sanitized_html
|
return sanitized_html
|
||||||
|
|
||||||
|
def sanitize_input_encode(text: str) -> str:
    """Return *text* with characters that cannot be encoded removed.

    The text is round-tripped through UTF-8 with ``errors='ignore'``, which
    drops anything UTF-8 cannot represent (for example lone surrogate code
    points) and leaves every other character untouched.

    This is a module-level helper, so it takes no ``self``: callers invoke it
    as ``sanitize_input_encode(section)``.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string. It may be shorter than the input when
        unencodable characters were dropped.
    """
    try:
        # Round-trip through UTF-8; unencodable characters are dropped.
        return text.encode('utf-8', errors='ignore').decode('utf-8')
    except UnicodeEncodeError as e:
        # Defensive only: with errors='ignore' the encode step is not expected
        # to raise, but keep the ASCII fallback in case it ever does.
        print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
        # Fall back to ASCII if UTF-8 fails; all non-ASCII characters are dropped.
        return text.encode('ascii', errors='ignore').decode('ascii')
|
||||||
|
|
||||||
def escape_json_string(s):
|
def escape_json_string(s):
|
||||||
"""
|
"""
|
||||||
Escapes characters in a string to be JSON safe.
|
Escapes characters in a string to be JSON safe.
|
||||||
|
|||||||
Reference in New Issue
Block a user