From fb6ed5f000f38b0faaff79b91c4eb25a59542622 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 5 Jul 2024 17:30:58 +0800 Subject: [PATCH] feat: Sanitize input and handle encoding issues in LLMExtractionStrategy This commit modifies the LLMExtractionStrategy class in `extraction_strategy.py` to sanitize each section before it is passed to `extract`. The `sanitize_input_encode` function is introduced in `utils.py`; it round-trips the input text through UTF-8 with `errors='ignore'`, dropping any characters that cannot be encoded. If a `UnicodeEncodeError` is raised anyway, the function prints a warning message and falls back to ASCII encoding, again dropping characters that cannot be encoded. In the parallel path, an exception raised by a worker thread is now caught and recorded as an error entry in the extracted content. This change improves the robustness of the extraction process; note that characters that cannot be encoded are dropped rather than preserved. --- crawl4ai/extraction_strategy.py | 17 ++++++++++++++--- crawl4ai/utils.py | 10 ++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 5d5ac836..f889b45c 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy): # Sequential processing with a delay for ix, section in enumerate(merged_sections): extract_func = partial(self.extract, url) - extracted_content.extend(extract_func(ix, section)) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) time.sleep(0.5) # 500 ms delay between each processing else: # Parallel processing using ThreadPoolExecutor @@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy): with ThreadPoolExecutor(max_workers=4) as executor: extract_func = partial(self.extract, url) - futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)] + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] for future in as_completed(futures): - extracted_content.extend(future.result()) + try: + 
extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) return extracted_content diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index c8d4b993..c85ab168 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -96,6 +96,16 @@ def sanitize_html(html): return sanitized_html +def sanitize_input_encode(text: str) -> str: + """Sanitize input to handle potential encoding issues.""" + try: + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + def escape_json_string(s): """ Escapes characters in a string to be JSON safe.