diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 5d5ac836..f889b45c 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -191,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
                 extract_func = partial(self.extract, url)
-                extracted_content.extend(extract_func(ix, section))
+                extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
@@ -201,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
 
             with ThreadPoolExecutor(max_workers=4) as executor:
                 extract_func = partial(self.extract, url)
-                futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
+                # Map each future back to its section index so error records keep it
+                futures = {executor.submit(extract_func, ix, sanitize_input_encode(section)): ix for ix, section in enumerate(merged_sections)}
 
                 for future in as_completed(futures):
-                    extracted_content.extend(future.result())
+                    try:
+                        extracted_content.extend(future.result())
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error in thread execution: {e}")
+                        # Add error information to extracted_content
+                        extracted_content.append({
+                            "index": futures[future],
+                            "error": True,
+                            "tags": ["error"],
+                            "content": str(e)
+                        })
 
         return extracted_content
 
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index c8d4b993..c85ab168 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -96,6 +96,16 @@ def sanitize_html(html):
     return sanitized_html
 
 
+def sanitize_input_encode(text: str) -> str:
+    """Sanitize input to handle potential encoding issues."""
+    try:
+        # Attempt to encode and decode as UTF-8 to handle potential encoding issues
+        return text.encode('utf-8', errors='ignore').decode('utf-8')
+    except UnicodeEncodeError as e:
+        print(f"Warning: Encoding issue detected. Some characters may be lost.\nError: {e}")
+        # Fall back to ASCII if UTF-8 fails
+        return text.encode('ascii', errors='ignore').decode('ascii')
+
 def escape_json_string(s):
     """
     Escapes characters in a string to be JSON safe.