diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 64ce9f57..2ea6fec7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: return node body = flatten_nested_elements(body) - + base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') + for img in imgs: + src = img.get('src', '') + if base64_pattern.match(src): + # Replace base64 data with empty string + img['src'] = base64_pattern.sub('', src) cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html)