From 16f98cebc0b0e75c0842aa4d13e45cbaea9ec8af Mon Sep 17 00:00:00 2001 From: datehoer Date: Tue, 27 Aug 2024 09:44:35 +0800 Subject: [PATCH] Replace base64 image URLs with '' --- crawl4ai/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 64ce9f57..2ea6fec7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: return node body = flatten_nested_elements(body) - + base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') + for img in imgs: + src = img.get('src', '') + if base64_pattern.match(src): + # Replace base64 data with empty string + img['src'] = base64_pattern.sub('', src) cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html)