From 04d16e6d2bf84449550f748f2274b8513e5ee1f8 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sun, 20 Oct 2024 19:25:25 +0800
Subject: [PATCH] Fix Base64 image parsing in WebScrappingStrategy (issue 182)

- Add support for extracting Base64 encoded images
- Improve image format detection to include Base64 images
- Enhance compatibility with locally saved HTML files using Base64 image encoding
---
 CHANGELOG.md                           | 3 +++
 crawl4ai/content_scrapping_strategy.py | 6 +++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59e77217..b79b37ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## [v0.3.72] - 2024-10-20
 
+### Fixed
+- Added support for parsing Base64 encoded images in WebScrappingStrategy
+
 ### Added
 - Forked and integrated a customized version of the html2text library for more control over Markdown generation
 - New configuration options for controlling external content:
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py
index da163539..139779ea 100644
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -127,7 +127,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                 image_width = img.get('width')
                 width_value, width_unit = parse_dimension(image_width)
                 image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
-                image_format = os.path.splitext(img.get('src',''))[1].lower()
+                image_src = img.get('src','')
+                if "data:image/" in image_src:
+                    image_format = image_src.split(',')[0].split(';')[0].split('/')[1]
+                else:
+                    image_format = os.path.splitext(img.get('src',''))[1].lower()
                 # Remove . from format
                 image_format = image_format.strip('.').split('?')[0]
                 score = 0