From 04d16e6d2bf84449550f748f2274b8513e5ee1f8 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Sun, 20 Oct 2024 19:25:25 +0800
Subject: [PATCH] Fix Base64 image parsing in WebScrappingStrategy (issue 182)

- Add support for extracting Base64-encoded images
- Improve image format detection to include Base64 images
- Enhance compatibility with locally saved HTML files that embed images as Base64 data URIs
---
 CHANGELOG.md                           | 3 +++
 crawl4ai/content_scrapping_strategy.py | 6 +++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59e77217..b79b37ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## [v0.3.72] - 2024-10-20
 
+### Fixed
+- Added support for parsing Base64-encoded images in WebScrappingStrategy
+
 ### Added
 - Forked and integrated a customized version of the html2text library for more control over Markdown generation
 - New configuration options for controlling external content:
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py
index da163539..139779ea 100644
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -127,7 +127,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                 image_width =  img.get('width')
                 width_value, width_unit = parse_dimension(image_width)
                 image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
-                image_format = os.path.splitext(img.get('src',''))[1].lower()
+                image_src = img.get('src','')
+                if "data:image/" in image_src:
+                    image_format = image_src.split(',')[0].split(';')[0].split('/')[1]
+                else:
+                    image_format = os.path.splitext(img.get('src',''))[1].lower()
                 # Remove . from format
                 image_format = image_format.strip('.').split('?')[0]
                 score = 0