From c1041b9bbee1338ec89997bdf62e76c6a5f3ada6 Mon Sep 17 00:00:00 2001
From: Aravind Karnam <aravind.karanam@gmail.com>
Date: Wed, 7 May 2025 18:43:29 +0530
Subject: [PATCH] fix: exclude_external_images flag simply discards elements
 ref:https://github.com/unclecode/crawl4ai/issues/345

---
 crawl4ai/content_scraping_strategy.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 1dfbce84..d11e02d0 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
                     # Check flag if we should remove external images
                     if kwargs.get("exclude_external_images", False):
-                        element.decompose()
-                        return False
-                        # src_url_base = src.split('/')[2]
-                        # url_base = url.split('/')[2]
-                        # if url_base not in src_url_base:
-                        #     element.decompose()
-                        #     return False
+                        # Handle relative URLs (which are always from the same domain)
+                        if not src.startswith('http') and not src.startswith('//'):
+                            return True  # Keep relative URLs
+                        
+                        # For absolute URLs, compare the base domains using the existing function
+                        src_base_domain = get_base_domain(src)
+                        url_base_domain = get_base_domain(url)
+                        
+                        # If the domains don't match and both are valid, the image is external
+                        if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                            element.decompose()
+                            return False
 
                     # if kwargs.get('exclude_social_media_links', False):
                     #     if image_src_base_domain in exclude_social_media_domains: