refactor: Add group ID for images extracted from srcset.

2024-11-23 18:00:32 +08:00
parent 0d0cef3438
commit d729aa7d5e
1 changed files with 6 additions and 19 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -228,24 +228,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                return None

        def process_image_old(img, url, index, total_images):
-            def parse_srcset(srcset_str):
-                """Parse srcset attribute into list of image URLs with their sizes."""
-                if not srcset_str:
-                    return []
-                
-                sources = []
-                # Split on http/https and filter empty strings
-                urls = [f"http{part}" for part in srcset_str.split("http") if part]
-                
-                for url in urls:
-                    # Remove trailing comma and whitespace, then split to get width
-                    url = url.strip().rstrip(',')
-                    parts = url.rsplit(' ', 1)
-                    img_url = parts[0].strip()
-                    width = parts[1].rstrip('w') if len(parts) > 1 else None
-                    sources.append({'url': img_url, 'width': width})
-                
-                return sources          
+                   
            
            #Check if an image has valid display and inside undesired html elements
            def is_valid_image(img, parent, parent_classes):
@@ -376,12 +359,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            unique_urls = set()
            image_variants = []
            
+            # Generate a unique group ID for this set of variants
+            group_id = index 
+            
            # Base image info template
            base_info = {
                'alt': alt,
                'desc': find_closest_parent_with_useful_text(img),
                'score': score,
-                'type': 'image'
+                'type': 'image',
+                'group_id': group_id # Group ID for this set of variants
            }

            # Inline function for adding variants