refactor: Add group ID for images extracted from srcset.

This commit is contained in:
UncleCode
2024-11-23 18:00:32 +08:00
parent 0d0cef3438
commit d729aa7d5e

View File

@@ -228,24 +228,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return None
def process_image_old(img, url, index, total_images):
def parse_srcset(srcset_str):
    """Parse a srcset attribute into a list of image candidates.

    Each candidate in a srcset is ``<url>[ <descriptor>]`` and candidates
    are separated by commas.  A URL may itself contain commas (but no
    whitespace), so the string is scanned token by token instead of being
    split on ``","`` or on ``"http"``; the latter breaks relative URLs and
    URLs whose path happens to contain ``http``.

    Args:
        srcset_str: Raw value of the ``srcset`` attribute; may be ``None``
            or empty.

    Returns:
        A list of ``{'url': str, 'width': str | None}`` dicts in source
        order.  ``width`` is the descriptor with a trailing ``w`` removed
        (``"480w"`` -> ``"480"``); other descriptors such as ``"2x"`` are
        passed through unchanged, and ``None`` means no descriptor.
    """
    if not srcset_str:
        return []

    sources = []
    pos = 0
    length = len(srcset_str)
    while pos < length:
        # Skip separators (whitespace and stray commas) before a candidate.
        while pos < length and (srcset_str[pos].isspace() or srcset_str[pos] == ','):
            pos += 1
        if pos >= length:
            break

        # The URL is everything up to the next whitespace character.
        start = pos
        while pos < length and not srcset_str[pos].isspace():
            pos += 1
        img_url = srcset_str[start:pos]

        descriptor = None
        if img_url.endswith(','):
            # A trailing comma ends the candidate: it has no descriptor.
            img_url = img_url.rstrip(',')
        else:
            # The descriptor runs up to the next comma (or end of string).
            end = srcset_str.find(',', pos)
            if end == -1:
                end = length
            descriptor = srcset_str[pos:end].strip() or None
            pos = end + 1

        if img_url:
            width = descriptor.rstrip('w') if descriptor else None
            sources.append({'url': img_url, 'width': width})
    return sources
#Check if an image has valid display and inside undesired html elements
def is_valid_image(img, parent, parent_classes):
@@ -376,12 +359,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
unique_urls = set()
image_variants = []
# Generate a unique group ID for this set of variants
group_id = index
# Base image info template
base_info = {
'alt': alt,
'desc': find_closest_parent_with_useful_text(img),
'score': score,
'type': 'image'
'type': 'image',
'group_id': group_id # Group ID for this set of variants
}
# Inline function for adding variants