refactor: Add group ID for images extracted from srcset.
This commit is contained in:
@@ -228,24 +228,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def process_image_old(img, url, index, total_images):
|
def process_image_old(img, url, index, total_images):
|
||||||
def parse_srcset(srcset_str):
    """Parse a srcset attribute into a list of image candidates.

    The string is tokenized the way the HTML srcset grammar describes it:
    each candidate is a URL (a run of non-whitespace characters) optionally
    followed by descriptors that extend up to the next comma. Unlike
    splitting on the substring "http", this handles relative URLs, URLs
    whose path contains "http", URLs that contain commas, and leading
    whitespace.

    Args:
        srcset_str: Raw value of a srcset attribute; may be None or empty.

    Returns:
        A list of dicts with keys 'url' and 'width', in source order.
        'width' is the last descriptor token with any trailing 'w' removed
        (e.g. '480' for '480w'; a density descriptor such as '2x' is
        returned unchanged), or None when the candidate has no descriptor.
    """
    if not srcset_str:
        return []

    sources = []
    pos = 0
    length = len(srcset_str)

    while pos < length:
        # Skip separators (whitespace and commas) before the next candidate.
        while pos < length and (srcset_str[pos].isspace() or srcset_str[pos] == ','):
            pos += 1
        if pos >= length:
            break

        # The URL is everything up to the next whitespace character.
        start = pos
        while pos < length and not srcset_str[pos].isspace():
            pos += 1
        img_url = srcset_str[start:pos]

        if img_url.endswith(','):
            # A trailing comma terminates the candidate: no descriptors.
            img_url = img_url.rstrip(',')
            descriptor = ''
        else:
            # Descriptors run up to the next comma (or end of string).
            end = srcset_str.find(',', pos)
            if end == -1:
                end = length
            descriptor = srcset_str[pos:end].strip()
            pos = end + 1

        tokens = descriptor.split()
        width = tokens[-1].rstrip('w') if tokens else None
        sources.append({'url': img_url, 'width': width})

    return sources
|
|
||||||
|
|
||||||
#Check if an image has valid display and inside undesired html elements
|
#Check if an image has valid display and inside undesired html elements
|
||||||
def is_valid_image(img, parent, parent_classes):
|
def is_valid_image(img, parent, parent_classes):
|
||||||
@@ -376,12 +359,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
unique_urls = set()
|
unique_urls = set()
|
||||||
image_variants = []
|
image_variants = []
|
||||||
|
|
||||||
|
# Generate a unique group ID for this set of variants
|
||||||
|
group_id = index
|
||||||
|
|
||||||
# Base image info template
|
# Base image info template
|
||||||
base_info = {
|
base_info = {
|
||||||
'alt': alt,
|
'alt': alt,
|
||||||
'desc': find_closest_parent_with_useful_text(img),
|
'desc': find_closest_parent_with_useful_text(img),
|
||||||
'score': score,
|
'score': score,
|
||||||
'type': 'image'
|
'type': 'image',
|
||||||
|
'group_id': group_id # Group ID for this set of variants
|
||||||
}
|
}
|
||||||
|
|
||||||
# Inline function for adding variants
|
# Inline function for adding variants
|
||||||
|
|||||||
Reference in New Issue
Block a user