import os
import re
from urllib.parse import urljoin, urlparse

# Substrings that mark an image as UI chrome (not page content) when found in
# its src, its alt text, or any class of its parent element.
_UI_CHROME_MARKERS = ('button', 'icon', 'logo')
# Parent tags whose images are controls rather than content.
_UI_PARENT_TAGS = ('button', 'input')
# Extensions as returned by os.path.splitext, i.e. *with* the leading dot.
_PREFERRED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.webp')
# CSS units that are relative to the container/viewport rather than absolute.
_RELATIVE_UNITS = ('%', 'vh', 'vw', 'vmin', 'vmax')
# Seconds to wait for the HEAD request before giving up on the size signal.
_IMAGE_HEAD_TIMEOUT = 5


def _is_valid_image(img, parent, parent_classes):
    """Return True when *img* is displayed, has a src and is not UI chrome."""
    # Drop all whitespace so "display: none" and "display:none" both match.
    style = ''.join((img.get('style') or '').split()).lower()
    if 'display:none' in style:
        return False
    src = img.get('src', '')
    if not src:
        return False
    if parent.name in _UI_PARENT_TAGS:
        return False
    candidates = [src, img.get('alt', ''), *parent_classes]
    return not any(
        marker in text for text in candidates for marker in _UI_CHROME_MARKERS
    )


def _parse_dimension(dimension):
    """Split a height/width attribute such as ``"150px"`` into ``(150, 'px')``.

    Returns ``(None, None)`` when the attribute is missing or does not start
    with digits. A bare number defaults to the ``'px'`` unit.
    """
    if not dimension:
        return None, None
    match = re.match(r"(\d+)(\D*)", dimension)
    if not match:
        return None, None
    unit = match.group(2).strip().lower() or 'px'
    return int(match.group(1)), unit


def _fetch_image_file_size(img, base_url):
    """Return the image's Content-Length in bytes, or 0 when it is unknown.

    Issues one blocking HEAD request per image. Any request failure (bad
    scheme such as ``data:``, timeout, connection error) is treated as
    "size unknown" so a single unreachable image cannot abort the crawl.
    """
    # Third-party dependency, imported lazily so this block stays importable
    # (and the cheap rejection paths stay usable) without it.
    import requests

    # A relative src is resolved against the page URL; an absolute (e.g. CDN)
    # URL passes through urljoin unchanged.
    img_url = urljoin(base_url, img.get('src'))
    try:
        response = requests.head(img_url, timeout=_IMAGE_HEAD_TIMEOUT)
    except requests.RequestException:
        return 0
    if response.status_code != 200:
        print(f"Failed to retrieve file size for {img_url}")
        return 0
    try:
        return int(response.headers.get('Content-Length') or 0)
    except ValueError:
        # Header present but not a plain integer.
        return 0


def _score_image_for_usefulness(img, base_url, index, images_count):
    """Score *img* by adding one point per satisfied usefulness signal.

    Signals: large declared height, large declared width, file size above
    10000 bytes, non-empty alt text, a preferred file format, and a position
    in the first half of the page's images.
    """
    score = 0
    for attribute in ('height', 'width'):
        value, unit = _parse_dimension(img.get(attribute))
        if not value:
            continue
        if unit == 'px' and value > 150:
            score += 1
        elif unit in _RELATIVE_UNITS and value > 30:
            score += 1
    if _fetch_image_file_size(img, base_url) > 10000:
        score += 1
    # A missing alt (None) must not earn the point, only real alt text.
    if img.get('alt'):
        score += 1
    # Take the extension from the URL path so a query string cannot hide it.
    extension = os.path.splitext(urlparse(img.get('src', '')).path)[1].lower()
    if extension in _PREFERRED_IMAGE_EXTENSIONS:
        score += 1
    if images_count and index / images_count < 0.5:
        score += 1
    return score


def _find_closest_parent_with_useful_text(tag, min_words):
    """Return the text of the nearest ancestor holding at least *min_words* words.

    Walks up from *tag* one parent at a time; returns None when no ancestor
    has enough text.
    """
    ancestor = tag.parent
    while ancestor is not None:
        text_content = ancestor.get_text(separator='. ', strip=True)
        if len(text_content.split()) >= min_words:
            return text_content
        ancestor = ancestor.parent
    return None


def process_image(img, url, index, total_images):
    """Filter and describe one ``<img>`` element.

    Args:
        img: The image tag; must expose ``get`` and ``parent``.
        url: URL of the page, used to resolve a relative ``src``.
        index: Position of the image among all images of the page.
        total_images: Number of images on the page.

    Returns:
        A dict with keys ``src``, ``alt``, ``desc``, ``score`` and ``type``,
        or None when the image is hidden, looks like UI chrome, or scores
        2 or less.

    Note:
        ``word_count_threshold`` is not a parameter: it is read from the
        enclosing scope in which this function is defined.
    """
    parent = img.parent
    if parent is None:
        # Detached tag: there is no context to validate or describe it with.
        return None
    if not _is_valid_image(img, parent, parent.get('class', [])):
        return None
    score = _score_image_for_usefulness(img, url, index, total_images)
    if score <= 2:
        return None
    return {
        'src': img.get('src', ''),
        'alt': img.get('alt', ''),
        'desc': _find_closest_parent_with_useful_text(img, word_count_threshold),
        'score': score,
        'type': 'image',
    }