From 9ed155112516b2b2da60970303972adca3e3ac29 Mon Sep 17 00:00:00 2001
From: Aravind Karnam <aravind.karanam@gmail.com>
Date: Wed, 14 Aug 2024 10:59:49 +0530
Subject: [PATCH] Added support for source tags wrapped inside video and audio
 tags. Extended the text extraction to video and audio elements in media.
 https://github.com/unclecode/crawl4ai/issues/71

---
 crawl4ai/utils.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 07832888..d724988b 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -452,6 +452,19 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}
 
+    # Extract meaningful text for media files from closest parent
+    def find_closest_parent_with_useful_text(tag):
+            current_tag = tag
+            while current_tag:
+                current_tag = current_tag.parent
+                # Get the text content of the parent tag
+                if current_tag:
+                    text_content = current_tag.get_text(separator=' ',strip=True)
+                    # Check if the text has at least image_description_min_word_threshold words
+                    if len(text_content.split()) >= image_description_min_word_threshold:
+                        return text_content
+            return None
+
     def process_image(img, url, index, total_images):
             #Check if an image has valid display and inside undesired html elements
             def is_valid_image(img, parent, parent_classes):
@@ -523,19 +536,6 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                     score+=1
                 return score
 
-            # Extract meaningful text for images from closest parent
-            def find_closest_parent_with_useful_text(tag):
-                current_tag = tag
-                while current_tag:
-                    current_tag = current_tag.parent
-                    # Get the text content of the parent tag
-                    if current_tag:
-                        text_content = current_tag.get_text(separator=' ',strip=True)
-                        # Check if the text content has at least word_count_threshold
-                        if len(text_content.split()) >= image_description_min_word_threshold:
-                            return text_content
-                return None
-
             if not is_valid_image(img, img.parent, img.parent.get('class', [])):
                 return None
             score = score_image_for_usefulness(img, url, index, total_images)
@@ -579,7 +579,16 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 media[f"{element.name}s"].append({
                     'src': element.get('src'),
                     'alt': element.get('alt'),
-                    'type': element.name
+                    'type': element.name,
+                    'description': find_closest_parent_with_useful_text(element)
+                })
+                source_tags = element.find_all('source')
+                for source_tag in source_tags:
+                    media[f"{element.name}s"].append({
+                    'src': source_tag.get('src'),
+                    'alt': element.get('alt'),
+                    'type': element.name,
+                    'description': find_closest_parent_with_useful_text(element)
                 })
                 return True  # Always keep video and audio elements