From 006bee4a5a50fed10496b701ecfea350be1b7888 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Fri, 22 Nov 2024 16:00:17 +0800
Subject: [PATCH] feat: enhance image processing capabilities

- Enhanced image processing with srcset support and validation checks
  for better image selection.
---
 README.md                             |   4 +-
 crawl4ai/content_scraping_strategy.py | 145 ++++++++++++++++++++++++--
 crawl4ai/tools.py                     |  34 ++++++
 3 files changed, 172 insertions(+), 11 deletions(-)
 create mode 100644 crawl4ai/tools.py

diff --git a/README.md b/README.md
index af0d6610..1d3063c7 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,10 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## New in 0.3.74 ✨
 
-- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
+- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster!
 - 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
+- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats.
+- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models.
 - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content.
 - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
 - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
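
Note: the `raw:` and `file://` schemes mentioned above can be exercised without any network fetch. A minimal sketch, assuming the public `AsyncWebCrawler` API; this snippet is illustrative and not part of the patch:

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def main():
        # "raw:" tells the crawler to treat the remainder of the URL as literal HTML
        raw_page = "raw:<html><body><h1>Hello</h1><p>Inline HTML.</p></body></html>"
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=raw_page)
            print(result.markdown)

    asyncio.run(main())
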
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 3823a78d..3b41ec82 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -19,9 +19,9 @@ from .utils import (
     InvalidCSSSelectorError,
     CustomHTML2Text,
     normalize_url,
-    is_external_url
-
+    is_external_url
 )
+from .tools import profile_and_time
 
 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r'^og:')
@@ -234,7 +234,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 return text_content
             return None
 
-        def process_image(img, url, index, total_images):
+        def process_image_old(img, url, index, total_images):
+            def parse_srcset(srcset_str):
+                """Parse the srcset attribute into a list of image URLs with their widths."""
+                if not srcset_str:
+                    return []
+
+                sources = []
+                # Split on "http" boundaries and filter out empty strings
+                urls = [f"http{part}" for part in srcset_str.split("http") if part]
+
+                for url in urls:
+                    # Remove the trailing comma and whitespace, then split off the width descriptor
+                    url = url.strip().rstrip(',')
+                    parts = url.rsplit(' ', 1)
+                    img_url = parts[0].strip()
+                    width = parts[1].rstrip('w') if len(parts) > 1 else None
+                    sources.append({'url': img_url, 'width': width})
+
+                return sources
+
             # Check that an image is visible and not inside an undesired HTML element
             def is_valid_image(img, parent, parent_classes):
                 style = img.get('style', '')
@@ -283,14 +302,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                         score+=1
                 return score
 
-
-
             if not is_valid_image(img, img.parent, img.parent.get('class', [])):
                 return None
+
             score = score_image_for_usefulness(img, url, index, total_images)
             if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
                 return None
-            return {
+
+            base_result = {
                 'src': img.get('src', ''),
                 'data-src': img.get('data-src', ''),
                 'alt': img.get('alt', ''),
@@ -299,6 +318,109 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 'type': 'image'
             }
 
+            sources = []
+            srcset = img.get('srcset', '')
+            if srcset:
+                sources = parse_srcset(srcset)
+                if sources:
+                    return [dict(base_result, src=source['url'], width=source['width'])
+                            for source in sources]
+
+            return [base_result]  # Always return a list
+
+        def process_image(img, url, index, total_images):
+            parse_srcset = lambda s: [{'url': u.strip().rstrip(',').split()[0],
+                                       'width': u.strip().rstrip(',').split()[-1].rstrip('w') if ' ' in u.strip() else None}
+                                      for u in [f"http{p}" for p in s.split("http") if p]]
+
+            # Constants for checks
+            classes_to_check = frozenset(['button', 'icon', 'logo'])
+            tags_to_check = frozenset(['button', 'input'])
+
+            # Pre-fetch commonly used attributes
+            style = img.get('style', '')
+            alt = img.get('alt', '')
+            src = img.get('src', '')
+            data_src = img.get('data-src', '')
+            width = img.get('width')
+            height = img.get('height')
+            parent = img.parent
+            parent_classes = parent.get('class', [])
+
+            # Quick validation checks
+            if ('display:none' in style or
+                parent.name in tags_to_check or
+                any(c in cls for c in parent_classes for cls in classes_to_check) or
+                any(c in src for c in classes_to_check) or
+                any(c in alt for c in classes_to_check)):
+                return None
+
+            # Quick score calculation
+            score = 0
+            if width and width.isdigit():
+                width_val = int(width)
+                score += 1 if width_val > 150 else 0
+            if height and height.isdigit():
+                height_val = int(height)
+                score += 1 if height_val > 150 else 0
+            if alt:
+                score += 1
+            score += index/total_images < 0.5
+
+            image_format = ''
+            if "data:image/" in src:
+                image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
+            else:
+                image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
+
+            if image_format in ('jpg', 'png', 'webp', 'avif'):
+                score += 1
+
+            if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
+                return None
+
+            # Use a set for deduplication
+            unique_urls = set()
+            image_variants = []
+
+            # Base image info template
+            base_info = {
+                'alt': alt,
+                'desc': find_closest_parent_with_useful_text(img),
+                'score': score,
+                'type': 'image'
+            }
+
+            # Inline helper for adding variants
+            def add_variant(src, width=None):
+                if src and not src.startswith('data:') and src not in unique_urls:
+                    unique_urls.add(src)
+                    image_variants.append({**base_info, 'src': src, 'width': width})
+
+            # Process all sources
+            add_variant(src)
+            add_variant(data_src)
+
+            # Handle srcset and data-srcset in one pass
+            for attr in ('srcset', 'data-srcset'):
+                if value := img.get(attr):
+                    for source in parse_srcset(value):
+                        add_variant(source['url'], source['width'])
+
+            # Quick <picture> element check
+            if picture := img.find_parent('picture'):
+                for source in picture.find_all('source'):
+                    if srcset := source.get('srcset'):
+                        for src in parse_srcset(srcset):
+                            add_variant(src['url'], src['width'])
+
+            # Framework-specific attributes in one pass
+            for attr, value in img.attrs.items():
+                if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
+                    add_variant(value)
+
+            return image_variants if image_variants else None
+
         def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
             attrs_to_remove = []
             for attr in element.attrs:
@@ -490,13 +612,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         links['internal'] = list(internal_links_dict.values())
         links['external'] = list(external_links_dict.values())
 
-        # # Process images using ThreadPoolExecutor
         imgs = body.find_all('img')
-        with ThreadPoolExecutor() as executor:
-            image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
-        media['images'] = [result for result in image_results if result is not None]
+        # For testing, process images with a plain loop instead of a thread pool
+        media['images'] = [
+            img for result in (process_image(img, url, i, len(imgs))
+                               for i, img in enumerate(imgs))
+            if result is not None
+            for img in result
+        ]
 
 
         def flatten_nested_elements(node):
             if isinstance(node, NavigableString):
diff --git a/crawl4ai/tools.py b/crawl4ai/tools.py
new file mode 100644
index 00000000..ff36b53a
--- /dev/null
+++ b/crawl4ai/tools.py
@@ -0,0 +1,34 @@
+import time
+import cProfile
+import pstats
+from functools import wraps
+
+def profile_and_time(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        # Start timer
+        start_time = time.perf_counter()
+
+        # Set up profiler
+        profiler = cProfile.Profile()
+        profiler.enable()
+
+        # Run function
+        result = func(self, *args, **kwargs)
+
+        # Stop profiler
+        profiler.disable()
+
+        # Calculate elapsed time
+        elapsed_time = time.perf_counter() - start_time
+
+        # Print timing
+        print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")
+
+        # Print profiling stats
+        stats = pstats.Stats(profiler)
+        stats.sort_stats('cumulative')  # Sort by cumulative time
+        stats.print_stats(20)  # Print the top 20 time-consuming functions
+
+        return result
+    return wrapper
\ No newline at end of file
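
Usage note: `profile_and_time` expects to wrap an instance method, since its `wrapper` binds `self` explicitly. A minimal sketch of applying it; `DemoScraper` is a hypothetical stand-in for a scraping strategy class, not a name from this patch:

    from crawl4ai.tools import profile_and_time

    class DemoScraper:
        @profile_and_time
        def scrap(self, url, html, **kwargs):
            # Heavy parsing work would go here; the decorator reports
            # wall-clock time plus the top 20 functions by cumulative time.
            return {"url": url, "length": len(html)}

    result = DemoScraper().scrap("https://example.com", "<html></html>")

Because the decorator both times and profiles every call, it is best reserved for development runs; cProfile adds measurable overhead on hot paths.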