feat: enhance image processing capabilities

- Enhanced image processing with srcset support and validation checks for better image selection.
Author: UncleCode
Date: 2024-11-22 16:00:17 +08:00
Parent: dbb751c8f0
Commit: 006bee4a5a
3 changed files with 172 additions and 11 deletions
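
In practice, this changes what callers see in `CrawlResult.media["images"]`: each `<img>` tag can now yield several entries, one per `srcset`/`<picture>` variant, each carrying the `src`, `width`, `alt`, `desc`, `score`, and `type` keys introduced in the diff below. A rough sketch of inspecting the result (hedged; assumes the `AsyncWebCrawler` API from this release):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        for image in result.media["images"]:
            # One entry per unique image variant after this commit
            print(image["src"], image.get("width"), image["score"])

asyncio.run(main())
```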

README.md

@@ -13,8 +13,10 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
## New in 0.3.74 ✨
- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats.
- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models.
- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content.
- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly (see the sketch after this list).
- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
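
For the Local & Raw HTML item above, a minimal sketch of the two URL prefixes (assuming the same `AsyncWebCrawler.arun` entry point):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # A local file via file:// and an inline document via the raw: prefix
        local = await crawler.arun(url="file:///tmp/page.html")
        inline = await crawler.arun(url="raw:<html><body><h1>Hi</h1></body></html>")
        print(local.success, inline.success)

asyncio.run(main())
```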

crawl4ai/content_scraping_strategy.py

@@ -19,9 +19,9 @@ from .utils import (
    InvalidCSSSelectorError,
    CustomHTML2Text,
    normalize_url,
    is_external_url
)
from .tools import profile_and_time

# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
@@ -234,7 +234,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return text_content
    return None

def process_image_old(img, url, index, total_images):
    def parse_srcset(srcset_str):
        """Parse srcset attribute into a list of image URLs with their widths."""
        if not srcset_str:
            return []
        sources = []
        # Split on "http" and filter out empty strings
        urls = [f"http{part}" for part in srcset_str.split("http") if part]
        for url in urls:
            # Strip trailing comma and whitespace, then split off the width descriptor
            url = url.strip().rstrip(',')
            parts = url.rsplit(' ', 1)
            img_url = parts[0].strip()
            width = parts[1].rstrip('w') if len(parts) > 1 else None
            sources.append({'url': img_url, 'width': width})
        return sources
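
For illustration, here is how this helper behaves on a typical `srcset` value (a hypothetical standalone call; in the source it is nested inside the scraping strategy):

```python
srcset = "https://cdn.example.com/a-480.jpg 480w, https://cdn.example.com/a-1024.jpg 1024w"
print(parse_srcset(srcset))
# [{'url': 'https://cdn.example.com/a-480.jpg', 'width': '480'},
#  {'url': 'https://cdn.example.com/a-1024.jpg', 'width': '1024'}]
```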
    # Check that the image is visibly displayed and not inside undesired HTML elements
    def is_valid_image(img, parent, parent_classes):
        style = img.get('style', '')
@@ -283,14 +302,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            score += 1
        return score
    if not is_valid_image(img, img.parent, img.parent.get('class', [])):
        return None

    score = score_image_for_usefulness(img, url, index, total_images)
    if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
        return None

    base_result = {
        'src': img.get('src', ''),
        'data-src': img.get('data-src', ''),
        'alt': img.get('alt', ''),
@@ -299,6 +318,109 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        'type': 'image'
    }

    sources = []
    srcset = img.get('srcset', '')
    if srcset:
        sources = parse_srcset(srcset)
    if sources:
        return [dict(base_result, src=source['url'], width=source['width'])
                for source in sources]
    return [base_result]  # Always return a list
def process_image(img, url, index, total_images):
    # Parse srcset into url/width pairs; strip trailing commas and 'w' descriptors
    def parse_srcset(s):
        entries = [f"http{p}".strip().rstrip(',') for p in s.split("http") if p]
        return [{'url': e.split()[0],
                 'width': e.split()[-1].rstrip('w') if ' ' in e else None}
                for e in entries]

    # Constants for checks
    classes_to_check = frozenset(['button', 'icon', 'logo'])
    tags_to_check = frozenset(['button', 'input'])

    # Pre-fetch commonly used attributes
    style = img.get('style', '')
    alt = img.get('alt', '')
    src = img.get('src', '')
    data_src = img.get('data-src', '')
    width = img.get('width')
    height = img.get('height')
    parent = img.parent
    parent_classes = parent.get('class', [])

    # Quick validation checks: hidden images, form controls, and
    # button/icon/logo markers in the parent classes, src, or alt
    if ('display:none' in style or
            parent.name in tags_to_check or
            any(cls in c for c in parent_classes for cls in classes_to_check) or
            any(c in src for c in classes_to_check) or
            any(c in alt for c in classes_to_check)):
        return None

    # Quick score calculation
    score = 0
    if width and width.isdigit():
        width_val = int(width)
        score += 1 if width_val > 150 else 0
    if height and height.isdigit():
        height_val = int(height)
        score += 1 if height_val > 150 else 0
    if alt:
        score += 1
    score += index / total_images < 0.5  # images earlier on the page score higher

    image_format = ''
    if "data:image/" in src:
        image_format = src.split(',')[0].split(';')[0].split('/')[1]
    else:
        image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
    if image_format in ('jpg', 'png', 'webp', 'avif'):
        score += 1

    if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
        return None

    # Use a set for deduplication
    unique_urls = set()
    image_variants = []

    # Base image info template
    base_info = {
        'alt': alt,
        'desc': find_closest_parent_with_useful_text(img),
        'score': score,
        'type': 'image'
    }

    # Inline function for adding variants
    def add_variant(src, width=None):
        if src and not src.startswith('data:') and src not in unique_urls:
            unique_urls.add(src)
            image_variants.append({**base_info, 'src': src, 'width': width})

    # Process all sources
    add_variant(src)
    add_variant(data_src)

    # Handle srcset and data-srcset in one pass
    for attr in ('srcset', 'data-srcset'):
        if value := img.get(attr):
            for source in parse_srcset(value):
                add_variant(source['url'], source['width'])

    # Quick picture element check
    if picture := img.find_parent('picture'):
        for source in picture.find_all('source'):
            if srcset := source.get('srcset'):
                for src in parse_srcset(srcset):
                    add_variant(src['url'], src['width'])

    # Framework-specific attributes in one pass
    for attr, value in img.attrs.items():
        if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
            add_variant(value)

    return image_variants if image_variants else None
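
To make the variant expansion concrete, here is a hedged, self-contained sketch of what this function produces for a typical responsive `<img>` tag (a hypothetical standalone test; in the source, `process_image` is a closure over `kwargs` and the scoring helpers):

```python
from bs4 import BeautifulSoup

html = '''
<figure>
  <img src="https://cdn.example.com/hero-800.jpg"
       srcset="https://cdn.example.com/hero-800.jpg 800w,
               https://cdn.example.com/hero-1600.jpg 1600w"
       alt="Hero image" width="800" height="450">
</figure>
'''
img = BeautifulSoup(html, 'html.parser').find('img')

# Expected shape: one dict per unique URL (the 800w entry is deduplicated
# against src), all sharing the same alt/desc/score/type values:
# [{'src': 'https://cdn.example.com/hero-800.jpg',  'width': None,   'alt': 'Hero image', ...},
#  {'src': 'https://cdn.example.com/hero-1600.jpg', 'width': '1600', 'alt': 'Hero image', ...}]
```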
def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
    attrs_to_remove = []
    for attr in element.attrs:
@@ -490,13 +612,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
links['internal'] = list(internal_links_dict.values())
links['external'] = list(external_links_dict.values())

# Process images using ThreadPoolExecutor:
# imgs = body.find_all('img')
# with ThreadPoolExecutor() as executor:
#     image_results = list(executor.map(process_image, imgs, [url] * len(imgs),
#                                       range(len(imgs)), [len(imgs)] * len(imgs)))
# media['images'] = [result for result in image_results if result is not None]

# For tests we use a plain for loop instead of threads
imgs = body.find_all('img')
media['images'] = [
    img for result in (process_image(img, url, i, len(imgs))
                       for i, img in enumerate(imgs))
    if result is not None
    for img in result
]
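
Each `process_image` call now returns a list of variants (or `None`), so the comprehension above both filters and flattens. A minimal illustration of the pattern:

```python
results = [None, [{'src': 'a.jpg'}], [{'src': 'b.jpg'}, {'src': 'b-2x.jpg'}]]

flat = [img for result in results if result is not None for img in result]
print(flat)  # [{'src': 'a.jpg'}, {'src': 'b.jpg'}, {'src': 'b-2x.jpg'}]
```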
def flatten_nested_elements(node):
    if isinstance(node, NavigableString):

crawl4ai/tools.py (new file, 34 lines)

@@ -0,0 +1,34 @@
import time
import cProfile
import pstats
from functools import wraps

def profile_and_time(func):
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Start timer
        start_time = time.perf_counter()

        # Set up profiler
        profiler = cProfile.Profile()
        profiler.enable()

        # Run function
        result = func(self, *args, **kwargs)

        # Stop profiler
        profiler.disable()

        # Calculate elapsed time
        elapsed_time = time.perf_counter() - start_time

        # Print timing
        print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")

        # Print profiling stats
        stats = pstats.Stats(profiler)
        stats.sort_stats('cumulative')  # Sort by cumulative time
        stats.print_stats(20)           # Print top 20 time-consuming functions

        return result
    return wrapper
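
Usage is a one-line decorator on an instance method; each call then prints wall-clock time plus a cProfile report. A minimal sketch with a hypothetical class (not part of the commit):

```python
from crawl4ai.tools import profile_and_time

class DemoScraper:
    @profile_and_time
    def scrap(self, html):
        # Stand-in for real scraping work
        return html.upper()

DemoScraper().scrap("<html>...</html>")
# [PROFILER] Scraping completed in 0.00 seconds
# ... cProfile table, sorted by cumulative time ...
```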