From 006bee4a5a50fed10496b701ecfea350be1b7888 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Fri, 22 Nov 2024 16:00:17 +0800
Subject: [PATCH] feat: enhance image processing capabilities

- Enhanced image processing with srcset support and validation checks
  for better image selection.
---
 README.md                             |   4 +-
 crawl4ai/content_scraping_strategy.py | 145 ++++++++++++++++++++++++--
 crawl4ai/tools.py                     |  34 ++++++
 3 files changed, 172 insertions(+), 11 deletions(-)
 create mode 100644 crawl4ai/tools.py

diff --git a/README.md b/README.md
index af0d6610..1d3063c7 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,10 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## New in 0.3.74 ✨
 
-- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
+- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster!
 - 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object.
+- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats.
+- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models.
 - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content.
 - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
 - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures.
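
Note: the `raw:` and `file://` schemes mentioned above can be exercised without any network fetch. A minimal sketch, assuming the public `AsyncWebCrawler` API; this snippet is illustrative and not part of the patch:

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def main():
        # "raw:" tells the crawler to treat the remainder of the URL as literal HTML
        raw_page = "raw:<html><body><h1>Hello</h1><p>Inline HTML.</p></body></html>"
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=raw_page)
            print(result.markdown)

    asyncio.run(main())
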
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 3823a78d..3b41ec82 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -19,9 +19,9 @@ from .utils import (
     InvalidCSSSelectorError,
     CustomHTML2Text,
     normalize_url,
-    is_external_url
-
+    is_external_url
 )
+from .tools import profile_and_time
 
 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r'^og:')
@@ -234,7 +234,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 return text_content
             return None
 
-        def process_image(img, url, index, total_images):
+        def process_image_old(img, url, index, total_images):
+            def parse_srcset(srcset_str):
+                """Parse the srcset attribute into a list of image URLs with their widths."""
+                if not srcset_str:
+                    return []
+
+                sources = []
+                # Split on "http" boundaries and filter out empty strings
+                urls = [f"http{part}" for part in srcset_str.split("http") if part]
+
+                for url in urls:
+                    # Remove the trailing comma and whitespace, then split off the width descriptor
+                    url = url.strip().rstrip(',')
+                    parts = url.rsplit(' ', 1)
+                    img_url = parts[0].strip()
+                    width = parts[1].rstrip('w') if len(parts) > 1 else None
+                    sources.append({'url': img_url, 'width': width})
+
+                return sources
+
             # Check that an image is visible and not inside an undesired HTML element
             def is_valid_image(img, parent, parent_classes):
                 style = img.get('style', '')
@@ -283,14 +302,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                         score+=1
                 return score
 
-
-
             if not is_valid_image(img, img.parent, img.parent.get('class', [])):
                 return None
+
             score = score_image_for_usefulness(img, url, index, total_images)
             if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
                 return None
-            return {
+
+            base_result = {
                 'src': img.get('src', ''),
                 'data-src': img.get('data-src', ''),
                 'alt': img.get('alt', ''),
@@ -299,6 +318,109 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 'type': 'image'
             }
 
+            sources = []
+            srcset = img.get('srcset', '')
+            if srcset:
+                sources = parse_srcset(srcset)
+                if sources:
+                    return [dict(base_result, src=source['url'], width=source['width'])
+                            for source in sources]
+
+            return [base_result]  # Always return a list
+
+        def process_image(img, url, index, total_images):
+            parse_srcset = lambda s: [{'url': u.strip().rstrip(',').split()[0],
+                                       'width': u.strip().rstrip(',').split()[-1].rstrip('w') if ' ' in u.strip() else None}
+                                      for u in [f"http{p}" for p in s.split("http") if p]]
+
+            # Constants for checks
+            classes_to_check = frozenset(['button', 'icon', 'logo'])
+            tags_to_check = frozenset(['button', 'input'])
+
+            # Pre-fetch commonly used attributes
+            style = img.get('style', '')
+            alt = img.get('alt', '')
+            src = img.get('src', '')
+            data_src = img.get('data-src', '')
+            width = img.get('width')
+            height = img.get('height')
+            parent = img.parent
+            parent_classes = parent.get('class', [])
+
+            # Quick validation checks
+            if ('display:none' in style or
+                parent.name in tags_to_check or
+                any(c in cls for c in parent_classes for cls in classes_to_check) or
+                any(c in src for c in classes_to_check) or
+                any(c in alt for c in classes_to_check)):
+                return None
+
+            # Quick score calculation
+            score = 0
+            if width and width.isdigit():
+                width_val = int(width)
+                score += 1 if width_val > 150 else 0
+            if height and height.isdigit():
+                height_val = int(height)
+                score += 1 if height_val > 150 else 0
+            if alt:
+                score += 1
+            score += index/total_images < 0.5
+
+            image_format = ''
+            if "data:image/" in src:
+                image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
+            else:
+                image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
+
+            if image_format in ('jpg', 'png', 'webp', 'avif'):
+                score += 1
+
+            if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
+                return None
+
+            # Use a set for deduplication
+            unique_urls = set()
+            image_variants = []
+
+            # Base image info template
+            base_info = {
+                'alt': alt,
+                'desc': find_closest_parent_with_useful_text(img),
+                'score': score,
+                'type': 'image'
+            }
+
+            # Inline helper for adding variants
+            def add_variant(src, width=None):
+                if src and not src.startswith('data:') and src not in unique_urls:
+                    unique_urls.add(src)
+                    image_variants.append({**base_info, 'src': src, 'width': width})
+
+            # Process all sources
+            add_variant(src)
+            add_variant(data_src)
+
+            # Handle srcset and data-srcset in one pass
+            for attr in ('srcset', 'data-srcset'):
+                if value := img.get(attr):
+                    for source in parse_srcset(value):
+                        add_variant(source['url'], source['width'])
+
+            # Quick <picture> element check
+            if picture := img.find_parent('picture'):
+                for source in picture.find_all('source'):
+                    if srcset := source.get('srcset'):
+                        for src in parse_srcset(srcset):
+                            add_variant(src['url'], src['width'])
+
+            # Framework-specific attributes in one pass
+            for attr, value in img.attrs.items():
+                if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
+                    add_variant(value)
+
+            return image_variants if image_variants else None
+
         def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
             attrs_to_remove = []
             for attr in element.attrs:
@@ -490,13 +612,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         links['internal'] = list(internal_links_dict.values())
         links['external'] = list(external_links_dict.values())
 
-        # # Process images using ThreadPoolExecutor
         imgs = body.find_all('img')
-        with ThreadPoolExecutor() as executor:
-            image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
-        media['images'] = [result for result in image_results if result is not None]
+        # For testing, process images with a plain loop instead of a thread pool
+        media['images'] = [
+            img for result in (process_image(img, url, i, len(imgs))
+                               for i, img in enumerate(imgs))
+            if result is not None
+            for img in result
+        ]
 
 
         def flatten_nested_elements(node):
             if isinstance(node, NavigableString):
diff --git a/crawl4ai/tools.py b/crawl4ai/tools.py
new file mode 100644
index 00000000..ff36b53a
--- /dev/null
+++ b/crawl4ai/tools.py
@@ -0,0 +1,34 @@
+import time
+import cProfile
+import pstats
+from functools import wraps
+
+def profile_and_time(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        # Start timer
+        start_time = time.perf_counter()
+
+        # Set up profiler
+        profiler = cProfile.Profile()
+        profiler.enable()
+
+        # Run function
+        result = func(self, *args, **kwargs)
+
+        # Stop profiler
+        profiler.disable()
+
+        # Calculate elapsed time
+        elapsed_time = time.perf_counter() - start_time
+
+        # Print timing
+        print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")
+
+        # Print profiling stats
+        stats = pstats.Stats(profiler)
+        stats.sort_stats('cumulative')  # Sort by cumulative time
+        stats.print_stats(20)  # Print the top 20 time-consuming functions
+
+        return result
+    return wrapper
\ No newline at end of file
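
Usage note: `profile_and_time` expects to wrap an instance method, since its `wrapper` binds `self` explicitly. A minimal sketch of applying it; `DemoScraper` is a hypothetical stand-in for a scraping strategy class, not a name from this patch:

    from crawl4ai.tools import profile_and_time

    class DemoScraper:
        @profile_and_time
        def scrap(self, url, html, **kwargs):
            # Heavy parsing work would go here; the decorator reports
            # wall-clock time plus the top 20 functions by cumulative time.
            return {"url": url, "length": len(html)}

    result = DemoScraper().scrap("https://example.com", "<html></html>")

Because the decorator both times and profiles every call, it is best reserved for development runs; cProfile adds measurable overhead on hot paths.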