feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern websites by automatically removing intrusive overlays and providing better screenshot capabilities. URL handling is now more robust with proper normalization and duplicate detection. The cache system is more flexible with customizable base directory support.

Breaking changes: None
Issue numbers: None
2024-10-24 20:22:47 +08:00
parent 60ba131ac8
commit bcfe83f702
7 changed files with 319 additions and 50 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -980,4 +980,53 @@ def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.prettify()

def normalize_url(href, base_url):
    """Resolve ``href`` against ``base_url`` and return an absolute URL.

    Args:
        href: The link target as found in the page (absolute, protocol-
            relative, root-relative, path-relative, or a bare ``#fragment``).
            Surrounding whitespace is ignored.
        base_url: Absolute URL of the page the link was found on. It must
            carry both a scheme and a host (e.g. ``https://example.com/a``).

    Returns:
        The absolute URL as a string. Links that use a non-navigable scheme
        (``mailto:``, ``tel:``, ``ftp:``, ``file:``, ``data:``,
        ``javascript:``) are returned stripped but otherwise untouched.

    Raises:
        ValueError: If ``base_url`` has no scheme or no host.
    """
    from urllib.parse import urljoin, urlparse

    try:
        base = urlparse(base_url)
    except ValueError as exc:
        raise ValueError(f"Invalid base URL format: {base_url}") from exc
    if not base.scheme or not base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Strip first so that leading whitespace cannot defeat the prefix checks.
    href = href.strip()

    # Non-navigable schemes are passed through verbatim.
    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
    if href.lower().startswith(special_protocols):
        return href

    # urljoin implements RFC 3986 reference resolution: it handles fragments,
    # protocol-relative and root-relative links, and resolves './' and '../'
    # segments against the base URL's directory. The previous
    # ``href.lstrip('./')`` removed *every* leading '.' and '/' character, so
    # '../x' and '.well-known/x' were mangled and the base path was ignored.
    return urljoin(base_url, href)

def is_external_url(url, base_domain):
    """Return True if ``url`` points outside ``base_domain``.

    Args:
        url: The link to classify. Surrounding whitespace and letter case
            are ignored.
        base_domain: Host name of the site being crawled (e.g.
            ``example.com``). A port, or a full URL with scheme, is tolerated;
            only the host part is compared.

    Returns:
        True for links using a non-navigable scheme (``mailto:``, ``tel:``,
        ``ftp:``, ``file:``, ``data:``, ``javascript:``) and for links whose
        host is neither ``base_domain`` nor one of its subdomains. False for
        same-site links, for links without a host (relative URLs), and for
        URLs that cannot be parsed.
    """
    from urllib.parse import urlparse

    candidate = url.strip().lower()

    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
    if candidate.startswith(special_protocols):
        return True

    base = base_domain.strip().lower()
    if '://' not in base:
        # A leading '//' makes urlparse treat a bare host as the netloc.
        base = f"//{base}"

    try:
        host = urlparse(candidate).hostname
        base_host = urlparse(base).hostname
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): best effort,
        # keep the link rather than classifying it as external.
        return False

    if not host or not base_host:
        # No host on either side means there is nothing to compare.
        return False

    # Compare on label boundaries. A plain substring test would accept
    # 'notexample.com' or 'example.com.evil.org' as part of 'example.com'.
    same_site = host == base_host or host.endswith(f".{base_host}")
    return not same_site