feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern websites by automatically removing intrusive overlays and providing better screenshot capabilities. URL handling is now more robust with proper normalization and duplicate detection. The cache system is more flexible with customizable base directory support.

Breaking changes: None
Issue numbers: None
2024-10-24 20:22:47 +08:00
parent 60ba131ac8
commit bcfe83f702
7 changed files with 319 additions and 50 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,52 @@
 # Changelog

+## [v0.3.73] - 2024-10-24
+
+### Added
+- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
+  - Automatic removal of popups, modals, and cookie notices
+  - Detection and removal of fixed/sticky position elements
+  - Cleaning of empty block elements
+  - Configurable via `remove_overlay_elements` parameter
+- Enhanced screenshot capabilities:
+  - Added `screenshot_wait_for` parameter to control timing
+  - Improved screenshot handling with existing page context
+  - Better error handling with fallback error images
+- New URL normalization utilities:
+  - `normalize_url` function for consistent URL formatting
+  - `is_external_url` function for better link classification
+- Custom base directory support for cache storage:
+  - New `base_directory` parameter in AsyncWebCrawler
+  - Allows specifying alternative locations for `.crawl4ai` folder
+
+### Enhanced
+- Link handling improvements:
+  - Better duplicate link detection
+  - Enhanced internal/external link classification
+  - Improved handling of special URL protocols
+  - Support for anchor links and protocol-relative URLs
+- Configuration refinements:
+  - Streamlined social media domain list
+  - More focused external content filtering
+- LLM extraction strategy:
+  - Added support for separate API base URL via `api_base` parameter
+  - Better handling of base URLs in configuration
+
+### Fixed
+- Screenshot functionality:
+  - Resolved issues with screenshot timing and context
+  - Improved error handling and recovery
+- Link processing:
+  - Fixed URL normalization edge cases
+  - Better handling of invalid URLs
+  - Improved error messages for link processing failures
+
+### Developer Notes
+- The overlay removal system works by evaluating a JavaScript snippet inside the page (`page.evaluate`), so it cleans the live DOM before the HTML is captured
+- URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs
+- Screenshot system now reuses existing page context for better performance
+- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
+
 ## [v0.3.72] - 2024-10-22

 ### Added
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -51,7 +51,7 @@ class AsyncCrawlerStrategy(ABC):
        pass
    
    @abstractmethod
-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, **kwargs) -> str:
        pass
    
    @abstractmethod
@@ -502,13 +502,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if delay_before_return_html:
                await asyncio.sleep(delay_before_return_html)
                
+            # Check for remove_overlay_elements parameter
+            if kwargs.get("remove_overlay_elements", False):
+                await self.remove_overlay_elements(page)
+            
            html = await page.content()
            await self.execute_hook('before_return_html', page, html)
            
            # Check if kwargs has screenshot=True then take screenshot
            screenshot_data = None
            if kwargs.get("screenshot"):
-                screenshot_data = await self.take_screenshot(url)            
+                # Check we have screenshot_wait_for parameter, if we have simply wait for that time
+                screenshot_wait_for = kwargs.get("screenshot_wait_for")
+                if screenshot_wait_for:
+                    await asyncio.sleep(screenshot_wait_for)
+                screenshot_data = await self.take_screenshot(page)          

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")
@@ -559,28 +567,156 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]

-    async def take_screenshot(self, url: str, wait_time=1000) -> str:
-        async with await self.browser.new_context(user_agent=self.user_agent) as context:
-            page = await context.new_page()
-            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                # Wait for a specified time (default is 1 second)
-                await page.wait_for_timeout(wait_time)
-                screenshot = await page.screenshot(full_page=True)
-                return base64.b64encode(screenshot).decode('utf-8')
-            except Exception as e:
-                error_message = f"Failed to take screenshot: {str(e)}"
-                print(error_message)
+    async def remove_overlay_elements(self, page: Page) -> None:
+        """
+        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
+        
+        The work is done by a JavaScript snippet evaluated in the page, which in order:
+          1. Clicks every visible element matching the first six selectors
+             (the "close button" group), pausing 100 ms after each click.
+          2. Removes visible elements that (a) have z-index > 999 or a
+             fixed/absolute position and (b) are wider or taller than half the
+             viewport, or have a background colour string containing 'rgba',
+             or have opacity below 1.
+          3. Removes visible elements matching any of the common selectors.
+          4. Removes every visible element with fixed or sticky position.
+          5. Sets the body's margin-right and padding-right to '0px' and its
+             overflow to 'auto', then waits a further 100 ms.
+        
+        NOTE(review): the script also defines ``removeEmptyBlockElements`` but
+        never calls it, so empty block elements are left in place.
+        
+        After the script returns, this method waits another 500 ms. Any exception
+        raised along the way is swallowed; a warning is printed only when
+        ``self.verbose`` is truthy.
+        
+        Args:
+            page (Page): The Playwright page instance
+        """
+        remove_overlays_js = """
+        async () => {
+            // Function to check if element is visible
+            const isVisible = (elem) => {
+                const style = window.getComputedStyle(elem);
+                return style.display !== 'none' && 
+                       style.visibility !== 'hidden' && 
+                       style.opacity !== '0';
+            };

-                # Generate an error image
-                img = Image.new('RGB', (800, 600), color='black')
-                draw = ImageDraw.Draw(img)
-                font = ImageFont.load_default()
-                draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+            // Common selectors for popups and overlays
+            const commonSelectors = [
+                // Close buttons first
+                'button[class*="close" i]', 'button[class*="dismiss" i]', 
+                'button[aria-label*="close" i]', 'button[title*="close" i]',
+                'a[class*="close" i]', 'span[class*="close" i]',
                 
-                buffered = BytesIO()
-                img.save(buffered, format="JPEG")
-                return base64.b64encode(buffered.getvalue()).decode('utf-8')
-            finally:
-                await page.close()
+                // Cookie notices
+                '[class*="cookie-banner" i]', '[id*="cookie-banner" i]',
+                '[class*="cookie-consent" i]', '[id*="cookie-consent" i]',
+                
+                // Newsletter/subscription dialogs
+                '[class*="newsletter" i]', '[class*="subscribe" i]',
+                
+                // Generic popups/modals
+                '[class*="popup" i]', '[class*="modal" i]', 
+                '[class*="overlay" i]', '[class*="dialog" i]',
+                '[role="dialog"]', '[role="alertdialog"]'
+            ];
+
+            // Try to click close buttons first
+            for (const selector of commonSelectors.slice(0, 6)) {
+                const closeButtons = document.querySelectorAll(selector);
+                for (const button of closeButtons) {
+                    if (isVisible(button)) {
+                        try {
+                            button.click();
+                            await new Promise(resolve => setTimeout(resolve, 100));
+                        } catch (e) {
+                            console.log('Error clicking button:', e);
+                        }
+                    }
+                }
+            }
+
+            // Remove remaining overlay elements
+            const removeOverlays = () => {
+                // Find elements with high z-index
+                const allElements = document.querySelectorAll('*');
+                for (const elem of allElements) {
+                    const style = window.getComputedStyle(elem);
+                    const zIndex = parseInt(style.zIndex);
+                    const position = style.position;
+                    
+                    if (
+                        isVisible(elem) && 
+                        (zIndex > 999 || position === 'fixed' || position === 'absolute') &&
+                        (
+                            elem.offsetWidth > window.innerWidth * 0.5 ||
+                            elem.offsetHeight > window.innerHeight * 0.5 ||
+                            style.backgroundColor.includes('rgba') ||
+                            parseFloat(style.opacity) < 1
+                        )
+                    ) {
+                        elem.remove();
+                    }
+                }
+
+                // Remove elements matching common selectors
+                for (const selector of commonSelectors) {
+                    const elements = document.querySelectorAll(selector);
+                    elements.forEach(elem => {
+                        if (isVisible(elem)) {
+                            elem.remove();
+                        }
+                    });
+                }
+            };
+
+            // Remove overlay elements
+            removeOverlays();
+
+            // Remove any fixed/sticky position elements at the top/bottom
+            const removeFixedElements = () => {
+                const elements = document.querySelectorAll('*');
+                elements.forEach(elem => {
+                    const style = window.getComputedStyle(elem);
+                    if (
+                        (style.position === 'fixed' || style.position === 'sticky') &&
+                        isVisible(elem)
+                    ) {
+                        elem.remove();
+                    }
+                });
+            };
+
+            removeFixedElements();
+            
+            // Remove empty block elements as: div, p, span, etc.
+            const removeEmptyBlockElements = () => {
+                const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6');
+                blockElements.forEach(elem => {
+                    if (elem.innerText.trim() === '') {
+                        elem.remove();
+                    }
+                });
+            };
+
+            // Remove margin-right and padding-right from body (often added by modal scripts)
+            document.body.style.marginRight = '0px';
+            document.body.style.paddingRight = '0px';
+            document.body.style.overflow = 'auto';
+
+            // Wait a bit for any animations to complete
+            await new Promise(resolve => setTimeout(resolve, 100));
+        }
+        """
+        
+        try:
+            await page.evaluate(remove_overlays_js)
+            await page.wait_for_timeout(500)  # Wait for any animations to complete
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Failed to remove overlay elements: {str(e)}")
+
+    async def take_screenshot(self, page: Page) -> str:
+        """
+        Capture a full-page screenshot of an already-loaded page.
+
+        Args:
+            page (Page): The Playwright page to capture. It is closed before
+                this method returns, whether or not the capture succeeded.
+
+        Returns:
+            str: Base64-encoded image data. On success this is the output of
+                ``page.screenshot``; on failure it is a generated 800x600 JPEG
+                with the error message drawn in white on a black background.
+        """
+        try:
+            # No navigation happens here; the caller has already loaded the page.
+            raw_image = await page.screenshot(full_page=True)
+            encoded = base64.b64encode(raw_image)
+            return encoded.decode('utf-8')
+        except Exception as exc:
+            error_message = f"Failed to take screenshot: {str(exc)}"
+            print(error_message)
+
+            # Fall back to a placeholder image carrying the failure text.
+            placeholder = Image.new('RGB', (800, 600), color='black')
+            ImageDraw.Draw(placeholder).text(
+                (10, 10),
+                error_message,
+                fill=(255, 255, 255),
+                font=ImageFont.load_default(),
+            )
+
+            jpeg_buffer = BytesIO()
+            placeholder.save(jpeg_buffer, format="JPEG")
+            return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
+        finally:
+            await page.close()

--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -23,13 +23,15 @@ class AsyncWebCrawler:
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        always_by_pass_cache: bool = False,
+        base_directory: str = str(Path.home()),
        **kwargs,
    ):
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            **kwargs
        )
        self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        self.ready = False
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -37,16 +37,9 @@ SOCIAL_MEDIA_DOMAINS = [
                            'linkedin.com',
                            'instagram.com',
                            'pinterest.com',
-                            'youtube.com',
                            'tiktok.com',
                            'snapchat.com',
-                            'whatsapp.com',
-                            'messenger.com',
                            'reddit.com',
-                            'tumblr.com',
-                            'buffer.com',
-                            'xing.com',
-                            'flipboard.com',
                        ]

 # Threshold for the Image extraction - Range is 1 to 6
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -14,7 +14,10 @@ from .utils import (
    sanitize_html,
    extract_metadata,
    InvalidCSSSelectorError,
-    CustomHTML2Text
+    CustomHTML2Text,
+    normalize_url,
+    is_external_url
+    
 )

 class ContentScrappingStrategy(ABC):
@@ -67,6 +70,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):

        links = {'internal': [], 'external': []}
        media = {'images': [], 'videos': [], 'audios': []}
+        internal_links_dict = {}
+        external_links_dict = {}

        # Extract meaningful text for media files from closest parent
        def find_closest_parent_with_useful_text(tag):
@@ -205,30 +210,55 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
                social_media_domains = list(set(social_media_domains))

+                
                try:
                    if element.name == 'a' and element.get('href'):
-                        href = element['href']
+                        href = element.get('href', '').strip()
+                        if not href:  # Skip empty hrefs
+                            return False
+                            
                        url_base = url.split('/')[2]
-                        link_data = {'href': href, 'text': element.get_text()}
-                        if href.startswith('http') and url_base not in href:
-                            links['external'].append(link_data)
+                        
+                        # Normalize the URL
+                        try:
+                            normalized_href = normalize_url(href, url)
+                        except ValueError as e:
+                            # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
+                            return False
+                            
+                        link_data = {
+                            'href': normalized_href,
+                            'text': element.get_text().strip(),
+                            'title': element.get('title', '').strip()
+                        }
+                        
+                        # Check for duplicates and add to appropriate dictionary
+                        is_external = is_external_url(normalized_href, url_base)
+                        if is_external:
+                            if normalized_href not in external_links_dict:
+                                external_links_dict[normalized_href] = link_data
                        else:
-                            links['internal'].append(link_data)
+                            if normalized_href not in internal_links_dict:
+                                internal_links_dict[normalized_href] = link_data
+                                
                        keep_element = True
                        
-                        if kwargs.get('exclude_external_links', False):
-                            href_parts = href.split('/')
-                            href_url_base = href_parts[2] if len(href_parts) > 2 else href
-                            if url_base not in href_url_base:
-                                element.decompose()
-                                return False
-                            
-                        if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
-                            if any(domain in href for domain in social_media_domains):
+                        # Handle external link exclusions
+                        if is_external:
+                            if kwargs.get('exclude_external_links', False):
                                element.decompose()
                                return False
+                            elif kwargs.get('exclude_social_media_links', False):
+                                if any(domain in normalized_href.lower() for domain in social_media_domains):
+                                    element.decompose()
+                                    return False
+                            elif kwargs.get('exclude_domains', []):
+                                if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
+                                    element.decompose()
+                                    return False
+                                    
                except Exception as e:
-                    raise "Error processing links"
+                    raise Exception(f"Error processing links: {str(e)}")

                try:
                    if element.name == 'img':
@@ -252,12 +282,18 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                                element.decompose()
                                return False
                            
-                        if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
+                        if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
                            src_url_base = src.split('/')[2]
                            url_base = url.split('/')[2]
                            if any(domain in src for domain in social_media_domains):
                                element.decompose()
                                return False
+                            
+                        # Handle exclude domains
+                        if kwargs.get('exclude_domains', []):
+                            if any(domain in src for domain in kwargs.get('exclude_domains', [])):
+                                element.decompose()
+                                return False
                        
                        return True  # Always keep image elements
                except Exception as e:
@@ -328,6 +364,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        # ]
        
        process_element(body)
+        
+        # Update the links dictionary with unique links
+        links['internal'] = list(internal_links_dict.values())
+        links['external'] = list(external_links_dict.values())
+

        # # Process images using ThreadPoolExecutor
        imgs = body.find_all('img')
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
        self.apply_chunking = kwargs.get("apply_chunking", True)
        self.base_url = kwargs.get("base_url", None)
+        self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
        self.extra_args = kwargs.get("extra_args", {})
        if not self.apply_chunking:
            self.chunk_token_threshold = 1e9
@@ -116,7 +117,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
            self.provider, 
            prompt_with_variables, 
            self.api_token, 
-            base_url=self.base_url,
+            base_url=self.api_base or self.base_url,
            extra_args = self.extra_args
            ) # , json_response=self.extract_type == "schema")
        try:
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -980,4 +980,53 @@ def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.prettify()

+def normalize_url(href, base_url):
+    """Resolve ``href`` against ``base_url`` and return an absolute URL.
+
+    Args:
+        href: Raw link target as found in the document.
+        base_url: URL of the page the link was found on. Splitting it on
+            ``'/'`` must yield at least a protocol part and a domain part
+            (``scheme://host``).
+
+    Returns:
+        The normalized URL string. Links using a special protocol
+        (``mailto:``, ``tel:``, ``ftp:``, ``file:``, ``data:``,
+        ``javascript:``) and absolute ``http(s)`` links are returned stripped
+        but otherwise unchanged. Anchors, protocol-relative, root-relative and
+        path-relative links are resolved against ``base_url``.
+
+    Raises:
+        ValueError: If ``base_url`` does not have the ``scheme://host`` shape.
+    """
+    # Imported locally so the helper does not depend on module-level imports.
+    from urllib.parse import urljoin
+
+    # Same validity rule as before: the split must contain the protocol
+    # (index 0) and the domain (index 2), e.g. ['https:', '', 'example.com'].
+    if len(base_url.split('/')) < 3:
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    href = href.strip()
+    lowered = href.lower()
+
+    # Special protocols are not resolvable against a page URL; pass through.
+    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
+    if lowered.startswith(special_protocols):
+        return href
+
+    # Already absolute: nothing to resolve.
+    if lowered.startswith(('http://', 'https://')):
+        return href
+
+    # urljoin handles '#anchor', '//host/path', '/root', 'page', './page' and
+    # '../page' correctly. The previous hand-rolled code used lstrip('./'),
+    # which strips *every* leading '.' and '/' (mangling '../x' and '.hidden'),
+    # and resolved relative paths against the domain root instead of the
+    # directory of the current page.
+    return urljoin(base_url, href)

+def is_external_url(url, base_domain):
+    """Determine if a URL is external to ``base_domain``.
+
+    Args:
+        url: The (ideally already normalized) link URL.
+        base_domain: Host part of the page being crawled, e.g.
+            ``"example.com"``. A port or ``user@`` prefix is ignored.
+
+    Returns:
+        ``True`` for special-protocol links (``mailto:``, ``tel:``, ``ftp:``,
+        ``file:``, ``data:``, ``javascript:``) and for ``http(s)`` URLs whose
+        host is neither ``base_domain`` nor one of its subdomains. ``False``
+        for everything else, including relative URLs and URLs whose host
+        cannot be determined.
+    """
+    # Imported locally so the helper does not depend on module-level imports.
+    from urllib.parse import urlsplit
+
+    lowered = url.lower()
+    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
+    if lowered.startswith(special_protocols):
+        return True
+
+    # Only absolute http(s) URLs carry a host that can be compared.
+    if not lowered.startswith(('http://', 'https://')):
+        return False
+
+    try:
+        url_host = urlsplit(url).hostname
+        # Prefix with '//' so urlsplit parses the bare domain as a netloc.
+        base_host = urlsplit(f"//{base_domain}").hostname
+    except ValueError:
+        return False
+    if not url_host or not base_host:
+        return False
+
+    # Compare whole host labels. The previous substring test treated hosts
+    # such as 'notexample.com' or 'example.com.evil.org' as internal.
+    is_same_site = url_host == base_host or url_host.endswith(f".{base_host}")
+    return not is_same_site