diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ab493ff..9c5d35bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,52 @@
 # Changelog
 
+## [v0.3.73] - 2024-10-24
+
+### Added
+- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
+  - Automatic removal of popups, modals, and cookie notices
+  - Detection and removal of fixed/sticky position elements
+  - Cleaning of empty block elements
+  - Configurable via `remove_overlay_elements` parameter
+- Enhanced screenshot capabilities:
+  - Added `screenshot_wait_for` parameter to control timing
+  - Improved screenshot handling with existing page context
+  - Better error handling with fallback error images
+- New URL normalization utilities:
+  - `normalize_url` function for consistent URL formatting
+  - `is_external_url` function for better link classification
+- Custom base directory support for cache storage:
+  - New `base_directory` parameter in AsyncWebCrawler
+  - Allows specifying alternative locations for the `.crawl4ai` folder
+
+### Enhanced
+- Link handling improvements:
+  - Better duplicate link detection
+  - Enhanced internal/external link classification
+  - Improved handling of special URL protocols
+  - Support for anchor links and protocol-relative URLs
+- Configuration refinements:
+  - Streamlined social media domain list
+  - More focused external content filtering
+- LLM extraction strategy:
+  - Added support for a separate API base URL via the `api_base` parameter
+  - Better handling of base URLs in configuration
+
+### Fixed
+- Screenshot functionality:
+  - Resolved issues with screenshot timing and context
+  - Improved error handling and recovery
+- Link processing:
+  - Fixed URL normalization edge cases
+  - Better handling of invalid URLs
+  - Improved error messages for link processing failures
+
+### Developer Notes
+- The overlay removal system uses in-page JavaScript injection for better compatibility
+- URL normalization handles special cases such as mailto:, tel:, and protocol-relative URLs
+- The screenshot system now reuses the existing page context for better performance
+- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
+
 ## [v0.3.72] - 2024-10-22
 
 ### Added
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 1ddb32da..fe0b6767 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -51,7 +51,7 @@ class AsyncCrawlerStrategy(ABC):
         pass
 
     @abstractmethod
-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, **kwargs) -> str:
        pass
 
     @abstractmethod
@@ -502,13 +502,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             if delay_before_return_html:
                 await asyncio.sleep(delay_before_return_html)
 
+            # Remove popups, modals, and other overlays before capturing the final HTML
+            if kwargs.get("remove_overlay_elements", False):
+                await self.remove_overlay_elements(page)
+
             html = await page.content()
             await self.execute_hook('before_return_html', page, html)
 
             # Check if kwargs has screenshot=True then take screenshot
             screenshot_data = None
             if kwargs.get("screenshot"):
-                screenshot_data = await self.take_screenshot(url)
+                # If screenshot_wait_for is set, wait that many seconds before capturing
+                screenshot_wait_for = kwargs.get("screenshot_wait_for")
+                if screenshot_wait_for:
+                    await asyncio.sleep(screenshot_wait_for)
+                screenshot_data = await self.take_screenshot(page)
 
             if self.verbose:
                 print(f"[LOG] ✅ Crawled {url} successfully!")
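Note: a minimal usage sketch for the crawl-time options introduced above (`remove_overlay_elements`, `screenshot`, `screenshot_wait_for`). The `AsyncWebCrawler`/`arun` call shape and the `result.screenshot` field follow the existing crawl4ai API; the URL and output filename are placeholders.

```python
import asyncio
import base64

from crawl4ai import AsyncWebCrawler


async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            remove_overlay_elements=True,  # strip popups/modals before the HTML is captured
            screenshot=True,               # ask the strategy for a full-page screenshot
            screenshot_wait_for=2,         # wait 2 seconds before capturing (new parameter)
            bypass_cache=True,
        )
        if result.screenshot:
            # The strategy returns the screenshot as a base64-encoded string
            with open("screenshot.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))


asyncio.run(main())
```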
@@ -559,28 +567,156 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         results = await asyncio.gather(*tasks, return_exceptions=True)
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
-    async def take_screenshot(self, url: str, wait_time=1000) -> str:
-        async with await self.browser.new_context(user_agent=self.user_agent) as context:
-            page = await context.new_page()
-            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                # Wait for a specified time (default is 1 second)
-                await page.wait_for_timeout(wait_time)
-                screenshot = await page.screenshot(full_page=True)
-                return base64.b64encode(screenshot).decode('utf-8')
-            except Exception as e:
-                error_message = f"Failed to take screenshot: {str(e)}"
-                print(error_message)
+    async def remove_overlay_elements(self, page: Page) -> None:
+        """
+        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
+
+        Args:
+            page (Page): The Playwright page instance
+        """
+        remove_overlays_js = """
+        async () => {
+            // Function to check if element is visible
+            const isVisible = (elem) => {
+                const style = window.getComputedStyle(elem);
+                return style.display !== 'none' &&
+                       style.visibility !== 'hidden' &&
+                       style.opacity !== '0';
+            };
 
-                # Generate an error image
-                img = Image.new('RGB', (800, 600), color='black')
-                draw = ImageDraw.Draw(img)
-                font = ImageFont.load_default()
-                draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+            // Common selectors for popups and overlays
+            const commonSelectors = [
+                // Close buttons first
+                'button[class*="close" i]', 'button[class*="dismiss" i]',
+                'button[aria-label*="close" i]', 'button[title*="close" i]',
+                'a[class*="close" i]', 'span[class*="close" i]',
 
-                buffered = BytesIO()
-                img.save(buffered, format="JPEG")
-                return base64.b64encode(buffered.getvalue()).decode('utf-8')
-            finally:
-                await page.close()
+                // Cookie notices
+                '[class*="cookie-banner" i]', '[id*="cookie-banner" i]',
+                '[class*="cookie-consent" i]', '[id*="cookie-consent" i]',
+
+                // Newsletter/subscription dialogs
+                '[class*="newsletter" i]', '[class*="subscribe" i]',
+
+                // Generic popups/modals
+                '[class*="popup" i]', '[class*="modal" i]',
+                '[class*="overlay" i]', '[class*="dialog" i]',
+                '[role="dialog"]', '[role="alertdialog"]'
+            ];
+
+            // Try to click close buttons first
+            for (const selector of commonSelectors.slice(0, 6)) {
+                const closeButtons = document.querySelectorAll(selector);
+                for (const button of closeButtons) {
+                    if (isVisible(button)) {
+                        try {
+                            button.click();
+                            await new Promise(resolve => setTimeout(resolve, 100));
+                        } catch (e) {
+                            console.log('Error clicking button:', e);
+                        }
+                    }
+                }
+            }
+
+            // Remove remaining overlay elements
+            const removeOverlays = () => {
+                // Find elements with high z-index
+                const allElements = document.querySelectorAll('*');
+                for (const elem of allElements) {
+                    const style = window.getComputedStyle(elem);
+                    const zIndex = parseInt(style.zIndex);
+                    const position = style.position;
+
+                    if (
+                        isVisible(elem) &&
+                        (zIndex > 999 || position === 'fixed' || position === 'absolute') &&
+                        (
+                            elem.offsetWidth > window.innerWidth * 0.5 ||
+                            elem.offsetHeight > window.innerHeight * 0.5 ||
+                            style.backgroundColor.includes('rgba') ||
+                            parseFloat(style.opacity) < 1
+                        )
+                    ) {
+                        elem.remove();
+                    }
+                }
+
+                // Remove elements matching common selectors
+                for (const selector of commonSelectors) {
+                    const elements = document.querySelectorAll(selector);
+                    elements.forEach(elem => {
+                        if (isVisible(elem)) {
+                            elem.remove();
+                        }
+                    });
+                }
+            };
+
+            // Remove overlay elements
+            removeOverlays();
+
+            // Remove any fixed/sticky position elements at the top/bottom
+            const removeFixedElements = () => {
+                const elements = document.querySelectorAll('*');
+                elements.forEach(elem => {
+                    const style = window.getComputedStyle(elem);
+                    if (
+                        (style.position === 'fixed' || style.position === 'sticky') &&
+                        isVisible(elem)
+                    ) {
+                        elem.remove();
+                    }
+                });
+            };
+
+            removeFixedElements();
+
+            // Remove empty block elements such as div, p, span, etc.
+            const removeEmptyBlockElements = () => {
+                const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6');
+                blockElements.forEach(elem => {
+                    if (elem.innerText.trim() === '') {
+                        elem.remove();
+                    }
+                });
+            };
+
+            // Remove margin-right and padding-right from body (often added by modal scripts)
+            document.body.style.marginRight = '0px';
+            document.body.style.paddingRight = '0px';
+            document.body.style.overflow = 'auto';
+
+            // Wait a bit for any animations to complete
+            await new Promise(resolve => setTimeout(resolve, 100));
+        }
+        """
+
+        try:
+            await page.evaluate(remove_overlays_js)
+            await page.wait_for_timeout(500)  # Wait for any animations to complete
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Failed to remove overlay elements: {str(e)}")
+
+    async def take_screenshot(self, page: Page) -> str:
+        try:
+            # The page is already loaded, so just take the screenshot
+            screenshot = await page.screenshot(full_page=True)
+            return base64.b64encode(screenshot).decode('utf-8')
+        except Exception as e:
+            error_message = f"Failed to take screenshot: {str(e)}"
+            print(error_message)
+
+            # Generate an error image
+            img = Image.new('RGB', (800, 600), color='black')
+            draw = ImageDraw.Draw(img)
+            font = ImageFont.load_default()
+            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            return base64.b64encode(buffered.getvalue()).decode('utf-8')
+        finally:
+            await page.close()
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 005523eb..b66173a7 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -23,13 +23,15 @@ class AsyncWebCrawler:
         self,
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_by_pass_cache: bool = False,
+        base_directory: str = str(Path.home()),
         **kwargs,
     ):
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
             **kwargs
         )
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         self.ready = False
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 2f33a8a9..a07ca977 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -37,16 +37,9 @@ SOCIAL_MEDIA_DOMAINS = [
     'linkedin.com',
     'instagram.com',
     'pinterest.com',
-    'youtube.com',
     'tiktok.com',
     'snapchat.com',
-    'whatsapp.com',
-    'messenger.com',
     'reddit.com',
-    'tumblr.com',
-    'buffer.com',
-    'xing.com',
-    'flipboard.com',
 ]
 
 # Threshold for the Image extraction - Range is 1 to 6
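Note: a small sketch of the new `base_directory` parameter shown above; with it, the `.crawl4ai` cache folder is created under the given path instead of the home directory. The `/tmp/crawl4ai-cache` path is only an example.

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    # Cache files go under /tmp/crawl4ai-cache/.crawl4ai instead of ~/.crawl4ai
    async with AsyncWebCrawler(base_directory="/tmp/crawl4ai-cache") as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.success)


asyncio.run(main())
```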
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py
index 0c472f0d..7799de66 100644
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -14,7 +14,10 @@ from .utils import (
     sanitize_html,
     extract_metadata,
     InvalidCSSSelectorError,
-    CustomHTML2Text
+    CustomHTML2Text,
+    normalize_url,
+    is_external_url
+
 )
 
 class ContentScrappingStrategy(ABC):
@@ -67,6 +70,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
 
         links = {'internal': [], 'external': []}
         media = {'images': [], 'videos': [], 'audios': []}
+        internal_links_dict = {}
+        external_links_dict = {}
 
         # Extract meaningful text for media files from closest parent
         def find_closest_parent_with_useful_text(tag):
@@ -205,30 +210,55 @@ class WebScrappingStrategy(ContentScrappingStrategy):
             social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
             social_media_domains = list(set(social_media_domains))
+
             try:
                 if element.name == 'a' and element.get('href'):
-                    href = element['href']
+                    href = element.get('href', '').strip()
+                    if not href:  # Skip empty hrefs
+                        return False
+
                     url_base = url.split('/')[2]
-                    link_data = {'href': href, 'text': element.get_text()}
-                    if href.startswith('http') and url_base not in href:
-                        links['external'].append(link_data)
+
+                    # Normalize the URL
+                    try:
+                        normalized_href = normalize_url(href, url)
+                    except ValueError as e:
+                        # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
+                        return False
+
+                    link_data = {
+                        'href': normalized_href,
+                        'text': element.get_text().strip(),
+                        'title': element.get('title', '').strip()
+                    }
+
+                    # Check for duplicates and add to appropriate dictionary
+                    is_external = is_external_url(normalized_href, url_base)
+                    if is_external:
+                        if normalized_href not in external_links_dict:
+                            external_links_dict[normalized_href] = link_data
                     else:
-                        links['internal'].append(link_data)
+                        if normalized_href not in internal_links_dict:
+                            internal_links_dict[normalized_href] = link_data
+
                     keep_element = True
-                    if kwargs.get('exclude_external_links', False):
-                        href_parts = href.split('/')
-                        href_url_base = href_parts[2] if len(href_parts) > 2 else href
-                        if url_base not in href_url_base:
-                            element.decompose()
-                            return False
-
-                    if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
-                        if any(domain in href for domain in social_media_domains):
+                    # Handle external link exclusions
+                    if is_external:
+                        if kwargs.get('exclude_external_links', False):
                             element.decompose()
                             return False
+                        elif kwargs.get('exclude_social_media_links', False):
+                            if any(domain in normalized_href.lower() for domain in social_media_domains):
+                                element.decompose()
+                                return False
+                        elif kwargs.get('exclude_domains', []):
+                            if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
+                                element.decompose()
+                                return False
+
             except Exception as e:
-                raise "Error processing links"
+                raise Exception(f"Error processing links: {str(e)}")
 
             try:
                 if element.name == 'img':
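Note: a hedged sketch of how the link-filtering options used in this hunk (`exclude_external_links`, `exclude_social_media_links`, `exclude_domains`) might be passed at crawl time, assuming, as elsewhere in the library, that extra keyword arguments to `arun` are forwarded to the scraping strategy. The domain list is illustrative.

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            exclude_external_links=False,         # keep external links in result.links['external']
            exclude_social_media_links=True,      # drop links matching SOCIAL_MEDIA_DOMAINS
            exclude_domains=["ads.example.com"],  # hypothetical extra domains to drop
        )
        # Each entry is unique by normalized href and carries href/text/title
        for link in result.links["internal"]:
            print(link["href"], "-", link["text"])


asyncio.run(main())
```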
@@ -252,12 +282,18 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                             element.decompose()
                             return False
 
-                    if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
+                    if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
                         src_url_base = src.split('/')[2]
                         url_base = url.split('/')[2]
                         if any(domain in src for domain in social_media_domains):
                             element.decompose()
                             return False
+
+                    # Handle exclude domains
+                    if kwargs.get('exclude_domains', []):
+                        if any(domain in src for domain in kwargs.get('exclude_domains', [])):
+                            element.decompose()
+                            return False
 
                     return True  # Always keep image elements
             except Exception as e:
@@ -328,6 +364,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
         # ]
 
         process_element(body)
+
+        # Update the links dictionary with unique links
+        links['internal'] = list(internal_links_dict.values())
+        links['external'] = list(external_links_dict.values())
+
         # # Process images using ThreadPoolExecutor
         imgs = body.find_all('img')
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 046067d8..b79e0c43 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
         self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
         self.apply_chunking = kwargs.get("apply_chunking", True)
         self.base_url = kwargs.get("base_url", None)
+        self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
         self.extra_args = kwargs.get("extra_args", {})
         if not self.apply_chunking:
             self.chunk_token_threshold = 1e9
@@ -116,7 +117,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             self.provider,
             prompt_with_variables,
             self.api_token,
-            base_url=self.base_url,
+            base_url=self.api_base or self.base_url,
             extra_args = self.extra_args
         )  # , json_response=self.extract_type == "schema")
         try:
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 34ab219b..baa08a0f 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -980,4 +980,53 @@ def format_html(html_string):
     soup = BeautifulSoup(html_string, 'html.parser')
     return soup.prettify()
 
+def normalize_url(href, base_url):
+    """Normalize URLs to ensure consistent format"""
+    # Extract protocol and domain from base URL
+    try:
+        base_parts = base_url.split('/')
+        protocol = base_parts[0]
+        domain = base_parts[2]
+    except IndexError:
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    # Handle special protocols
+    special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+    if any(href.lower().startswith(proto) for proto in special_protocols):
+        return href.strip()
+
+    # Handle anchor links
+    if href.startswith('#'):
+        return f"{base_url}{href}"
+
+    # Handle protocol-relative URLs
+    if href.startswith('//'):
+        return f"{protocol}{href}"
+
+    # Handle root-relative URLs
+    if href.startswith('/'):
+        return f"{protocol}//{domain}{href}"
+
+    # Handle relative URLs
+    if not href.startswith(('http://', 'https://')):
+        # Strip a leading './' prefix if present
+        href = href[2:] if href.startswith('./') else href
+        return f"{protocol}//{domain}/{href}"
+
+    return href.strip()
+def is_external_url(url, base_domain):
+    """Determine if a URL is external"""
+    special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+    if any(url.lower().startswith(proto) for proto in special_protocols):
+        return True
+
+    try:
+        # Handle URLs with protocol
+        if url.startswith(('http://', 'https://')):
+            url_domain = url.split('/')[2]
+            return base_domain.lower() not in url_domain.lower()
+    except IndexError:
+        return False
+
+    return False
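Note: expected behaviour of the two helpers added to `crawl4ai/utils.py`, traced from the implementation above; the URLs are illustrative.

```python
from crawl4ai.utils import normalize_url, is_external_url

base = "https://example.com/blog/post"

print(normalize_url("/docs/intro", base))                # https://example.com/docs/intro
print(normalize_url("//cdn.example.com/app.js", base))   # https://cdn.example.com/app.js
print(normalize_url("#comments", base))                  # https://example.com/blog/post#comments
print(normalize_url("mailto:team@example.com", base))    # mailto:team@example.com (returned as-is)

print(is_external_url("https://twitter.com/example", "example.com"))  # True
print(is_external_url("https://example.com/about", "example.com"))    # False
print(is_external_url("/about", "example.com"))                       # False (relative URLs count as internal)
```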