feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern
websites by automatically removing intrusive overlays and providing better
screenshot capabilities. URL handling is now more robust with proper
normalization and duplicate detection. The cache system is more flexible
with customizable base directory support.

Breaking changes: None
Issue numbers: None
Author: UncleCode
Date: 2024-10-24 20:22:47 +08:00
Parent: 60ba131ac8
Commit: bcfe83f702
7 changed files with 319 additions and 50 deletions


@@ -1,5 +1,52 @@
# Changelog
## [v0.3.73] - 2024-10-24
### Added
- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
  - Automatic removal of popups, modals, and cookie notices
  - Detection and removal of fixed/sticky position elements
  - Cleaning of empty block elements
  - Configurable via `remove_overlay_elements` parameter
- Enhanced screenshot capabilities:
  - Added `screenshot_wait_for` parameter to control timing
  - Improved screenshot handling with existing page context
  - Better error handling with fallback error images
- New URL normalization utilities:
  - `normalize_url` function for consistent URL formatting
  - `is_external_url` function for better link classification
- Custom base directory support for cache storage:
  - New `base_directory` parameter in AsyncWebCrawler
  - Allows specifying alternative locations for the `.crawl4ai` folder
### Enhanced
- Link handling improvements:
  - Better duplicate link detection
  - Enhanced internal/external link classification
  - Improved handling of special URL protocols
  - Support for anchor links and protocol-relative URLs
- Configuration refinements:
  - Streamlined social media domain list
  - More focused external content filtering
- LLM extraction strategy:
  - Added support for separate API base URL via `api_base` parameter
  - Better handling of base URLs in configuration
### Fixed
- Screenshot functionality:
  - Resolved issues with screenshot timing and context
  - Improved error handling and recovery
- Link processing:
  - Fixed URL normalization edge cases
  - Better handling of invalid URLs
  - Improved error messages for link processing failures
### Developer Notes
- The overlay removal system uses advanced JavaScript injection for better compatibility
- URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs
- Screenshot system now reuses existing page context for better performance
- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
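Taken together, these notes translate into a single crawl call. The snippet below is a minimal usage sketch (not part of this release's diff), assuming the usual `AsyncWebCrawler.arun` entry point forwards keyword arguments such as `remove_overlay_elements`, `screenshot`, and `screenshot_wait_for` down to the Playwright strategy:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            remove_overlay_elements=True,  # strip popups/modals before extracting HTML
            screenshot=True,               # capture a full-page screenshot
            screenshot_wait_for=2,         # wait ~2 seconds before the capture
        )
        print(len(result.html or ""), "bytes of cleaned HTML")

asyncio.run(main())
```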
## [v0.3.72] - 2024-10-22
### Added


@@ -51,7 +51,7 @@ class AsyncCrawlerStrategy(ABC):
        pass

    @abstractmethod
-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, **kwargs) -> str:
        pass

    @abstractmethod
@@ -502,13 +502,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        if delay_before_return_html:
            await asyncio.sleep(delay_before_return_html)

+        # Check for remove_overlay_elements parameter
+        if kwargs.get("remove_overlay_elements", False):
+            await self.remove_overlay_elements(page)

        html = await page.content()
        await self.execute_hook('before_return_html', page, html)

        # Check if kwargs has screenshot=True then take screenshot
        screenshot_data = None
        if kwargs.get("screenshot"):
-            screenshot_data = await self.take_screenshot(url)
+            # Check we have screenshot_wait_for parameter, if we have simply wait for that time
+            screenshot_wait_for = kwargs.get("screenshot_wait_for")
+            if screenshot_wait_for:
+                await asyncio.sleep(screenshot_wait_for)
+            screenshot_data = await self.take_screenshot(page)

        if self.verbose:
            print(f"[LOG] ✅ Crawled {url} successfully!")
@@ -559,13 +567,141 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]

-    async def take_screenshot(self, url: str, wait_time=1000) -> str:
-        async with await self.browser.new_context(user_agent=self.user_agent) as context:
-            page = await context.new_page()
+    async def remove_overlay_elements(self, page: Page) -> None:
+        """
+        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
+
+        Args:
+            page (Page): The Playwright page instance
+        """
remove_overlays_js = """
async () => {
// Function to check if element is visible
const isVisible = (elem) => {
const style = window.getComputedStyle(elem);
return style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0';
};
// Common selectors for popups and overlays
const commonSelectors = [
// Close buttons first
'button[class*="close" i]', 'button[class*="dismiss" i]',
'button[aria-label*="close" i]', 'button[title*="close" i]',
'a[class*="close" i]', 'span[class*="close" i]',
// Cookie notices
'[class*="cookie-banner" i]', '[id*="cookie-banner" i]',
'[class*="cookie-consent" i]', '[id*="cookie-consent" i]',
// Newsletter/subscription dialogs
'[class*="newsletter" i]', '[class*="subscribe" i]',
// Generic popups/modals
'[class*="popup" i]', '[class*="modal" i]',
'[class*="overlay" i]', '[class*="dialog" i]',
'[role="dialog"]', '[role="alertdialog"]'
];
// Try to click close buttons first
for (const selector of commonSelectors.slice(0, 6)) {
const closeButtons = document.querySelectorAll(selector);
for (const button of closeButtons) {
if (isVisible(button)) {
try {
button.click();
await new Promise(resolve => setTimeout(resolve, 100));
} catch (e) {
console.log('Error clicking button:', e);
}
}
}
}
// Remove remaining overlay elements
const removeOverlays = () => {
// Find elements with high z-index
const allElements = document.querySelectorAll('*');
for (const elem of allElements) {
const style = window.getComputedStyle(elem);
const zIndex = parseInt(style.zIndex);
const position = style.position;
if (
isVisible(elem) &&
(zIndex > 999 || position === 'fixed' || position === 'absolute') &&
(
elem.offsetWidth > window.innerWidth * 0.5 ||
elem.offsetHeight > window.innerHeight * 0.5 ||
style.backgroundColor.includes('rgba') ||
parseFloat(style.opacity) < 1
)
) {
elem.remove();
}
}
// Remove elements matching common selectors
for (const selector of commonSelectors) {
const elements = document.querySelectorAll(selector);
elements.forEach(elem => {
if (isVisible(elem)) {
elem.remove();
}
});
}
};
// Remove overlay elements
removeOverlays();
// Remove any fixed/sticky position elements at the top/bottom
const removeFixedElements = () => {
const elements = document.querySelectorAll('*');
elements.forEach(elem => {
const style = window.getComputedStyle(elem);
if (
(style.position === 'fixed' || style.position === 'sticky') &&
isVisible(elem)
) {
elem.remove();
}
});
};
removeFixedElements();
// Remove empty block elements as: div, p, span, etc.
const removeEmptyBlockElements = () => {
const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6');
blockElements.forEach(elem => {
if (elem.innerText.trim() === '') {
elem.remove();
}
});
};
// Remove margin-right and padding-right from body (often added by modal scripts)
document.body.style.marginRight = '0px';
document.body.style.paddingRight = '0px';
document.body.style.overflow = 'auto';
// Wait a bit for any animations to complete
await new Promise(resolve => setTimeout(resolve, 100));
}
"""
        try:
-            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-            # Wait for a specified time (default is 1 second)
-            await page.wait_for_timeout(wait_time)
+            await page.evaluate(remove_overlays_js)
+            await page.wait_for_timeout(500)  # Wait for any animations to complete
+        except Exception as e:
+            if self.verbose:
+                print(f"Warning: Failed to remove overlay elements: {str(e)}")
+
+    async def take_screenshot(self, page: Page) -> str:
+        try:
+            # The page is already loaded, just take the screenshot
            screenshot = await page.screenshot(full_page=True)
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
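Since `take_screenshot` returns the capture as a base64 string, persisting it is a one-liner on the caller's side. A small sketch; the `screenshot` attribute name on the crawl result is an assumption, not something this diff defines:

```python
import base64

def save_screenshot(result, path: str = "page.png") -> None:
    """Write the base64-encoded screenshot from a crawl result to disk (field name assumed)."""
    data = getattr(result, "screenshot", None)
    if data:
        with open(path, "wb") as f:
            f.write(base64.b64decode(data))
```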


@@ -23,13 +23,15 @@ class AsyncWebCrawler:
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        always_by_pass_cache: bool = False,
+        base_directory: str = str(Path.home()),
        **kwargs,
    ):
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            **kwargs
        )
        self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        self.ready = False
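With the new parameter, relocating the `.crawl4ai` cache folder becomes a constructor-level choice. A minimal sketch (the path is illustrative):

```python
from crawl4ai import AsyncWebCrawler

# Cache and related files will be created under /data/crawler/.crawl4ai
# instead of the default ~/.crawl4ai; the path here is only an example.
crawler = AsyncWebCrawler(base_directory="/data/crawler")
```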


@@ -37,16 +37,9 @@ SOCIAL_MEDIA_DOMAINS = [
    'linkedin.com',
    'instagram.com',
    'pinterest.com',
-    'youtube.com',
    'tiktok.com',
    'snapchat.com',
-    'whatsapp.com',
-    'messenger.com',
    'reddit.com',
-    'tumblr.com',
-    'buffer.com',
-    'xing.com',
-    'flipboard.com',
]

# Threshold for the Image extraction - Range is 1 to 6


@@ -14,7 +14,10 @@ from .utils import (
    sanitize_html,
    extract_metadata,
    InvalidCSSSelectorError,
-    CustomHTML2Text
+    CustomHTML2Text,
+    normalize_url,
+    is_external_url
)

class ContentScrappingStrategy(ABC):
@@ -67,6 +70,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        links = {'internal': [], 'external': []}
        media = {'images': [], 'videos': [], 'audios': []}
+        internal_links_dict = {}
+        external_links_dict = {}

        # Extract meaningful text for media files from closest parent
        def find_closest_parent_with_useful_text(tag):
@@ -205,30 +210,55 @@ class WebScrappingStrategy(ContentScrappingStrategy):
            social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
            social_media_domains = list(set(social_media_domains))

            try:
                if element.name == 'a' and element.get('href'):
-                    href = element['href']
+                    href = element.get('href', '').strip()
+                    if not href:  # Skip empty hrefs
+                        return False
                    url_base = url.split('/')[2]
-                    link_data = {'href': href, 'text': element.get_text()}

-                    if href.startswith('http') and url_base not in href:
-                        links['external'].append(link_data)
-                    else:
-                        links['internal'].append(link_data)
+                    # Normalize the URL
+                    try:
+                        normalized_href = normalize_url(href, url)
+                    except ValueError as e:
+                        # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
+                        return False
+
+                    link_data = {
+                        'href': normalized_href,
+                        'text': element.get_text().strip(),
+                        'title': element.get('title', '').strip()
+                    }
+
+                    # Check for duplicates and add to appropriate dictionary
+                    is_external = is_external_url(normalized_href, url_base)
+                    if is_external:
+                        if normalized_href not in external_links_dict:
+                            external_links_dict[normalized_href] = link_data
+                    else:
+                        if normalized_href not in internal_links_dict:
+                            internal_links_dict[normalized_href] = link_data

                    keep_element = True

-                    if kwargs.get('exclude_external_links', False):
-                        href_parts = href.split('/')
-                        href_url_base = href_parts[2] if len(href_parts) > 2 else href
-                        if url_base not in href_url_base:
-                            element.decompose()
-                            return False
-
-                    if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
-                        if any(domain in href for domain in social_media_domains):
-                            element.decompose()
-                            return False
+                    # Handle external link exclusions
+                    if is_external:
+                        if kwargs.get('exclude_external_links', False):
+                            element.decompose()
+                            return False
+                        elif kwargs.get('exclude_social_media_links', False):
+                            if any(domain in normalized_href.lower() for domain in social_media_domains):
+                                element.decompose()
+                                return False
+                        elif kwargs.get('exclude_domains', []):
+                            if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
+                                element.decompose()
+                                return False

            except Exception as e:
-                raise "Error processing links"
+                raise Exception(f"Error processing links: {str(e)}")

            try:
                if element.name == 'img':
@@ -252,13 +282,19 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                        element.decompose()
                        return False

-                if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
+                if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
                    src_url_base = src.split('/')[2]
                    url_base = url.split('/')[2]
                    if any(domain in src for domain in social_media_domains):
                        element.decompose()
                        return False

+                # Handle exclude domains
+                if kwargs.get('exclude_domains', []):
+                    if any(domain in src for domain in kwargs.get('exclude_domains', [])):
+                        element.decompose()
+                        return False

                return True  # Always keep image elements
            except Exception as e:
                raise "Error processing images"
@@ -329,6 +365,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        process_element(body)

+        # Update the links dictionary with unique links
+        links['internal'] = list(internal_links_dict.values())
+        links['external'] = list(external_links_dict.values())

        # # Process images using ThreadPoolExecutor
        imgs = body.find_all('img')
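For callers, the deduplicated link dictionaries and the exclusion switches surface through the crawl keyword arguments. A hedged sketch, assuming `AsyncWebCrawler.arun` forwards these options to the scraping strategy and exposes the collected links on the result:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/blog",
            exclude_external_links=False,         # keep external links in the output
            exclude_social_media_links=True,      # drop links matching SOCIAL_MEDIA_DOMAINS
            exclude_domains=["ads.example.net"],  # drop links to specific domains
            social_media_domains=["mastodon.social"],  # extend the built-in list
        )
        # Links are deduplicated via the internal/external dictionaries shown above.
        print(len(result.links["internal"]), "unique internal links")
        print(len(result.links["external"]), "unique external links")

asyncio.run(main())
```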


@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
        self.apply_chunking = kwargs.get("apply_chunking", True)
        self.base_url = kwargs.get("base_url", None)
+        self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
        self.extra_args = kwargs.get("extra_args", {})

        if not self.apply_chunking:
            self.chunk_token_threshold = 1e9
@@ -116,7 +117,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
            self.provider,
            prompt_with_variables,
            self.api_token,
-            base_url=self.base_url,
+            base_url=self.api_base or self.base_url,
            extra_args = self.extra_args
        ) # , json_response=self.extract_type == "schema")
        try:
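In practice, the new parameter lets the completion endpoint differ from any other `base_url` usage. A minimal sketch; the provider string, token, and endpoint below are placeholders, not values from this commit:

```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# api_base takes precedence over base_url when the completion request is made;
# provider, api_token, and the URL are illustrative only.
strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token="YOUR_API_KEY",
    api_base="https://my-llm-proxy.internal/v1",
    instruction="Extract the main article text as markdown.",
)
```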


@@ -980,4 +980,53 @@ def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.prettify()
def normalize_url(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # Extract protocol and domain from base URL
    try:
        base_parts = base_url.split('/')
        protocol = base_parts[0]
        domain = base_parts[2]
    except IndexError:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Handle special protocols
    special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
    if any(href.lower().startswith(proto) for proto in special_protocols):
        return href.strip()

    # Handle anchor links
    if href.startswith('#'):
        return f"{base_url}{href}"

    # Handle protocol-relative URLs
    if href.startswith('//'):
        return f"{protocol}{href}"

    # Handle root-relative URLs
    if href.startswith('/'):
        return f"{protocol}//{domain}{href}"

    # Handle relative URLs
    if not href.startswith(('http://', 'https://')):
        # Remove leading './' if present
        href = href.lstrip('./')
        return f"{protocol}//{domain}/{href}"

    return href.strip()

def is_external_url(url, base_domain):
    """Determine if a URL is external"""
    special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
    if any(url.lower().startswith(proto) for proto in special_protocols):
        return True

    try:
        # Handle URLs with protocol
        if url.startswith(('http://', 'https://')):
            url_domain = url.split('/')[2]
            return base_domain.lower() not in url_domain.lower()
    except IndexError:
        return False

    return False
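A few illustrative calls against the two helpers above; the expected values in the comments follow directly from the code, while the import path is assumed from the `.utils` module referenced earlier:

```python
from crawl4ai.utils import normalize_url, is_external_url  # import path assumed

base = "https://example.com/blog/post"

print(normalize_url("#comments", base))                 # https://example.com/blog/post#comments
print(normalize_url("/about", base))                    # https://example.com/about
print(normalize_url("//cdn.example.org/app.js", base))  # https://cdn.example.org/app.js
print(normalize_url("docs/intro.html", base))           # https://example.com/docs/intro.html (resolved against the domain root)
print(normalize_url("mailto:team@example.com", base))   # returned unchanged

print(is_external_url("https://twitter.com/crawl4ai", "example.com"))  # True
print(is_external_url("https://example.com/contact", "example.com"))   # False
print(is_external_url("/relative/path", "example.com"))                # False
```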