feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern
websites by automatically removing intrusive overlays and providing better
screenshot capabilities. URL handling is now more robust with proper
normalization and duplicate detection. The cache system is more flexible
with customizable base directory support.

Breaking changes: None
Issue numbers: None
This commit is contained in:
UncleCode
2024-10-24 20:22:47 +08:00
parent 60ba131ac8
commit bcfe83f702
7 changed files with 319 additions and 50 deletions

View File

@@ -1,5 +1,52 @@
# Changelog
## [v0.3.73] - 2024-10-24
### Added
- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy:
- Automatic removal of popups, modals, and cookie notices
- Detection and removal of fixed/sticky position elements
- Cleaning of empty block elements
- Configurable via `remove_overlay_elements` parameter
- Enhanced screenshot capabilities:
- Added `screenshot_wait_for` parameter to control timing
- Improved screenshot handling with existing page context
- Better error handling with fallback error images
- New URL normalization utilities:
- `normalize_url` function for consistent URL formatting
- `is_external_url` function for better link classification
- Custom base directory support for cache storage:
- New `base_directory` parameter in AsyncWebCrawler
- Allows specifying alternative locations for `.crawl4ai` folder
### Enhanced
- Link handling improvements:
- Better duplicate link detection
- Enhanced internal/external link classification
- Improved handling of special URL protocols
- Support for anchor links and protocol-relative URLs
- Configuration refinements:
- Streamlined social media domain list
- More focused external content filtering
- LLM extraction strategy:
- Added support for separate API base URL via `api_base` parameter
- Better handling of base URLs in configuration
### Fixed
- Screenshot functionality:
- Resolved issues with screenshot timing and context
- Improved error handling and recovery
- Link processing:
- Fixed URL normalization edge cases
- Better handling of invalid URLs
- Improved error messages for link processing failures
### Developer Notes
- The overlay removal system uses advanced JavaScript injection for better compatibility
- URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs
- Screenshot system now reuses existing page context for better performance
- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness
## [v0.3.72] - 2024-10-22
### Added

View File

@@ -51,7 +51,7 @@ class AsyncCrawlerStrategy(ABC):
pass
@abstractmethod
async def take_screenshot(self, url: str) -> str:
async def take_screenshot(self, **kwargs) -> str:
pass
@abstractmethod
@@ -502,13 +502,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)
# Check for remove_overlay_elements parameter
if kwargs.get("remove_overlay_elements", False):
await self.remove_overlay_elements(page)
html = await page.content()
await self.execute_hook('before_return_html', page, html)
# Check if kwargs has screenshot=True then take screenshot
screenshot_data = None
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
screenshot_wait_for = kwargs.get("screenshot_wait_for")
if screenshot_wait_for:
await asyncio.sleep(screenshot_wait_for)
screenshot_data = await self.take_screenshot(page)
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")
@@ -559,28 +567,156 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def take_screenshot(self, url: str, wait_time=1000) -> str:
async with await self.browser.new_context(user_agent=self.user_agent) as context:
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Wait for a specified time (default is 1 second)
await page.wait_for_timeout(wait_time)
screenshot = await page.screenshot(full_page=True)
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"
print(error_message)
async def remove_overlay_elements(self, page: Page) -> None:
"""
Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.
Args:
page (Page): The Playwright page instance
"""
remove_overlays_js = """
async () => {
// Function to check if element is visible
const isVisible = (elem) => {
const style = window.getComputedStyle(elem);
return style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0';
};
# Generate an error image
img = Image.new('RGB', (800, 600), color='black')
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
// Common selectors for popups and overlays
const commonSelectors = [
// Close buttons first
'button[class*="close" i]', 'button[class*="dismiss" i]',
'button[aria-label*="close" i]', 'button[title*="close" i]',
'a[class*="close" i]', 'span[class*="close" i]',
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode('utf-8')
finally:
await page.close()
// Cookie notices
'[class*="cookie-banner" i]', '[id*="cookie-banner" i]',
'[class*="cookie-consent" i]', '[id*="cookie-consent" i]',
// Newsletter/subscription dialogs
'[class*="newsletter" i]', '[class*="subscribe" i]',
// Generic popups/modals
'[class*="popup" i]', '[class*="modal" i]',
'[class*="overlay" i]', '[class*="dialog" i]',
'[role="dialog"]', '[role="alertdialog"]'
];
// Try to click close buttons first
for (const selector of commonSelectors.slice(0, 6)) {
const closeButtons = document.querySelectorAll(selector);
for (const button of closeButtons) {
if (isVisible(button)) {
try {
button.click();
await new Promise(resolve => setTimeout(resolve, 100));
} catch (e) {
console.log('Error clicking button:', e);
}
}
}
}
// Remove remaining overlay elements
const removeOverlays = () => {
// Find elements with high z-index
const allElements = document.querySelectorAll('*');
for (const elem of allElements) {
const style = window.getComputedStyle(elem);
const zIndex = parseInt(style.zIndex);
const position = style.position;
if (
isVisible(elem) &&
(zIndex > 999 || position === 'fixed' || position === 'absolute') &&
(
elem.offsetWidth > window.innerWidth * 0.5 ||
elem.offsetHeight > window.innerHeight * 0.5 ||
style.backgroundColor.includes('rgba') ||
parseFloat(style.opacity) < 1
)
) {
elem.remove();
}
}
// Remove elements matching common selectors
for (const selector of commonSelectors) {
const elements = document.querySelectorAll(selector);
elements.forEach(elem => {
if (isVisible(elem)) {
elem.remove();
}
});
}
};
// Remove overlay elements
removeOverlays();
// Remove any fixed/sticky position elements at the top/bottom
const removeFixedElements = () => {
const elements = document.querySelectorAll('*');
elements.forEach(elem => {
const style = window.getComputedStyle(elem);
if (
(style.position === 'fixed' || style.position === 'sticky') &&
isVisible(elem)
) {
elem.remove();
}
});
};
removeFixedElements();
// Remove empty block elements as: div, p, span, etc.
const removeEmptyBlockElements = () => {
const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6');
blockElements.forEach(elem => {
if (elem.innerText.trim() === '') {
elem.remove();
}
});
};
// Remove margin-right and padding-right from body (often added by modal scripts)
document.body.style.marginRight = '0px';
document.body.style.paddingRight = '0px';
document.body.style.overflow = 'auto';
// Wait a bit for any animations to complete
await new Promise(resolve => setTimeout(resolve, 100));
}
"""
try:
await page.evaluate(remove_overlays_js)
await page.wait_for_timeout(500) # Wait for any animations to complete
except Exception as e:
if self.verbose:
print(f"Warning: Failed to remove overlay elements: {str(e)}")
async def take_screenshot(self, page: Page) -> str:
try:
# The page is already loaded, just take the screenshot
screenshot = await page.screenshot(full_page=True)
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"
print(error_message)
# Generate an error image
img = Image.new('RGB', (800, 600), color='black')
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode('utf-8')
finally:
await page.close()

View File

@@ -23,13 +23,15 @@ class AsyncWebCrawler:
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
always_by_pass_cache: bool = False,
base_directory: str = str(Path.home()),
**kwargs,
):
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
**kwargs
)
self.always_by_pass_cache = always_by_pass_cache
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
# self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
self.ready = False

View File

@@ -37,16 +37,9 @@ SOCIAL_MEDIA_DOMAINS = [
'linkedin.com',
'instagram.com',
'pinterest.com',
'youtube.com',
'tiktok.com',
'snapchat.com',
'whatsapp.com',
'messenger.com',
'reddit.com',
'tumblr.com',
'buffer.com',
'xing.com',
'flipboard.com',
]
# Threshold for the Image extraction - Range is 1 to 6

View File

@@ -14,7 +14,10 @@ from .utils import (
sanitize_html,
extract_metadata,
InvalidCSSSelectorError,
CustomHTML2Text
CustomHTML2Text,
normalize_url,
is_external_url
)
class ContentScrappingStrategy(ABC):
@@ -67,6 +70,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
links = {'internal': [], 'external': []}
media = {'images': [], 'videos': [], 'audios': []}
internal_links_dict = {}
external_links_dict = {}
# Extract meaningful text for media files from closest parent
def find_closest_parent_with_useful_text(tag):
@@ -205,30 +210,55 @@ class WebScrappingStrategy(ContentScrappingStrategy):
social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
social_media_domains = list(set(social_media_domains))
try:
if element.name == 'a' and element.get('href'):
href = element['href']
href = element.get('href', '').strip()
if not href: # Skip empty hrefs
return False
url_base = url.split('/')[2]
link_data = {'href': href, 'text': element.get_text()}
if href.startswith('http') and url_base not in href:
links['external'].append(link_data)
# Normalize the URL
try:
normalized_href = normalize_url(href, url)
except ValueError as e:
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
return False
link_data = {
'href': normalized_href,
'text': element.get_text().strip(),
'title': element.get('title', '').strip()
}
# Check for duplicates and add to appropriate dictionary
is_external = is_external_url(normalized_href, url_base)
if is_external:
if normalized_href not in external_links_dict:
external_links_dict[normalized_href] = link_data
else:
links['internal'].append(link_data)
if normalized_href not in internal_links_dict:
internal_links_dict[normalized_href] = link_data
keep_element = True
if kwargs.get('exclude_external_links', False):
href_parts = href.split('/')
href_url_base = href_parts[2] if len(href_parts) > 2 else href
if url_base not in href_url_base:
element.decompose()
return False
if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
if any(domain in href for domain in social_media_domains):
# Handle external link exclusions
if is_external:
if kwargs.get('exclude_external_links', False):
element.decompose()
return False
elif kwargs.get('exclude_social_media_links', False):
if any(domain in normalized_href.lower() for domain in social_media_domains):
element.decompose()
return False
elif kwargs.get('exclude_domains', []):
if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
element.decompose()
return False
except Exception as e:
raise "Error processing links"
raise Exception(f"Error processing links: {str(e)}")
try:
if element.name == 'img':
@@ -252,12 +282,18 @@ class WebScrappingStrategy(ContentScrappingStrategy):
element.decompose()
return False
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
src_url_base = src.split('/')[2]
url_base = url.split('/')[2]
if any(domain in src for domain in social_media_domains):
element.decompose()
return False
# Handle exclude domains
if kwargs.get('exclude_domains', []):
if any(domain in src for domain in kwargs.get('exclude_domains', [])):
element.decompose()
return False
return True # Always keep image elements
except Exception as e:
@@ -328,6 +364,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
# ]
process_element(body)
# Update the links dictionary with unique links
links['internal'] = list(internal_links_dict.values())
links['external'] = list(external_links_dict.values())
# # Process images using ThreadPoolExecutor
imgs = body.find_all('img')

View File

@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
self.apply_chunking = kwargs.get("apply_chunking", True)
self.base_url = kwargs.get("base_url", None)
self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
self.extra_args = kwargs.get("extra_args", {})
if not self.apply_chunking:
self.chunk_token_threshold = 1e9
@@ -116,7 +117,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.provider,
prompt_with_variables,
self.api_token,
base_url=self.base_url,
base_url=self.api_base or self.base_url,
extra_args = self.extra_args
) # , json_response=self.extract_type == "schema")
try:

View File

@@ -980,4 +980,53 @@ def format_html(html_string):
soup = BeautifulSoup(html_string, 'html.parser')
return soup.prettify()
def normalize_url(href, base_url):
"""Normalize URLs to ensure consistent format"""
# Extract protocol and domain from base URL
try:
base_parts = base_url.split('/')
protocol = base_parts[0]
domain = base_parts[2]
except IndexError:
raise ValueError(f"Invalid base URL format: {base_url}")
# Handle special protocols
special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
if any(href.lower().startswith(proto) for proto in special_protocols):
return href.strip()
# Handle anchor links
if href.startswith('#'):
return f"{base_url}{href}"
# Handle protocol-relative URLs
if href.startswith('//'):
return f"{protocol}{href}"
# Handle root-relative URLs
if href.startswith('/'):
return f"{protocol}//{domain}{href}"
# Handle relative URLs
if not href.startswith(('http://', 'https://')):
# Remove leading './' if present
href = href.lstrip('./')
return f"{protocol}//{domain}/{href}"
return href.strip()
def is_external_url(url, base_domain):
"""Determine if a URL is external"""
special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
if any(url.lower().startswith(proto) for proto in special_protocols):
return True
try:
# Handle URLs with protocol
if url.startswith(('http://', 'https://')):
url_domain = url.split('/')[2]
return base_domain.lower() not in url_domain.lower()
except IndexError:
return False
return False