feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern
websites by automatically removing intrusive overlays and providing better
screenshot capabilities. URL handling is now more robust with proper
normalization and duplicate detection. The cache system is more flexible
with customizable base directory support.

Breaking changes: None
Issue numbers: None
This commit is contained in:
UncleCode
2024-10-24 20:22:47 +08:00
parent 60ba131ac8
commit bcfe83f702
7 changed files with 319 additions and 50 deletions

View File

@@ -980,4 +980,53 @@ def format_html(html_string):
soup = BeautifulSoup(html_string, 'html.parser')
return soup.prettify()
def normalize_url(href, base_url):
    """Resolve ``href`` against ``base_url`` and return an absolute URL.

    Args:
        href: The raw link target (absolute, protocol-relative,
            root-relative, path-relative or fragment-only). Surrounding
            whitespace is ignored.
        base_url: The absolute URL of the page the link was found on.

    Returns:
        The absolute URL. Links using a non-navigable scheme (``mailto:``,
        ``tel:``, ``ftp:``, ``file:``, ``data:``, ``javascript:``) are
        returned stripped but otherwise untouched.

    Raises:
        ValueError: If ``base_url`` does not contain at least a
            ``<scheme>//<host>`` part.
    """
    from urllib.parse import urljoin

    # A usable base needs at least "<scheme>", "" and "<host>" when split on
    # '/'; anything shorter cannot anchor a relative link.
    if len(base_url.split('/')) < 3:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Strip first so that padded values such as " /about " are classified by
    # their real first character.
    href = href.strip()

    # These schemes are not resolvable against a page URL; pass them through.
    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
    if href.lower().startswith(special_protocols):
        return href

    # urljoin implements standard reference resolution: it handles fragments,
    # protocol-relative and root-relative links, keeps absolute URLs as they
    # are, and resolves "./", "../" and plain relative paths against the
    # directory of base_url instead of the site root. It also avoids the
    # str.lstrip('./') pitfall of eating leading dots from names such as
    # ".well-known" or "../up.html".
    return urljoin(base_url, href)
def is_external_url(url, base_domain):
    """Return True when ``url`` points outside ``base_domain``.

    Args:
        url: The link to classify. Relative references (anything that is not
            ``http://``, ``https://`` or protocol-relative ``//host``) are
            considered internal.
        base_domain: Host name of the site being crawled, optionally with a
            ``:port`` suffix. Matching is case-insensitive.

    Returns:
        True for links with a non-navigable scheme (``mailto:``, ``tel:``,
        ``ftp:``, ``file:``, ``data:``, ``javascript:``) and for links whose
        host is neither ``base_domain`` nor one of its subdomains; False
        otherwise. When ``base_domain`` is empty, web URLs are reported as
        internal because nothing can be compared.
    """
    from urllib.parse import urlsplit

    special_protocols = ('mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:')
    lowered = url.lower()
    if lowered.startswith(special_protocols):
        return True

    # Only URLs that carry their own host can be external.
    if not lowered.startswith(('http://', 'https://', '//')):
        return False

    base = base_domain.lower()
    if not base:
        # No reference host: keep the historical "not external" answer.
        return False

    try:
        parts = urlsplit(url)
        # .hostname drops userinfo and port and is already lower-cased, so
        # "https://example.com@evil.org/" is judged by "evil.org".
        host = (parts.hostname or '').rstrip('.')
        port = parts.port
    except ValueError:
        # Malformed authority (bad port, broken brackets): the link cannot be
        # attributed to the base domain.
        return True

    # Split an optional numeric ":port" off the base domain.
    base_host, sep, base_port = base.rpartition(':')
    if not (sep and base_port.isdigit()):
        base_host, base_port = base, ''

    # When the caller pinned a port, a different port is a different origin.
    if base_port and port != int(base_port):
        return True

    # Compare on label boundaries: the host itself or a true subdomain.
    # A plain substring test would accept "notexample.com" and
    # "example.com.evil.org" as internal.
    same_site = host == base_host or host.endswith('.' + base_host)
    return not same_site