feat: enhance crawler with overlay removal and improved screenshot capabilities

• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern
websites by automatically removing intrusive overlays and providing better
screenshot capabilities. URL handling is now more robust with proper
normalization and duplicate detection. The cache system is more flexible
with customizable base directory support.

Breaking changes: None
Issue numbers: None
This commit is contained in:
UncleCode
2024-10-24 20:22:47 +08:00
parent 60ba131ac8
commit bcfe83f702
7 changed files with 319 additions and 50 deletions

View File

@@ -14,7 +14,10 @@ from .utils import (
sanitize_html,
extract_metadata,
InvalidCSSSelectorError,
CustomHTML2Text
CustomHTML2Text,
normalize_url,
is_external_url
)
class ContentScrappingStrategy(ABC):
@@ -67,6 +70,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
links = {'internal': [], 'external': []}
media = {'images': [], 'videos': [], 'audios': []}
internal_links_dict = {}
external_links_dict = {}
# Extract meaningful text for media files from closest parent
def find_closest_parent_with_useful_text(tag):
@@ -205,30 +210,55 @@ class WebScrappingStrategy(ContentScrappingStrategy):
social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
social_media_domains = list(set(social_media_domains))
try:
if element.name == 'a' and element.get('href'):
href = element['href']
href = element.get('href', '').strip()
if not href: # Skip empty hrefs
return False
url_base = url.split('/')[2]
link_data = {'href': href, 'text': element.get_text()}
if href.startswith('http') and url_base not in href:
links['external'].append(link_data)
# Normalize the URL
try:
normalized_href = normalize_url(href, url)
except ValueError as e:
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
return False
link_data = {
'href': normalized_href,
'text': element.get_text().strip(),
'title': element.get('title', '').strip()
}
# Check for duplicates and add to appropriate dictionary
is_external = is_external_url(normalized_href, url_base)
if is_external:
if normalized_href not in external_links_dict:
external_links_dict[normalized_href] = link_data
else:
links['internal'].append(link_data)
if normalized_href not in internal_links_dict:
internal_links_dict[normalized_href] = link_data
keep_element = True
if kwargs.get('exclude_external_links', False):
href_parts = href.split('/')
href_url_base = href_parts[2] if len(href_parts) > 2 else href
if url_base not in href_url_base:
element.decompose()
return False
if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
if any(domain in href for domain in social_media_domains):
# Handle external link exclusions
if is_external:
if kwargs.get('exclude_external_links', False):
element.decompose()
return False
elif kwargs.get('exclude_social_media_links', False):
if any(domain in normalized_href.lower() for domain in social_media_domains):
element.decompose()
return False
elif kwargs.get('exclude_domains', []):
if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
element.decompose()
return False
except Exception as e:
raise "Error processing links"
raise Exception(f"Error processing links: {str(e)}")
try:
if element.name == 'img':
@@ -252,12 +282,18 @@ class WebScrappingStrategy(ContentScrappingStrategy):
element.decompose()
return False
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
src_url_base = src.split('/')[2]
url_base = url.split('/')[2]
if any(domain in src for domain in social_media_domains):
element.decompose()
return False
# Handle exclude domains
if kwargs.get('exclude_domains', []):
if any(domain in src for domain in kwargs.get('exclude_domains', [])):
element.decompose()
return False
return True # Always keep image elements
except Exception as e:
@@ -328,6 +364,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
# ]
process_element(body)
# Update the links dictionary with unique links
links['internal'] = list(internal_links_dict.values())
links['external'] = list(external_links_dict.values())
# # Process images using ThreadPoolExecutor
imgs = body.find_all('img')