feat(scraping): add LXML-based scraping mode for improved performance
Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

BREAKING CHANGE: None
This commit is contained in:
@@ -868,6 +868,63 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
'metadata': meta
|
||||
}
|
||||
|
||||
def extract_metadata_using_lxml(html, doc=None):
    """
    Extract page metadata from the document ``<head>`` using lxml XPath queries.

    Args:
        html: Raw HTML markup. Only parsed when ``doc`` is None.
        doc: Optional already-parsed lxml document; when given, ``html`` is
            ignored and no parsing takes place.

    Returns:
        dict: An empty dict when there is nothing to parse, when parsing
        fails, or when the document has no ``<head>``. Otherwise a dict that
        always holds ``title``, ``description``, ``keywords`` and ``author``
        (each a stripped string, or None when absent), plus one entry per
        non-empty Open Graph (``og:*``) and Twitter Card (``twitter:*``)
        meta tag, keyed by the tag's property/name.
    """
    if doc is None:
        if not html:
            return {}
        try:
            doc = lhtml.document_fromstring(html)
        except ValueError:
            # lxml refuses ``str`` input that starts with an XML declaration
            # carrying an encoding (typical for XHTML pages). The text is
            # already decoded, so drop the declaration and retry once instead
            # of silently losing all metadata for such pages.
            if not isinstance(html, str):
                return {}
            import re

            stripped = re.sub(r'^[\s\ufeff]*<\?xml[^>]*\?>', '', html, count=1)
            if stripped == html:
                return {}
            try:
                doc = lhtml.document_fromstring(stripped)
            except Exception:
                return {}
        except Exception:
            # Metadata extraction is best-effort: any parser failure yields
            # an empty result rather than propagating to the caller.
            return {}

    heads = doc.xpath('//head')
    if not heads:
        return {}
    head = heads[0]

    metadata = {}

    # Title
    title = head.xpath('.//title/text()')
    metadata['title'] = title[0].strip() if title else None

    # Standard named meta tags; the first matching tag wins.
    for key in ('description', 'keywords', 'author'):
        values = head.xpath(f'.//meta[@name="{key}"]/@content')
        metadata[key] = values[0].strip() if values else None

    # Open Graph tags are keyed by ``property``, Twitter Card tags by ``name``.
    social_queries = (
        ('.//meta[starts-with(@property, "og:")]', 'property'),
        ('.//meta[starts-with(@name, "twitter:")]', 'name'),
    )
    for query, key_attr in social_queries:
        for tag in head.xpath(query):
            property_name = tag.get(key_attr, '').strip()
            content = tag.get('content', '').strip()
            # Skip tags whose key or content is blank after stripping.
            if property_name and content:
                metadata[property_name] = content

    return metadata
|
||||
|
||||
|
||||
def extract_metadata(html, soup=None):
|
||||
"""
|
||||
Extract optimized content, media, and links from website HTML.
|
||||
|
||||
Reference in New Issue
Block a user