feat(scraping): add LXML-based scraping mode for improved performance

Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing.
LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

BREAKING CHANGE: None
This commit is contained in:
UncleCode
2025-01-12 20:46:23 +08:00
parent 825c78a048
commit f3ae5a657c
12 changed files with 1366 additions and 509 deletions

View File

@@ -868,6 +868,63 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
'metadata': meta
}
def extract_metadata_using_lxml(html, doc=None):
    """
    Collect page metadata from the document's <head> via XPath queries.

    Args:
        html: Raw HTML markup. Only parsed when ``doc`` is not supplied.
        doc: Optional already-parsed tree exposing ``xpath()``. When given,
            ``html`` is not parsed.

    Returns:
        dict: Empty when there is nothing to parse, when parsing fails, or
        when no <head> element is found. Otherwise it always holds the keys
        ``title``, ``description``, ``keywords`` and ``author`` (``None``
        when the corresponding element is absent), plus one entry per
        ``og:*`` / ``twitter:*`` meta tag that has both a non-blank name
        and non-blank content.
    """
    if doc is None:
        if not html:
            return {}
        try:
            doc = lhtml.document_fromstring(html)
        except Exception:
            # Best effort: markup that cannot be parsed yields no metadata.
            return {}

    head_nodes = doc.xpath('//head')
    if not head_nodes:
        return {}
    head = head_nodes[0]

    def first_stripped(query):
        # First XPath hit with surrounding whitespace removed, else None.
        hits = head.xpath(query)
        return hits[0].strip() if hits else None

    metadata = {
        'title': first_stripped('.//title/text()'),
        'description': first_stripped('.//meta[@name="description"]/@content'),
        'keywords': first_stripped('.//meta[@name="keywords"]/@content'),
        'author': first_stripped('.//meta[@name="author"]/@content'),
    }

    # Open Graph tags are keyed by their "property" attribute, Twitter Card
    # tags by "name"; both are stored under that attribute's own value.
    prefixed_groups = (
        ('.//meta[starts-with(@property, "og:")]', 'property'),
        ('.//meta[starts-with(@name, "twitter:")]', 'name'),
    )
    for query, key_attr in prefixed_groups:
        for tag in head.xpath(query):
            key = tag.get(key_attr, '').strip()
            value = tag.get('content', '').strip()
            # Skip tags with a blank key or blank content.
            if key and value:
                metadata[key] = value

    return metadata
def extract_metadata(html, soup=None):
"""
Extract optimized content, media, and links from website HTML.