feat(scraping): add LXML-based scraping mode for improved performance

Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

BREAKING CHANGE: None
2025-01-12 20:46:23 +08:00
parent 825c78a048
commit f3ae5a657c
12 changed files with 1366 additions and 509 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -868,6 +868,63 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
        'metadata': meta
    }

+def extract_metadata_using_lxml(html, doc=None):
+    """
+    Collect page metadata from the document <head> using lxml XPath queries.
+
+    Args:
+        html: Raw HTML markup; parsed only when ``doc`` is not supplied.
+        doc: Optional pre-parsed lxml tree, reused to skip re-parsing.
+
+    Returns:
+        dict: ``title``, ``description``, ``keywords`` and ``author`` (each
+        ``None`` when absent) plus every non-empty ``og:*`` / ``twitter:*``
+        tag keyed by its own name. Empty when there is no input, parsing
+        fails, or the document has no <head>.
+    """
+    # Parse only when the caller did not hand over a ready-made tree.
+    if doc is None:
+        if not html:
+            return {}
+        try:
+            doc = lhtml.document_fromstring(html)
+        except Exception:
+            # Best effort: unparseable markup yields no metadata.
+            return {}
+
+    # All lookups below are scoped to the first <head> element.
+    heads = doc.xpath('//head')
+    if not heads:
+        return {}
+    head_el = heads[0]
+
+    def first_text(query):
+        # First XPath hit with surrounding whitespace removed, else None.
+        hits = head_el.xpath(query)
+        return hits[0].strip() if hits else None
+
+    found = {
+        'title': first_text('.//title/text()'),
+        'description': first_text('.//meta[@name="description"]/@content'),
+        'keywords': first_text('.//meta[@name="keywords"]/@content'),
+        'author': first_text('.//meta[@name="author"]/@content'),
+    }
+
+    # Open Graph tags are keyed by "property", Twitter Card tags by "name".
+    prefixed = (
+        ('property', './/meta[starts-with(@property, "og:")]'),
+        ('name', './/meta[starts-with(@name, "twitter:")]'),
+    )
+    for attr, query in prefixed:
+        for node in head_el.xpath(query):
+            key = node.get(attr, '').strip()
+            value = node.get('content', '').strip()
+            if key and value:
+                found[key] = value
+
+    return found
+
+
 def extract_metadata(html, soup=None):
    """
    Extract optimized content, media, and links from website HTML.