fix(metadata): improve title extraction with fallbacks for edge cases. REF #995

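Some pages include a <title> tag in their HTML, but the lxml XPath lookup fails to extract it because of the page's unusual or malformed structure.
Added fallback logic that first tries .find() and then the OpenGraph/Twitter meta tags (og:title, twitter:title) to ensure reliable title extraction.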
This commit is contained in:
ntohidi
2025-05-28 10:17:50 +02:00
parent bfec5156ad
commit 2b3b728dcd
2 changed files with 37 additions and 9 deletions

View File

@@ -696,6 +696,13 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
success = True success = True
try: try:
# Extract metadata FIRST from the original HTML to avoid issues with modified content.
try:
meta = extract_metadata_using_lxml(html, None) # Pass the original HTML
except Exception as e:
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
meta = {}
doc = lhtml.document_fromstring(html) doc = lhtml.document_fromstring(html)
# Match BeautifulSoup's behavior of using body or full doc # Match BeautifulSoup's behavior of using body or full doc
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
@@ -736,14 +743,14 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
"error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE" "error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
) )
# Extract metadata before any content filtering # # Extract metadata before any content filtering
try: # try:
meta = extract_metadata_using_lxml( # meta = extract_metadata_using_lxml(
"", doc # "", doc
) # Using same function as BeautifulSoup version # ) # Using same function as BeautifulSoup version
except Exception as e: # except Exception as e:
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") # self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
meta = {} # meta = {}
content_element = None content_element = None
if target_elements: if target_elements:

View File

@@ -1487,8 +1487,29 @@ def extract_metadata_using_lxml(html, doc=None):
head = head[0] head = head[0]
# Title - using XPath # Title - using XPath
# title = head.xpath(".//title/text()")
# metadata["title"] = title[0].strip() if title else None
# === Title Extraction - New Approach ===
# Attempt to extract <title> using XPath
title = head.xpath(".//title/text()") title = head.xpath(".//title/text()")
metadata["title"] = title[0].strip() if title else None title = title[0] if title else None
# Fallback: Use .find() in case XPath fails due to malformed HTML
if not title:
title_el = doc.find(".//title")
title = title_el.text if title_el is not None else None
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
if not title:
title_candidates = (
doc.xpath("//meta[@property='og:title']/@content") or
doc.xpath("//meta[@name='twitter:title']/@content")
)
title = title_candidates[0] if title_candidates else None
# Strip and assign title
metadata["title"] = title.strip() if title else None
# Meta description - using XPath with multiple attribute conditions # Meta description - using XPath with multiple attribute conditions
description = head.xpath('.//meta[@name="description"]/@content') description = head.xpath('.//meta[@name="description"]/@content')