fix(metadata): improve title extraction with fallbacks for edge cases. REF #995
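Some pages include a <title> tag in their HTML, but the XPath lookup under <head> fails to find it because of the page's unusual structure. Added fallback logic that first uses .find() on the whole document and then falls back to the OpenGraph/Twitter title meta tags to ensure reliable title extraction.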
This commit is contained in:
@@ -696,6 +696,13 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
success = True
|
success = True
|
||||||
try:
|
try:
|
||||||
|
# Extract metadata FIRST from the original HTML to avoid issues with modified content.
|
||||||
|
try:
|
||||||
|
meta = extract_metadata_using_lxml(html, None) # Pass the original HTML
|
||||||
|
except Exception as e:
|
||||||
|
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||||
|
meta = {}
|
||||||
|
|
||||||
doc = lhtml.document_fromstring(html)
|
doc = lhtml.document_fromstring(html)
|
||||||
# Match BeautifulSoup's behavior of using body or full doc
|
# Match BeautifulSoup's behavior of using body or full doc
|
||||||
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
|
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
|
||||||
@@ -736,14 +743,14 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
"error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
|
"error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract metadata before any content filtering
|
# # Extract metadata before any content filtering
|
||||||
try:
|
# try:
|
||||||
meta = extract_metadata_using_lxml(
|
# meta = extract_metadata_using_lxml(
|
||||||
"", doc
|
# "", doc
|
||||||
) # Using same function as BeautifulSoup version
|
# ) # Using same function as BeautifulSoup version
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
# self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||||
meta = {}
|
# meta = {}
|
||||||
|
|
||||||
content_element = None
|
content_element = None
|
||||||
if target_elements:
|
if target_elements:
|
||||||
|
|||||||
@@ -1487,8 +1487,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
|||||||
head = head[0]
|
head = head[0]
|
||||||
|
|
||||||
# Title - using XPath
|
# Title - using XPath
|
||||||
|
# title = head.xpath(".//title/text()")
|
||||||
|
# metadata["title"] = title[0].strip() if title else None
|
||||||
|
|
||||||
|
# === Title Extraction - New Approach ===
|
||||||
|
# Attempt to extract <title> using XPath
|
||||||
title = head.xpath(".//title/text()")
|
title = head.xpath(".//title/text()")
|
||||||
metadata["title"] = title[0].strip() if title else None
|
title = title[0] if title else None
|
||||||
|
|
||||||
|
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||||
|
if not title:
|
||||||
|
title_el = doc.find(".//title")
|
||||||
|
title = title_el.text if title_el is not None else None
|
||||||
|
|
||||||
|
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||||
|
if not title:
|
||||||
|
title_candidates = (
|
||||||
|
doc.xpath("//meta[@property='og:title']/@content") or
|
||||||
|
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||||
|
)
|
||||||
|
title = title_candidates[0] if title_candidates else None
|
||||||
|
|
||||||
|
# Strip and assign title
|
||||||
|
metadata["title"] = title.strip() if title else None
|
||||||
|
|
||||||
# Meta description - using XPath with multiple attribute conditions
|
# Meta description - using XPath with multiple attribute conditions
|
||||||
description = head.xpath('.//meta[@name="description"]/@content')
|
description = head.xpath('.//meta[@name="description"]/@content')
|
||||||
|
|||||||
Reference in New Issue
Block a user