fix: Improve comments for article metadata extraction in the extract_metadata and extract_metadata_using_lxml functions. ref #1105

This commit is contained in:
ntohidi
2025-07-08 12:54:33 +02:00
parent a3d41c7951
commit 36429a63de

View File

@@ -1547,7 +1547,8 @@ def extract_metadata_using_lxml(html, doc=None):
content = tag.get("content", "").strip() content = tag.get("content", "").strip()
if property_name and content: if property_name and content:
metadata[property_name] = content metadata[property_name] = content
# Article metadata - using starts-with() for performance
# Article metadata
article_tags = head.xpath('.//meta[starts-with(@property, "article:")]') article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
for tag in article_tags: for tag in article_tags:
property_name = tag.get("property", "").strip() property_name = tag.get("property", "").strip()
@@ -1629,12 +1630,15 @@ def extract_metadata(html, soup=None):
content = tag.get("content", "").strip() content = tag.get("content", "").strip()
if property_name and content: if property_name and content:
metadata[property_name] = content metadata[property_name] = content
# getting the article Values
metadata.update({ # Article metadata
tag['property'].strip():tag["content"].strip() article_tags = head.find_all("meta", attrs={"property": re.compile(r"^article:")})
for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) for tag in article_tags:
if tag.has_attr('property') and tag.has_attr('content') property_name = tag.get("property", "").strip()
}) content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
return metadata return metadata