From 36429a63ded80920e37d4925be33bd0d5582fda0 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 8 Jul 2025 12:54:33 +0200 Subject: [PATCH] fix: Improve comments for article metadata extraction in extract_metadata functions. ref #1105 --- crawl4ai/utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index e029a004..8735dee0 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1547,7 +1547,8 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - # Article metadata - using starts-with() for performance + + # Article metadata article_tags = head.xpath('.//meta[starts-with(@property, "article:")]') for tag in article_tags: property_name = tag.get("property", "").strip() @@ -1629,12 +1630,15 @@ def extract_metadata(html, soup=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - # getting the article Values - metadata.update({ - tag['property'].strip():tag["content"].strip() - for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) - if tag.has_attr('property') and tag.has_attr('content') - }) + + # Article metadata + article_tags = head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + for tag in article_tags: + property_name = tag.get("property", "").strip() + content = tag.get("content", "").strip() + if property_name and content: + metadata[property_name] = content + return metadata