#1105: fix(metadata): optimize article metadata extraction using XPath for improved performance

2025-05-19 13:48:02 +08:00
parent faa98eefbc
commit 137ac014fb
1 changed file with 7 additions and 6 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1456,12 +1456,13 @@ def extract_metadata_using_lxml(html, doc=None):
        content = tag.get("content", "").strip()
        if property_name and content:
            metadata[property_name] = content
-        # getting the article Values
+   # Article metadata - using starts-with() for performance
-    metadata.update({
+    article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
-        tag['property'].strip():tag["content"].strip()
+    for tag in article_tags:
-        for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
+        property_name = tag.get("property", "").strip()
-          if tag.has_attr('property') and tag.has_attr('content')
+        content = tag.get("content", "").strip()
-    })
+        if property_name and content:
            metadata[property_name] = content
    return metadata