#1105: fix(metadata): optimize article metadata extraction using XPath for improved performance

2025-05-19 13:48:02 +08:00
parent faa98eefbc
commit 137ac014fb
1 changed file with 7 additions and 6 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1456,12 +1456,13 @@ def extract_metadata_using_lxml(html, doc=None):
        content = tag.get("content", "").strip()
        if property_name and content:
            metadata[property_name] = content
-        # getting the article Values
-    metadata.update({
-        tag['property'].strip():tag["content"].strip()
-        for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
-          if tag.has_attr('property') and tag.has_attr('content')
-    })
+    # Article metadata - XPath starts-with() replaces the regex-based find_all() for performance
+    article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
+    for tag in article_tags:
+        property_name = tag.get("property", "").strip()
+        content = tag.get("content", "").strip()
+        if property_name and content:
+            metadata[property_name] = content

    return metadata