From 2b3b728dcdbe8ce633c22ecf9e3d017f6ba41e23 Mon Sep 17 00:00:00 2001
From: ntohidi <nasrin@kidocode.com>
Date: Wed, 28 May 2025 10:17:50 +0200
Subject: [PATCH] fix(metadata): improve title extraction with fallbacks for
 edge cases. REF #995

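Some pages include a <title> tag in the HTML, but the XPath lookup under <head> fails to find it because of the page's unusual structure.
Added fallback logic using .find() on the whole document and OpenGraph/Twitter meta tags to ensure reliable title extraction.
Metadata is now also extracted from the original HTML up front, before the scraping strategy modifies the parsed document.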
---
 crawl4ai/content_scraping_strategy.py | 23 +++++++++++++++--------
 crawl4ai/utils.py                     | 23 ++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 3510f64f..b5ece9ba 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -696,6 +696,13 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
 
         success = True
         try:
+            # Extract metadata FIRST from the original HTML to avoid issues with modified content.
+            try:
+                meta = extract_metadata_using_lxml(html, None)  # Pass the original HTML
+            except Exception as e:
+                self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
+                meta = {}
+                
             doc = lhtml.document_fromstring(html)
             # Match BeautifulSoup's behavior of using body or full doc
             # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
@@ -736,14 +743,14 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                         "error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
                     )
 
-            # Extract metadata before any content filtering
-            try:
-                meta = extract_metadata_using_lxml(
-                    "", doc
-                )  # Using same function as BeautifulSoup version
-            except Exception as e:
-                self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
-                meta = {}
+            # # Extract metadata before any content filtering
+            # try:
+            #     meta = extract_metadata_using_lxml(
+            #         "", doc
+            #     )  # Using same function as BeautifulSoup version
+            # except Exception as e:
+            #     self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
+            #     meta = {}
 
             content_element = None
             if target_elements:
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index d8b366d9..ef5df62f 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1487,8 +1487,29 @@ def extract_metadata_using_lxml(html, doc=None):
     head = head[0]
 
     # Title - using XPath
+    # title = head.xpath(".//title/text()")
+    # metadata["title"] = title[0].strip() if title else None
+
+    # === Title Extraction - New Approach ===
+    # Attempt to extract <title> using XPath
     title = head.xpath(".//title/text()")
-    metadata["title"] = title[0].strip() if title else None
+    title = title[0] if title else None
+
+    # Fallback: search the whole document with .find(), since malformed HTML can leave <title> outside <head>
+    if not title:
+        title_el = doc.find(".//title")
+        title = title_el.text if title_el is not None else None
+
+    # Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
+    if not title:
+        title_candidates = (
+            doc.xpath("//meta[@property='og:title']/@content") or
+            doc.xpath("//meta[@name='twitter:title']/@content")
+        )
+        title = title_candidates[0] if title_candidates else None
+
+    # Strip and assign title
+    metadata["title"] = title.strip() if title else None
 
     # Meta description - using XPath with multiple attribute conditions
     description = head.xpath('.//meta[@name="description"]/@content')