From 2b3b728dcdbe8ce633c22ecf9e3d017f6ba41e23 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 28 May 2025 10:17:50 +0200 Subject: [PATCH] fix(metadata): improve title extraction with fallbacks for edge cases. REF #995 Some pages include a <title> tag in HTML but lxml fails to parse it due to unusual structure. Added fallback logic using .find() and OpenGraph/Twitter meta tags to ensure reliable title extraction. --- crawl4ai/content_scraping_strategy.py | 23 +++++++++++++++-------- crawl4ai/utils.py | 23 ++++++++++++++++++++++- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3510f64f..b5ece9ba 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -696,6 +696,13 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): success = True try: + # Extract metadata FIRST from the original HTML to avoid issues with modified content. + try: + meta = extract_metadata_using_lxml(html, None) # Pass the original HTML + except Exception as e: + self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") + meta = {} + doc = lhtml.document_fromstring(html) # Match BeautifulSoup's behavior of using body or full doc # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc @@ -736,14 +743,14 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): "error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE" ) - # Extract metadata before any content filtering - try: - meta = extract_metadata_using_lxml( - "", doc - ) # Using same function as BeautifulSoup version - except Exception as e: - self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") - meta = {} + # # Extract metadata before any content filtering + # try: + # meta = extract_metadata_using_lxml( + # "", doc + # ) # Using same function as BeautifulSoup version + # except Exception as e: + # self._log("error", f"Error extracting metadata: {str(e)}", 
"SCRAPE") + # meta = {} content_element = None if target_elements: diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d8b366d9..ef5df62f 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1487,8 +1487,29 @@ def extract_metadata_using_lxml(html, doc=None): head = head[0] # Title - using XPath + # title = head.xpath(".//title/text()") + # metadata["title"] = title[0].strip() if title else None + + # === Title Extraction - New Approach === + # Attempt to extract <title> using XPath title = head.xpath(".//title/text()") - metadata["title"] = title[0].strip() if title else None + title = title[0] if title else None + + # Fallback: Use .find() in case XPath fails due to malformed HTML + if not title: + title_el = doc.find(".//title") + title = title_el.text if title_el is not None else None + + # Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty + if not title: + title_candidates = ( + doc.xpath("//meta[@property='og:title']/@content") or + doc.xpath("//meta[@name='twitter:title']/@content") + ) + title = title_candidates[0] if title_candidates else None + + # Strip and assign title + metadata["title"] = title.strip() if title else None # Meta description - using XPath with multiple attribute conditions description = head.xpath('.//meta[@name="description"]/@content')