fix: Implement base tag support in link extraction (#1147)

- Extract the base href from the <head><base> tag using XPath in the _process_element method.
- Use the base URL as the primary URL for link normalization when present.
- Add error handling with logging for malformed or problematic base tags.
- Maintain backward compatibility when no base tag is present.
- Add a test to verify the functionality of the base tag extraction.
2025-08-08 20:00:11 +05:30
parent a5bcac4c9d
commit 18ad3ef159
2 changed files with 21 additions and 0 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
        exclude_domains = set(kwargs.get("exclude_domains", []))

        # Process links
+        try:
+            base_element = element.xpath("//head/base[@href]")
+            if base_element:
+                base_href = base_element[0].get("href", "").strip()
+                if base_href:
+                    url = base_href
+        except Exception as e:
+            self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
+            pass
+
        for link in element.xpath(".//a[@href]"):
            href = link.get("href", "").strip()
            if not href: