From 18ad3ef1599a4c7565692a8368b16fb2789de4a6 Mon Sep 17 00:00:00 2001
From: Soham Kukreti <kukretisoham@gmail.com>
Date: Fri, 8 Aug 2025 20:00:11 +0530
Subject: [PATCH] fix: Implement base tag support in link extraction (#1147) -
 Extract base href from <head><base> tag using XPath in _process_element
 method - Use base URL as the primary URL for link normalization when present
 - Add error handling with logging for malformed or problematic base tags -
 Maintain backward compatibility when no base tag is present - Add test to
 verify the functionality of the base tag extraction.

---
 crawl4ai/content_scraping_strategy.py  | 10 ++++++++++
 tests/async/test_content_extraction.py | 11 +++++++++++
 2 files changed, 21 insertions(+)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index e13ffa5e..81c8a41f 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
         exclude_domains = set(kwargs.get("exclude_domains", []))
 
         # Process links
+        try:
+            base_element = element.xpath("//head/base[@href]")
+            if base_element:
+                base_href = base_element[0].get("href", "").strip()
+                if base_href:
+                    url = base_href
+        except Exception as e:
+            self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
+            pass
+
         for link in element.xpath(".//a[@href]"):
             href = link.get("href", "").strip()
             if not href:
diff --git a/tests/async/test_content_extraction.py b/tests/async/test_content_extraction.py
index 9372387a..509a387f 100644
--- a/tests/async/test_content_extraction.py
+++ b/tests/async/test_content_extraction.py
@@ -91,6 +91,17 @@ async def test_css_selector_extraction():
         assert result.markdown
         assert all(heading in result.markdown for heading in ["#", "##", "###"])
 
+@pytest.mark.asyncio
+async def test_base_tag_link_extraction():
+    """Crawl a live page and check its links are extracted and bucketed."""
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcome = await crawler.arun(url="https://sohamkukreti.github.io/portfolio")
+        assert outcome.success
+        links = outcome.links
+        assert links
+        assert isinstance(links, dict)
+        assert all(bucket in links for bucket in ("internal", "external"))
+        assert any("github.com" in entry["href"] for entry in links["external"])
 
 # Entry point for debugging
 if __name__ == "__main__":