fix: Implement base tag support in link extraction (#1147)

- Extract the base href from the <head><base> tag using XPath in the _process_element method.
- Use the base URL as the primary URL for link normalization when present.
- Add error handling with logging for malformed or problematic base tags.
- Maintain backward compatibility when no base tag is present.
- Add a test to verify the functionality of the base tag extraction.
2025-08-08 20:00:11 +05:30
parent a5bcac4c9d
commit 18ad3ef159
2 changed files with 21 additions and 0 deletions
--- a/tests/async/test_content_extraction.py
+++ b/tests/async/test_content_extraction.py
@@ -91,6 +91,17 @@ async def test_css_selector_extraction():
        assert result.markdown
        assert all(heading in result.markdown for heading in ["#", "##", "###"])

+@pytest.mark.asyncio
+async def test_base_tag_link_extraction():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://sohamkukreti.github.io/portfolio"
+        result = await crawler.arun(url=url)
+        assert result.success
+        assert result.links
+        assert isinstance(result.links, dict)
+        assert "internal" in result.links
+        assert "external" in result.links
+        assert any("github.com" in x["href"] for x in result.links["external"])

 # Entry point for debugging
 if __name__ == "__main__":