Merge pull request #1381 from unclecode/fix/base-tag-link-resolution

fix: Implement base tag support in link extraction (#1147)
2025-08-11 18:32:32 +08:00
parent f0ce7b2710 18ad3ef159
commit 57c14db7cb
2 changed files with 21 additions and 0 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
        exclude_domains = set(kwargs.get("exclude_domains", []))
        # Process links
        try:
            base_element = element.xpath("//head/base[@href]")
            if base_element:
                base_href = base_element[0].get("href", "").strip()
                if base_href:
                    url = base_href
        except Exception as e:
            self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
            pass
        for link in element.xpath(".//a[@href]"):
            href = link.get("href", "").strip()
            if not href:
--- a/tests/async/test_content_extraction.py
+++ b/tests/async/test_content_extraction.py
@@ -91,6 +91,17 @@ async def test_css_selector_extraction():
        assert result.markdown
        assert all(heading in result.markdown for heading in ["#", "##", "###"])
@pytest.mark.asyncio
async def test_base_tag_link_extraction():
    """Crawl a live page and check the structure of the extracted links.

    The test fetches a fixed public URL and asserts that the crawl
    succeeded, that ``result.links`` is a non-empty dict containing both
    the ``"internal"`` and ``"external"`` keys, and that at least one
    external entry has an ``href`` containing ``"github.com"``.

    NOTE(review): going by the test name, the target page presumably
    declares a ``<base href>`` tag -- confirm against the live page. The
    test needs network access to that third-party site.
    """
    target_url = "https://sohamkukreti.github.io/portfolio"
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        result = await web_crawler.arun(url=target_url)

        # The crawl must succeed before the link data is inspected.
        assert result.success

        extracted = result.links
        assert extracted
        assert isinstance(extracted, dict)
        assert "internal" in extracted
        assert "external" in extracted

        # At least one outbound link must point at GitHub.
        assert any(
            "github.com" in entry["href"] for entry in extracted["external"]
        )
 # Entry point for debugging
 if __name__ == "__main__":