From 18ad3ef1599a4c7565692a8368b16fb2789de4a6 Mon Sep 17 00:00:00 2001 From: Soham Kukreti Date: Fri, 8 Aug 2025 20:00:11 +0530 Subject: [PATCH] fix: Implement base tag support in link extraction (#1147) - Extract base href from <base> tag using XPath in _process_element method - Use base URL as the primary URL for link normalization when present - Add error handling with logging for malformed or problematic base tags - Maintain backward compatibility when no base tag is present - Add test to verify the functionality of the base tag extraction. --- crawl4ai/content_scraping_strategy.py | 10 ++++++++++ tests/async/test_content_extraction.py | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index e13ffa5e..81c8a41f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): exclude_domains = set(kwargs.get("exclude_domains", [])) # Process links + try: + base_element = element.xpath("//head/base[@href]") + if base_element: + base_href = base_element[0].get("href", "").strip() + if base_href: + url = base_href + except Exception as e: + self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE") + pass + for link in element.xpath(".//a[@href]"): href = link.get("href", "").strip() if not href: diff --git a/tests/async/test_content_extraction.py b/tests/async/test_content_extraction.py index 9372387a..509a387f 100644 --- a/tests/async/test_content_extraction.py +++ b/tests/async/test_content_extraction.py @@ -91,6 +91,17 @@ async def test_css_selector_extraction(): assert result.markdown assert all(heading in result.markdown for heading in ["#", "##", "###"]) +@pytest.mark.asyncio +async def test_base_tag_link_extraction(): + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://sohamkukreti.github.io/portfolio" + result = await crawler.arun(url=url) 
+ assert result.success + assert result.links + assert isinstance(result.links, dict) + assert "internal" in result.links + assert "external" in result.links + assert any("github.com" in x["href"] for x in result.links["external"]) # Entry point for debugging if __name__ == "__main__":