Merge pull request #1381 from unclecode/fix/base-tag-link-resolution
fix: Implement base tag support in link extraction (#1147)
This commit is contained in:
@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
exclude_domains = set(kwargs.get("exclude_domains", []))
|
exclude_domains = set(kwargs.get("exclude_domains", []))
|
||||||
|
|
||||||
# Process links
|
# Process links
|
||||||
|
try:
|
||||||
|
base_element = element.xpath("//head/base[@href]")
|
||||||
|
if base_element:
|
||||||
|
base_href = base_element[0].get("href", "").strip()
|
||||||
|
if base_href:
|
||||||
|
url = base_href
|
||||||
|
except Exception as e:
|
||||||
|
self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
|
||||||
|
pass
|
||||||
|
|
||||||
for link in element.xpath(".//a[@href]"):
|
for link in element.xpath(".//a[@href]"):
|
||||||
href = link.get("href", "").strip()
|
href = link.get("href", "").strip()
|
||||||
if not href:
|
if not href:
|
||||||
|
|||||||
@@ -91,6 +91,17 @@ async def test_css_selector_extraction():
|
|||||||
assert result.markdown
|
assert result.markdown
|
||||||
assert all(heading in result.markdown for heading in ["#", "##", "###"])
|
assert all(heading in result.markdown for heading in ["#", "##", "###"])
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_base_tag_link_extraction():
    """Crawl a live page and check that its links are extracted and classified.

    The test only asserts what is observable on the crawl result: the crawl
    succeeded, ``result.links`` is a non-empty dict with ``"internal"`` and
    ``"external"`` buckets, and at least one external link's ``href`` contains
    ``github.com``.

    NOTE(review): presumably the target page declares a ``<base href>`` tag,
    which is what this test is meant to exercise -- confirm against the page.
    The test requires network access to the target site.
    """
    target_url = "https://sohamkukreti.github.io/portfolio"

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url)

        assert result.success

        links = result.links
        assert links
        assert isinstance(links, dict)

        # Both classification buckets must be present on the result.
        assert "internal" in links
        assert "external" in links

        # At least one external link should point at GitHub.
        external_hrefs = (entry["href"] for entry in links["external"])
        assert any("github.com" in href for href in external_hrefs)
|
||||||
|
|
||||||
# Entry point for debugging
|
# Entry point for debugging
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user