Merge pull request #1381 from unclecode/fix/base-tag-link-resolution

fix: Implement base tag support in link extraction (#1147)
This commit is contained in:
Nasrin
2025-08-11 18:32:32 +08:00
committed by GitHub
2 changed files with 21 additions and 0 deletions

View File

@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
exclude_domains = set(kwargs.get("exclude_domains", []))
# Process links
try:
base_element = element.xpath("//head/base[@href]")
if base_element:
base_href = base_element[0].get("href", "").strip()
if base_href:
url = base_href
except Exception as e:
self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
pass
for link in element.xpath(".//a[@href]"):
href = link.get("href", "").strip()
if not href: