Merge branch 'develop' of https://github.com/unclecode/crawl4ai into develop

This commit is contained in:
ntohidi
2025-08-12 12:22:25 +08:00
5 changed files with 362 additions and 7 deletions

View File

@@ -242,6 +242,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
exclude_domains = set(kwargs.get("exclude_domains", []))
# Process links
try:
base_element = element.xpath("//head/base[@href]")
if base_element:
base_href = base_element[0].get("href", "").strip()
if base_href:
url = base_href
except Exception as e:
self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE")
pass
for link in element.xpath(".//a[@href]"):
href = link.get("href", "").strip()
if not href: