fix: url normalisation ref: https://github.com/unclecode/crawl4ai/issues/841
This commit is contained in:
@@ -1958,6 +1958,10 @@ def normalize_url(href, base_url):
|
|||||||
if not parsed_base.scheme or not parsed_base.netloc:
|
if not parsed_base.scheme or not parsed_base.netloc:
|
||||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||||
|
|
||||||
|
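# Append a trailing slash to base_url whenever it is missing, so urljoin resolves relative hrefs beneath the full base path (note: applied unconditionally, not only to directory paths)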
|
||||||
|
if not base_url.endswith('/'):
|
||||||
|
base_url = base_url + '/'
|
||||||
|
|
||||||
# Use urljoin to handle all cases
|
# Use urljoin to handle all cases
|
||||||
normalized = urljoin(base_url, href.strip())
|
normalized = urljoin(base_url, href.strip())
|
||||||
return normalized
|
return normalized
|
||||||
|
|||||||
Reference in New Issue
Block a user