fix: url normalisation ref: https://github.com/unclecode/crawl4ai/issues/841

2025-03-21 16:48:07 +05:30
parent f89113377a
commit 471d110c5e
1 changed files with 4 additions and 0 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1958,6 +1958,10 @@ def normalize_url(href, base_url):
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

+    # Always append a trailing slash when missing (no directory check is made) so urljoin keeps the last path segment
+    if not base_url.endswith('/'):
+        base_url = base_url + '/'
+
    # Use urljoin to handle all cases
    normalized = urljoin(base_url, href.strip())
    return normalized