From 471d110c5e496a1334422ee177e95cf1675ad37b Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 16:48:07 +0530 Subject: [PATCH] fix: url normalisation ref: https://github.com/unclecode/crawl4ai/issues/841 --- crawl4ai/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 5b8af794..fe725317 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1958,6 +1958,10 @@ def normalize_url(href, base_url): if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") + # Ensure base_url ends with a trailing slash if it's a directory path + if not base_url.endswith('/'): + base_url = base_url + '/' + # Use urljoin to handle all cases normalized = urljoin(base_url, href.strip()) return normalized