From 6740e87b4d24e5e5904a8100419f3b1e0eed501a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:41:31 +0530 Subject: [PATCH] fix: remove trailing slash when the path is empty. This is causing duplicate crawls --- crawl4ai/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index acaf7933..5b8af794 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2002,7 +2002,7 @@ def normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, netloc, - parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.path.rstrip('/'), # Normalize trailing slash parsed.params, query, fragment @@ -2030,7 +2030,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, parsed.netloc.lower(), - parsed.path, + parsed.path.rstrip('/'), parsed.params, parsed.query, '' # Remove fragment