From 40ab287c9087bf6701072558427d974510335bc0 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 22 Aug 2025 12:05:21 +0800 Subject: [PATCH] fix(utils): Improve URL normalization by avoiding quote/unquote to preserve '+' signs. ref #1332 --- crawl4ai/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 73f1d2a3..09e6e4b7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2184,8 +2184,10 @@ def normalize_url( netloc = parsed.netloc.lower() # ── path ── - # Strip duplicate slashes and trailing “/” (except root) - path = quote(unquote(parsed.path)) + # Strip duplicate slashes and trailing "/" (except root) + # IMPORTANT: Don't use quote(unquote()) here: quote() re-encodes a literal '+' as '%2B', changing the URL + # urlparse returns the path verbatim, so any existing percent-encoding is preserved as-is + path = parsed.path if path.endswith('/') and path != '/': path = path.rstrip('/')