fix: remove trailing slash when the path is empty. This is causing duplicate crawls
This commit is contained in:
@@ -2002,7 +2002,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
normalized = urlunparse((
|
normalized = urlunparse((
|
||||||
parsed.scheme,
|
parsed.scheme,
|
||||||
netloc,
|
netloc,
|
||||||
parsed.path.rstrip('/') or '/', # Normalize trailing slash
|
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||||
parsed.params,
|
parsed.params,
|
||||||
query,
|
query,
|
||||||
fragment
|
fragment
|
||||||
@@ -2030,7 +2030,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
|||||||
normalized = urlunparse((
|
normalized = urlunparse((
|
||||||
parsed.scheme,
|
parsed.scheme,
|
||||||
parsed.netloc.lower(),
|
parsed.netloc.lower(),
|
||||||
parsed.path,
|
parsed.path.rstrip('/'),
|
||||||
parsed.params,
|
parsed.params,
|
||||||
parsed.query,
|
parsed.query,
|
||||||
'' # Remove fragment
|
'' # Remove fragment
|
||||||
|
|||||||
Reference in New Issue
Block a user