Fix can_process_url() to receive normalized URL in deep crawl strategies

Pass the normalized absolute URL instead of the raw href to
can_process_url() in the BFS, Best-First (BFF), and DFS deep crawl
strategies. This ensures URL validation and filter chain evaluation
operate on consistent, fully-qualified URLs.

Fixes #1743
This commit is contained in:
unclecode
2026-02-01 03:45:52 +00:00
parent ee717dc019
commit 43738c9ed2
3 changed files with 3 additions and 3 deletions

View File

@@ -177,7 +177,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
base_url = normalize_url_for_deep_crawl(url, source_url)
if base_url in visited:
continue
if not await self.can_process_url(url, new_depth):
if not await self.can_process_url(base_url, new_depth):
self.stats.urls_skipped += 1
continue

View File

@@ -170,7 +170,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
base_url = normalize_url_for_deep_crawl(url, source_url)
if base_url in visited:
continue
if not await self.can_process_url(url, next_depth):
if not await self.can_process_url(base_url, next_depth):
self.stats.urls_skipped += 1
continue

View File

@@ -300,7 +300,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
if not normalized_url or normalized_url in seen:
continue
if not await self.can_process_url(raw_url, next_depth):
if not await self.can_process_url(normalized_url, next_depth):
self.stats.urls_skipped += 1
continue