Fix can_process_url() to receive normalized URL in deep crawl strategies

Pass the normalized absolute URL, instead of the raw href, to can_process_url() in the BFS, best-first (BFF), and DFS deep crawl strategies. This ensures that URL validation and filter-chain evaluation operate on consistent, fully qualified URLs. Fixes #1743.
2026-02-01 03:45:52 +00:00
parent ee717dc019
commit 43738c9ed2
3 changed files with 3 additions and 3 deletions
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -177,7 +177,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
            base_url = normalize_url_for_deep_crawl(url, source_url)
            if base_url in visited:
                continue
-            if not await self.can_process_url(url, new_depth):
+            if not await self.can_process_url(base_url, new_depth):
                self.stats.urls_skipped += 1
                continue
                
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -170,7 +170,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
            base_url = normalize_url_for_deep_crawl(url, source_url)
            if base_url in visited:
                continue
-            if not await self.can_process_url(url, next_depth):
+            if not await self.can_process_url(base_url, next_depth):
                self.stats.urls_skipped += 1
                continue

--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -300,7 +300,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
            if not normalized_url or normalized_url in seen:
                continue

-            if not await self.can_process_url(raw_url, next_depth):
+            if not await self.can_process_url(normalized_url, next_depth):
                self.stats.urls_skipped += 1
                continue