From e01d1e73e167bb89d6656f0bdda359555a1c0be0 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 17:34:13 +0530 Subject: [PATCH] fix: link normalisation in BestFirstStrategy --- crawl4ai/deep_crawling/bff_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..65d4e819 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -11,6 +11,7 @@ from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn +from ..utils import normalize_url_for_deep_crawl from math import inf as infinity @@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): valid_links = [] for link in links: url = link.get("href") - if url in visited: + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, new_depth): self.stats.urls_skipped += 1 continue - valid_links.append(url) + valid_links.append(base_url) # If we have more valid links than capacity, limit them if len(valid_links) > remaining_capacity: