From f89113377aa2e7ac40023976e63cb2d1d9a93255 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Fri, 21 Mar 2025 13:44:57 +0530
Subject: [PATCH] fix: Add URLs to the 'visited' set when queueing them instead
 of after dequeuing, to prevent duplicate crawls

https://github.com/unclecode/crawl4ai/issues/843
---
 crawl4ai/deep_crawling/bfs_strategy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py
index 54b72ea3..48c116dd 100644
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
                 self.stats.urls_skipped += 1
                 continue
-
+
+            visited.add(base_url)
             valid_links.append((base_url, score))
 
         # If we have more valid links than capacity, sort by score and take the top ones
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
 
             # Clone the config to disable deep crawling recursion and enforce batch mode.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
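
Note: the sketch below is a minimal, self-contained illustration of the BFS
dedup pattern this patch adopts; it is not the crawl4ai implementation, and
the `fetch_links` callable is a hypothetical stand-in for the real link
discovery. The key point is that membership in `visited` is decided before a
URL enters the queue, so a URL linked from several pages on the same level
can only ever be queued once.

# Minimal sketch (assumed names, not crawl4ai's API) of enqueue-time
# deduplication in a BFS crawl. Marking URLs as visited only after
# dequeuing would let the same URL be queued twice before either copy
# is processed, producing duplicate crawls.
from collections import deque
from typing import Callable, Dict, List, Set


def bfs_crawl(start_url: str, fetch_links: Callable[[str], List[str]]) -> List[str]:
    visited: Set[str] = {start_url}  # mark at enqueue time, not dequeue time
    queue: deque = deque([start_url])
    order: List[str] = []

    while queue:
        url = queue.popleft()
        order.append(url)  # stand-in for actually crawling the page
        for link in fetch_links(url):
            if link not in visited:
                visited.add(link)   # dedup decided here, before enqueueing
                queue.append(link)  # so a URL can never sit in the queue twice
    return order


if __name__ == "__main__":
    # "a" and "b" both link to "c": with dequeue-time marking, "c" would be
    # enqueued twice and crawled twice; with enqueue-time marking it is not.
    graph: Dict[str, List[str]] = {"a": ["b", "c"], "b": ["c"], "c": []}
    print(bfs_crawl("a", lambda u: graph.get(u, [])))  # ['a', 'b', 'c']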