fix: Add URLs to the 'visited' set when queueing them, instead of after dequeuing, to prevent duplicate crawls. https://github.com/unclecode/crawl4ai/issues/843

Aravind Karnam
2025-03-21 13:44:57 +05:30
parent 6740e87b4d
commit f89113377a


@@ -118,6 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 self.stats.urls_skipped += 1
                 continue
+            visited.add(base_url)
             valid_links.append((base_url, score))
         # If we have more valid links than capacity, sort by score and take the top ones
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
             # Clone the config to disable deep crawling recursion and enforce batch mode.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
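For context, a minimal standalone sketch of the dedup behavior this commit adopts (not the crawl4ai implementation; get_links and crawl are hypothetical callables). Marking a URL as visited at enqueue time guarantees it can only ever be queued once, even if several pages at the same depth link to it; marking it only when its level is processed, as before the fix, lets the same URL enter the next level multiple times and be crawled more than once.

from collections import deque

def bfs_crawl(start_url, get_links, crawl):
    # visited holds every URL that has ever been queued, not just
    # those already crawled, so duplicates are rejected up front.
    visited = {start_url}
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        crawl(url)
        for link in get_links(url):
            if link not in visited:
                visited.add(link)   # dedupe at enqueue time
                queue.append(link)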