fix(crawler): ensure max_pages limit is respected during batch processing in crawling strategies
This commit is contained in:
@@ -148,6 +148,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
# Calculate how many more URLs we can process in this batch
|
||||
remaining = self.max_pages - self._pages_crawled
|
||||
batch_size = min(BATCH_SIZE, remaining)
|
||||
if batch_size <= 0:
|
||||
# No more pages to crawl
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
batch: List[Tuple[float, int, str, Optional[str]]] = []
|
||||
# Retrieve up to BATCH_SIZE items from the priority queue.
|
||||
for _ in range(BATCH_SIZE):
|
||||
@@ -182,6 +190,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
yield result
|
||||
|
||||
|
||||
@@ -156,6 +156,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
results: List[CrawlResult] = []
|
||||
|
||||
while current_level and not self._cancel_event.is_set():
|
||||
# Check if we've already reached max_pages before starting a new level
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
next_level: List[Tuple[str, Optional[str]]] = []
|
||||
urls = [url for url, _ in current_level]
|
||||
visited.update(urls)
|
||||
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
results_count += 1
|
||||
yield result
|
||||
|
||||
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
# Only discover links from successful crawls
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# and only discover links from successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
await self.link_discovery(result, url, depth, visited, new_links, depths)
|
||||
|
||||
Reference in New Issue
Block a user