fix(crawler): ensure max_pages limit is respected during batch processing in crawling strategies
@@ -148,6 +148,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
                 break
 
+            # Calculate how many more URLs we can process in this batch
+            remaining = self.max_pages - self._pages_crawled
+            batch_size = min(BATCH_SIZE, remaining)
+            if batch_size <= 0:
+                # No more pages to crawl
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             batch: List[Tuple[float, int, str, Optional[str]]] = []
             # Retrieve up to BATCH_SIZE items from the priority queue.
             for _ in range(BATCH_SIZE):
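The hunk above caps each batch by the remaining page budget rather than always pulling BATCH_SIZE items off the queue. Below is a minimal standalone sketch of that pattern, with a plain asyncio.PriorityQueue standing in for the strategy's internal queue; the name drain_in_batches is illustrative, not part of the codebase.

import asyncio
from typing import Any, AsyncIterator, List

BATCH_SIZE = 10  # mirrors the constant referenced in the diff

async def drain_in_batches(
    queue: "asyncio.PriorityQueue[Any]",
    max_pages: int,
) -> AsyncIterator[List[Any]]:
    pages_crawled = 0
    while not queue.empty():
        # Cap the batch by how many pages the budget still allows.
        remaining = max_pages - pages_crawled
        batch_size = min(BATCH_SIZE, remaining)
        if batch_size <= 0:
            break  # budget exhausted before this batch started
        batch: List[Any] = []
        for _ in range(batch_size):
            if queue.empty():
                break
            batch.append(queue.get_nowait())
        pages_crawled += len(batch)  # stand-in for counting successful crawls
        yield batch

One difference worth noting: the sketch bounds the retrieval loop with the computed batch_size, while the hunk above leaves the "for _ in range(BATCH_SIZE):" context line unchanged and relies on the mid-batch limit check in the next hunk to stop early.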
@@ -182,6 +190,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 # Count only successful crawls toward max_pages limit
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator
 
                 yield result
 
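This hunk makes only successful results count toward max_pages and exits the generator as soon as the counter reaches the limit. A self-contained sketch of that control flow, where Result is a stand-in for the CrawlResult type used in the diff:

from dataclasses import dataclass
from typing import Iterable, Iterator

@dataclass
class Result:  # stand-in for the real CrawlResult
    url: str
    success: bool

def yield_within_budget(results: Iterable[Result], max_pages: int) -> Iterator[Result]:
    pages_crawled = 0
    for result in results:
        if result.success:
            pages_crawled += 1
            if pages_crawled >= max_pages:
                # Mirrors the diff: break comes before yield, so the
                # limit-reaching result is counted but not emitted.
                break
        yield result  # failed results pass through without consuming budget

As in the diff, the break precedes "yield result", so the page that reaches the limit is counted toward the budget but never yielded; failed results are yielded without consuming budget.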
@@ -156,6 +156,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         results: List[CrawlResult] = []
 
         while current_level and not self._cancel_event.is_set():
+            # Check if we've already reached max_pages before starting a new level
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
             visited.update(urls)
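In the BFS strategy the guard runs before each new level is expanded, so no new level starts once the budget is spent. A compact sketch of that placement, where expand is a hypothetical callback returning a page's out-links:

from typing import Callable, Iterable, List

def crawl_by_levels(
    seeds: List[str],
    expand: Callable[[str], Iterable[str]],
    max_pages: int,
) -> List[str]:
    crawled: List[str] = []
    current_level: List[str] = list(seeds)
    while current_level:
        # Budget check before starting a new level, as in the hunk above.
        if len(crawled) >= max_pages:
            break
        next_level: List[str] = []
        for url in current_level:
            crawled.append(url)
            next_level.extend(expand(url))
        current_level = next_level
    return crawled

On its own this guard can overshoot inside a level; the per-result check added in the next hunk is what stops that overshoot.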
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 # Count only successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator
 
                 results_count += 1
                 yield result
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
             # Count only successful crawls toward max_pages limit
             if result.success:
                 self._pages_crawled += 1
+                # Check if we've reached the limit during batch processing
+                if self._pages_crawled >= self.max_pages:
+                    self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                    break  # Exit the generator
 
             # Only discover links from successful crawls
             new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # and only discover links from successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator
 
                     new_links: List[Tuple[str, Optional[str]]] = []
                     await self.link_discovery(result, url, depth, visited, new_links, depths)
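Finally, a hypothetical end-to-end check of the behavior this commit fixes, assuming these strategies belong to crawl4ai's deep-crawl API (the class names match); treat the import paths and constructor parameters as assumptions to verify against your installed version:

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

async def main() -> None:
    # Deep-crawl two levels from the seed, but never more than 5 pages.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, max_pages=5),
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://example.com", config=config)
        successes = [r for r in results if r.success]
        # With this commit, successful pages should never exceed max_pages.
        print(f"{len(successes)} successful pages (limit was 5)")

asyncio.run(main())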