refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled. Add score_threshold parameter to BFS/DFS strategies for quality control. Remove legacy parameter handling in AsyncWebCrawler. Improve error handling and logging in crawl strategies. BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -224,22 +224,22 @@ class AsyncWebCrawler:
        url: str,
        config: CrawlerRunConfig = None,
        # Legacy parameters maintained for backwards compatibility
-        word_count_threshold=MIN_WORD_THRESHOLD,
-        extraction_strategy: ExtractionStrategy = None,
-        chunking_strategy: ChunkingStrategy = RegexChunking(),
-        content_filter: RelevantContentFilter = None,
-        cache_mode: Optional[CacheMode] = None,
+        # word_count_threshold=MIN_WORD_THRESHOLD,
+        # extraction_strategy: ExtractionStrategy = None,
+        # chunking_strategy: ChunkingStrategy = RegexChunking(),
+        # content_filter: RelevantContentFilter = None,
+        # cache_mode: Optional[CacheMode] = None,
        # Deprecated cache parameters
-        bypass_cache: bool = False,
-        disable_cache: bool = False,
-        no_cache_read: bool = False,
-        no_cache_write: bool = False,
+        # bypass_cache: bool = False,
+        # disable_cache: bool = False,
+        # no_cache_read: bool = False,
+        # no_cache_write: bool = False,
        # Other legacy parameters
-        css_selector: str = None,
-        screenshot: bool = False,
-        pdf: bool = False,
-        user_agent: str = None,
-        verbose=True,
+        # css_selector: str = None,
+        # screenshot: bool = False,
+        # pdf: bool = False,
+        # user_agent: str = None,
+        # verbose=True,
        **kwargs,
    ) -> RunManyReturn:
        """
@@ -276,39 +276,41 @@ class AsyncWebCrawler:

        async with self._lock or self.nullcontext():
            try:
+                self.logger.verbose = crawler_config.verbose
                # Handle configuration
                if crawler_config is not None:
                    config = crawler_config
                else:
                    # Merge all parameters into a single kwargs dict for config creation
-                    config_kwargs = {
-                        "word_count_threshold": word_count_threshold,
-                        "extraction_strategy": extraction_strategy,
-                        "chunking_strategy": chunking_strategy,
-                        "content_filter": content_filter,
-                        "cache_mode": cache_mode,
-                        "bypass_cache": bypass_cache,
-                        "disable_cache": disable_cache,
-                        "no_cache_read": no_cache_read,
-                        "no_cache_write": no_cache_write,
-                        "css_selector": css_selector,
-                        "screenshot": screenshot,
-                        "pdf": pdf,
-                        "verbose": verbose,
-                        **kwargs,
-                    }
-                    config = CrawlerRunConfig.from_kwargs(config_kwargs)
+                    # config_kwargs = {
+                    #     "word_count_threshold": word_count_threshold,
+                    #     "extraction_strategy": extraction_strategy,
+                    #     "chunking_strategy": chunking_strategy,
+                    #     "content_filter": content_filter,
+                    #     "cache_mode": cache_mode,
+                    #     "bypass_cache": bypass_cache,
+                    #     "disable_cache": disable_cache,
+                    #     "no_cache_read": no_cache_read,
+                    #     "no_cache_write": no_cache_write,
+                    #     "css_selector": css_selector,
+                    #     "screenshot": screenshot,
+                    #     "pdf": pdf,
+                    #     "verbose": verbose,
+                    #     **kwargs,
+                    # }
+                    # config = CrawlerRunConfig.from_kwargs(config_kwargs)
+                    pass

                # Handle deprecated cache parameters
-                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
-                    # Convert legacy parameters if cache_mode not provided
-                    if config.cache_mode is None:
-                        config.cache_mode = _legacy_to_cache_mode(
-                            disable_cache=disable_cache,
-                            bypass_cache=bypass_cache,
-                            no_cache_read=no_cache_read,
-                            no_cache_write=no_cache_write,
-                        )
+                # if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
+                #     # Convert legacy parameters if cache_mode not provided
+                #     if config.cache_mode is None:
+                #         config.cache_mode = _legacy_to_cache_mode(
+                #             disable_cache=disable_cache,
+                #             bypass_cache=bypass_cache,
+                #             no_cache_read=no_cache_read,
+                #             no_cache_write=no_cache_write,
+                #         )

                # Default to ENABLED if no cache mode specified
                if config.cache_mode is None:
@@ -344,7 +346,11 @@ class AsyncWebCrawler:
                    # If screenshot is requested but its not in cache, then set cache_result to None
                    screenshot_data = cached_result.screenshot
                    pdf_data = cached_result.pdf
-                    if config.screenshot and not screenshot or config.pdf and not pdf:
+                    # if config.screenshot and not screenshot or config.pdf and not pdf:
+                    if config.screenshot and not screenshot_data:
+                        cached_result = None
+                    
+                    if config.pdf and not pdf_data:
                        cached_result = None

                    self.logger.url_status(
@@ -358,12 +364,11 @@ class AsyncWebCrawler:
                if config and config.proxy_rotation_strategy:
                    next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
                    if next_proxy:
-                        if verbose:
-                            self.logger.info(
-                                message="Switch proxy: {proxy}",
-                                tag="PROXY",
-                                params={"proxy": next_proxy.server},
-                            )
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.server},
+                        )
                        config.proxy_config = next_proxy
                        # config = config.clone(proxy_config=next_proxy)

@@ -371,8 +376,8 @@ class AsyncWebCrawler:
                if not cached_result or not html:
                    t1 = time.perf_counter()

-                    if user_agent:
-                        self.crawler_strategy.update_user_agent(user_agent)
+                    if config.user_agent:
+                        self.crawler_strategy.update_user_agent(config.user_agent)

                    # Check robots.txt if enabled
                    if config and config.check_robots_txt:
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -37,15 +37,18 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        filter_chain: FilterChain = FilterChain(),
        url_scorer: Optional[URLScorer] = None,
        include_external: bool = False,
+        max_pages: int = float('inf'),
        logger: Optional[logging.Logger] = None,
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.include_external = include_external
+        self.max_pages = max_pages
        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
+        self._pages_crawled = 0

    async def can_process_url(self, url: str, depth: int) -> bool:
        """
@@ -86,12 +89,20 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        new_depth = current_depth + 1
        if new_depth > self.max_depth:
            return
+            
+        # If we've reached the max pages limit, don't discover new links
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return

        # Retrieve internal links; include external links if enabled.
        links = result.links.get("internal", [])
        if self.include_external:
            links += result.links.get("external", [])

+        # If we have more links than remaining capacity, limit how many we'll process
+        valid_links = []
        for link in links:
            url = link.get("href")
            if url in visited:
@@ -99,8 +110,16 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
            if not await self.can_process_url(url, new_depth):
                self.stats.urls_skipped += 1
                continue
-
-            # Record the new depth.
+                
+            valid_links.append(url)
+            
+        # If we have more valid links than capacity, limit them
+        if len(valid_links) > remaining_capacity:
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+            
+        # Record the new depths and add to next_links
+        for url in valid_links:
            depths[url] = new_depth
            next_links.append((url, source_url))

@@ -123,6 +142,11 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        depths: Dict[str, int] = {start_url: 0}

        while not queue.empty() and not self._cancel_event.is_set():
+            # Stop if we've reached the max pages limit
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+                
            batch: List[Tuple[float, int, str, Optional[str]]] = []
            # Retrieve up to BATCH_SIZE items from the priority queue.
            for _ in range(BATCH_SIZE):
@@ -153,14 +177,23 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                result.metadata["depth"] = depth
                result.metadata["parent_url"] = parent_url
                result.metadata["score"] = score
+                
+                # Count only successful crawls toward max_pages limit
+                if result.success:
+                    self._pages_crawled += 1
+                
                yield result
-                # Discover new links from this result.
-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, result_url, depth, visited, new_links, depths)
-                for new_url, new_parent in new_links:
-                    new_depth = depths.get(new_url, depth + 1)
-                    new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
-                    await queue.put((new_score, new_depth, new_url, new_parent))
+                
+                # Only discover links from successful crawls
+                if result.success:
+                    # Discover new links from this result
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, result_url, depth, visited, new_links, depths)
+                    
+                    for new_url, new_parent in new_links:
+                        new_depth = depths.get(new_url, depth + 1)
+                        new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
+                        await queue.put((new_score, new_depth, new_url, new_parent))

        # End of crawl.

--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -24,17 +24,22 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        self,
        max_depth: int,
        filter_chain: FilterChain = FilterChain(),
-        url_scorer: Optional[URLScorer] = None,
+        url_scorer: Optional[URLScorer] = None,        
        include_external: bool = False,
+        score_threshold: float = float('-inf'),
+        max_pages: int = float('inf'),
        logger: Optional[logging.Logger] = None,
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.include_external = include_external
+        self.score_threshold = score_threshold
+        self.max_pages = max_pages
        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
+        self._pages_crawled = 0

    async def can_process_url(self, url: str, depth: int) -> bool:
        """
@@ -72,16 +77,25 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        prepares the next level of URLs.
        Each valid URL is appended to next_level as a tuple (url, parent_url)
        and its depth is tracked.
-        """
+        """            
        next_depth = current_depth + 1
        if next_depth > self.max_depth:
            return

+        # If we've reached the max pages limit, don't discover new links
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return
+
        # Get internal links and, if enabled, external links.
        links = result.links.get("internal", [])
        if self.include_external:
            links += result.links.get("external", [])

+        valid_links = []
+        
+        # First collect all valid links
        for link in links:
            url = link.get("href")
            if url in visited:
@@ -90,10 +104,29 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                self.stats.urls_skipped += 1
                continue

-            # Score the URL if a scorer is provided. In this simple BFS
-            # the score is not used for ordering.
+            # Score the URL if a scorer is provided
            score = self.url_scorer.score(url) if self.url_scorer else 0
-            # attach the score to metadata if needed.
+            
+            # Skip URLs with scores below the threshold
+            if score < self.score_threshold:
+                self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
+                self.stats.urls_skipped += 1
+                continue
+            
+            valid_links.append((url, score))
+        
+        # If we have more valid links than capacity, sort by score and take the top ones
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                # Sort by score in descending order
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            # Take only as many as we have capacity for
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+            
+        # Process the final selected links
+        for url, score in valid_links:
+            # attach the score to metadata if needed
            if score:
                result.metadata = result.metadata or {}
                result.metadata["score"] = score
@@ -125,7 +158,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
            # Clone the config to disable deep crawling recursion and enforce batch mode.
            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
            batch_results = await crawler.arun_many(urls=urls, config=batch_config)
-
+            
+            # Update pages crawled counter - count only successful crawls
+            successful_results = [r for r in batch_results if r.success]
+            self._pages_crawled += len(successful_results)
+            
            for result in batch_results:
                url = result.url
                depth = depths.get(url, 0)
@@ -134,7 +171,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                parent_url = next((parent for (u, parent) in current_level if u == url), None)
                result.metadata["parent_url"] = parent_url
                results.append(result)
-                await self.link_discovery(result, url, depth, visited, next_level, depths)
+                
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)

            current_level = next_level

@@ -161,6 +202,9 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):

            stream_config = config.clone(deep_crawl_strategy=None, stream=True)
            stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
+            
+            # Keep track of processed results for this batch
+            results_count = 0
            async for result in stream_gen:
                url = result.url
                depth = depths.get(url, 0)
@@ -168,9 +212,24 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                result.metadata["depth"] = depth
                parent_url = next((parent for (u, parent) in current_level if u == url), None)
                result.metadata["parent_url"] = parent_url
+                
+                # Count only successful crawls
+                if result.success:
+                    self._pages_crawled += 1
+                
+                results_count += 1
                yield result
-                await self.link_discovery(result, url, depth, visited, next_level, depths)
-
+                
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)
+            
+            # If we didn't get results back (e.g. due to errors), avoid getting stuck in an infinite loop
+            # by considering these URLs as visited but not counting them toward the max_pages limit
+            if results_count == 0 and urls:
+                self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited")
+                
            current_level = next_level

    async def shutdown(self) -> None:
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -37,6 +37,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
            # Clone config to disable recursive deep crawling.
            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
            url_results = await crawler.arun_many(urls=[url], config=batch_config)
+            
            for result in url_results:
                result.metadata = result.metadata or {}
                result.metadata["depth"] = depth
@@ -44,13 +45,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                if self.url_scorer:
                    result.metadata["score"] = self.url_scorer.score(url)
                results.append(result)
-
-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, url, depth, visited, new_links, depths)
-                # Push new links in reverse order so the first discovered is processed next.
-                for new_url, new_parent in reversed(new_links):
-                    new_depth = depths.get(new_url, depth + 1)
-                    stack.append((new_url, new_parent, new_depth))
+                
+                # Count only successful crawls toward max_pages limit
+                if result.success:
+                    self._pages_crawled += 1
+                    
+                    # Only discover links from successful crawls
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, url, depth, visited, new_links, depths)
+                    
+                    # Push new links in reverse order so the first discovered is processed next.
+                    for new_url, new_parent in reversed(new_links):
+                        new_depth = depths.get(new_url, depth + 1)
+                        stack.append((new_url, new_parent, new_depth))
        return results

    async def _arun_stream(
@@ -83,8 +90,13 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    result.metadata["score"] = self.url_scorer.score(url)
                yield result

-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, url, depth, visited, new_links, depths)
-                for new_url, new_parent in reversed(new_links):
-                    new_depth = depths.get(new_url, depth + 1)
-                    stack.append((new_url, new_parent, new_depth))
+                # Only count successful crawls toward max_pages limit
+                # and only discover links from successful crawls
+                if result.success:
+                    self._pages_crawled += 1
+                    
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, url, depth, visited, new_links, depths)
+                    for new_url, new_parent in reversed(new_links):
+                        new_depth = depths.get(new_url, depth + 1)
+                        stack.append((new_url, new_parent, new_depth))