refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled. Add score_threshold parameter to BFS/DFS strategies for quality control. Remove legacy parameter handling in AsyncWebCrawler. Improve error handling and logging in crawl strategies. BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -37,6 +37,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
            # Clone config to disable recursive deep crawling.
            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
            url_results = await crawler.arun_many(urls=[url], config=batch_config)
+            
            for result in url_results:
                result.metadata = result.metadata or {}
                result.metadata["depth"] = depth
@@ -44,13 +45,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                if self.url_scorer:
                    result.metadata["score"] = self.url_scorer.score(url)
                results.append(result)
-
-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, url, depth, visited, new_links, depths)
-                # Push new links in reverse order so the first discovered is processed next.
-                for new_url, new_parent in reversed(new_links):
-                    new_depth = depths.get(new_url, depth + 1)
-                    stack.append((new_url, new_parent, new_depth))
+                
+                # Count only successful crawls toward max_pages limit
+                if result.success:
+                    self._pages_crawled += 1
+                    
+                    # Only discover links from successful crawls
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, url, depth, visited, new_links, depths)
+                    
+                    # Push new links in reverse order so the first discovered is processed next.
+                    for new_url, new_parent in reversed(new_links):
+                        new_depth = depths.get(new_url, depth + 1)
+                        stack.append((new_url, new_parent, new_depth))
        return results

    async def _arun_stream(
@@ -83,8 +90,13 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    result.metadata["score"] = self.url_scorer.score(url)
                yield result

-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, url, depth, visited, new_links, depths)
-                for new_url, new_parent in reversed(new_links):
-                    new_depth = depths.get(new_url, depth + 1)
-                    stack.append((new_url, new_parent, new_depth))
+                # Only count successful crawls toward max_pages limit
+                # and only discover links from successful crawls
+                if result.success:
+                    self._pages_crawled += 1
+                    
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, url, depth, visited, new_links, depths)
+                    for new_url, new_parent in reversed(new_links):
+                        new_depth = depths.get(new_url, depth + 1)
+                        stack.append((new_url, new_parent, new_depth))