refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
This commit is contained in:
UncleCode
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions

View File

@@ -80,7 +80,7 @@ async def stream_vs_nonstream():
base_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
verbose=False,
)
async with AsyncWebCrawler() as crawler:
@@ -212,11 +212,11 @@ async def filters_and_scorers():
# Create a keyword relevance scorer
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
)
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1, include_external=False, url_scorer=keyword_scorer
),
scraping_strategy=LXMLWebScrapingStrategy(),
@@ -373,6 +373,104 @@ async def advanced_filters():
# Main function to run the entire tutorial
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
    This function shows:
    - How to limit the number of pages crawled
    - How to set score thresholds for more targeted crawling
    - Comparing BFS, DFS, and Best-First strategies with these parameters
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
    # Local import keeps the DFS strategy scoped to this demo section only.
    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print(" Limit the crawler to a maximum of 5 pages")
        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=5  # Hard upper bound on total pages crawled
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
        # max_pages is a cap, not an exact count — report the actual number crawled.
        print(f" ✅ Crawled {len(results)} pages (capped at max_pages=5)")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        # Message matches the configured threshold below (was inconsistently "0.5").
        print(" Only crawl pages with a relevance score above 0.7")
        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=0.7,  # Only process URLs with scores above 0.7
                max_pages=10
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
        print(f" ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # Note: BestFirst does not take score_threshold — it already visits
        # the highest-scoring URLs first, so max_pages alone bounds the crawl.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print(" Limit to 7 pages, prioritizing the highest-scoring pages first")
        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=7,  # Limit to 7 pages total
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawled {len(results)} highest-scoring pages (capped at max_pages=7)")
        if results:
            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
            print(f" ✅ Average score: {avg_score:.2f}")
            print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
async def run_tutorial():
"""
Executes all tutorial sections in sequence.
@@ -384,9 +482,10 @@ async def run_tutorial():
# Define sections - uncomment to run specific parts during development
tutorial_sections = [
basic_deep_crawl,
stream_vs_nonstream,
filters_and_scorers,
# basic_deep_crawl,
# stream_vs_nonstream,
# filters_and_scorers,
max_pages_and_thresholds, # Added new section
wrap_up,
advanced_filters,
]

View File

@@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
strategy = BFSDeepCrawlStrategy(
max_depth=2, # Crawl initial page + 2 levels deep
include_external=False, # Stay within the same domain
max_pages=50, # Maximum number of pages to crawl (optional)
score_threshold=0.3, # Minimum score for URLs to be crawled (optional)
)
```
**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs
### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
@@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
strategy = DFSDeepCrawlStrategy(
max_depth=2, # Crawl initial page + 2 levels deep
include_external=False, # Stay within the same domain
max_pages=30, # Maximum number of pages to crawl (optional)
score_threshold=0.5, # Minimum score for URLs to be crawled (optional)
)
```
**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs
### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)
@@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer(
strategy = BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
url_scorer=scorer
url_scorer=scorer,
max_pages=25, # Maximum number of pages to crawl (optional)
)
```
@@ -124,6 +137,8 @@ This crawling approach:
- Evaluates each discovered URL based on scorer criteria
- Visits higher-scoring pages first
- Helps focus crawl resources on the most relevant content
- Can limit total pages crawled with `max_pages`
- Does not need `score_threshold` as it naturally prioritizes by score
---
@@ -410,27 +425,64 @@ if __name__ == "__main__":
---
## 8. Common Pitfalls & Tips
## 8. Limiting and Controlling Crawl Size
1. **Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size.
### 8.1 Using max_pages
You can limit the total number of pages crawled with the `max_pages` parameter:
```python
# Limit to exactly 20 pages regardless of depth
strategy = BFSDeepCrawlStrategy(
max_depth=3,
max_pages=20
)
```
This feature is useful for:
- Controlling API costs
- Setting predictable execution times
- Focusing on the most important content
- Testing crawl configurations before full execution
### 8.2 Using score_threshold
For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:
```python
# Only follow links with scores above 0.4
strategy = DFSDeepCrawlStrategy(
max_depth=2,
url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
score_threshold=0.4 # Skip URLs with scores below this value
)
```
Note that for `BestFirstCrawlingStrategy`, `score_threshold` is not needed since pages are already processed in order of highest score first.
## 9. Common Pitfalls & Tips
1. **Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.
2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
3. **Be a good web citizen.** Respect robots.txt. (disabled by default)
4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.status` when processing results.
4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results.
5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
---
## 9. Summary & Next Steps
## 10. Summary & Next Steps
In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy**
- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
- Process results in streaming or non-streaming mode
- Apply filters to target specific content
- Use scorers to prioritize the most relevant pages
- Limit crawls with `max_pages` and `score_threshold` parameters
- Build a complete advanced crawler with combined techniques
With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.