refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled. Add score_threshold parameter to BFS/DFS strategies for quality control. Remove legacy parameter handling in AsyncWebCrawler. Improve error handling and logging in crawl strategies. BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions
--- a/docs/examples/deepcrawl_example.py
+++ b/docs/examples/deepcrawl_example.py
@@ -0,0 +1,503 @@
+import asyncio
+import time
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import (
+    FilterChain,
+    URLPatternFilter,
+    DomainFilter,
+    ContentTypeFilter,
+    ContentRelevanceFilter,
+    SEOFilter,
+)
+from crawl4ai.deep_crawling.scorers import (
+    KeywordRelevanceScorer,
+)
+
+
+# 1️⃣ Basic Deep Crawl Setup
+async def basic_deep_crawl():
+    """
+    PART 1: Basic deep crawl - a breadth-first crawl limited to two levels.
+
+    Steps shown here:
+    - Building a BFSDeepCrawlStrategy with a depth limit and internal links only
+    - Running it through AsyncWebCrawler.arun()
+    - Bucketing the returned pages by their "depth" metadata and printing a
+      short sample of URLs per depth
+    """
+    print("\n===== BASIC DEEP CRAWL SETUP =====")
+
+    # Breadth-First Search strategy, two levels deep:
+    # max_depth=2 means: initial page (depth 0) + 2 more levels
+    # include_external=False means: only follow links within the same domain
+    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=bfs_strategy,
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        verbose=True,  # Show progress during crawling
+    )
+
+    # Number of example URLs printed for each depth level
+    sample_size = 3
+
+    async with AsyncWebCrawler() as crawler:
+        started = time.perf_counter()
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        # Bucket URLs by depth so the crawl tree can be shown level by level;
+        # results without a "depth" entry are counted as depth 0.
+        by_depth = {}
+        for page in results:
+            by_depth.setdefault(page.metadata.get("depth", 0), []).append(page.url)
+
+        total = len(results)
+        print(f"✅ Crawled {total} pages total")
+
+        # Walk the levels in ascending depth order
+        for level in sorted(by_depth):
+            level_urls = by_depth[level]
+            print(f"\nDepth {level}: {len(level_urls)} pages")
+            for url in level_urls[:sample_size]:
+                print(f"  → {url}")
+            hidden = len(level_urls) - sample_size
+            if hidden > 0:
+                print(f"  ... and {hidden} more")
+
+        elapsed = time.perf_counter() - started
+        print(f"\n✅ Performance: {total} pages in {elapsed:.2f} seconds")
+
+
+# 2️⃣ Stream vs. Non-Stream Execution
+async def stream_vs_nonstream():
+    """
+    PART 2: Demonstrates the difference between stream and non-stream execution.
+
+    Non-stream: ``arun`` is awaited and the complete result collection is
+    handled afterwards.
+    Stream: the awaited ``arun`` value is consumed with ``async for`` so each
+    result is handled as it is yielded.
+
+    Both runs share one base configuration (BFS, max_depth=1, internal links
+    only); only the ``stream`` flag set on the cloned config differs.
+    """
+    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")
+
+    # Common configuration for both examples
+    base_config = CrawlerRunConfig(
+        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        verbose=False,
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # NON-STREAMING MODE
+        print("\n📊 NON-STREAMING MODE:")
+        print("  In this mode, all results are collected before being returned.")
+
+        non_stream_config = base_config.clone()
+        non_stream_config.stream = False
+
+        start_time = time.perf_counter()
+        results = await crawler.arun(
+            url="https://docs.crawl4ai.com", config=non_stream_config
+        )
+
+        print(f"  ✅ Received all {len(results)} results at once")
+        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")
+
+        # STREAMING MODE
+        print("\n📊 STREAMING MODE:")
+        print("  In this mode, results are processed as they become available.")
+
+        stream_config = base_config.clone()
+        stream_config.stream = True
+
+        start_time = time.perf_counter()
+        result_count = 0
+        first_result_time = None  # stays None if the stream yields nothing
+
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=stream_config
+        ):
+            result_count += 1
+            if result_count == 1:
+                first_result_time = time.perf_counter() - start_time
+                print(
+                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
+                )
+            elif result_count % 5 == 0:  # Show every 5th result for brevity
+                print(f"  → Result #{result_count}: {result.url}")
+
+        print(f"  ✅ Total: {result_count} results")
+        # Formatting None with ":.2f" raises TypeError, so only report the
+        # first-result latency when at least one result actually arrived.
+        if first_result_time is not None:
+            print(f"  ✅ First result: {first_result_time:.2f} seconds")
+        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
+        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
+
+
+# 3️⃣ Introduce Filters & Scorers
+async def filters_and_scorers():
+    """
+    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
+
+    This function progressively adds:
+    1. A single URL pattern filter
+    2. Multiple filters in a chain
+    3. A keyword scorer passed to BestFirstCrawlingStrategy as ``url_scorer``
+
+    All three examples reuse one AsyncWebCrawler session and crawl one level
+    deep (max_depth=1) without following external links.
+    """
+    print("\n===== FILTERS AND SCORERS =====")
+
+    async with AsyncWebCrawler() as crawler:
+        # SINGLE FILTER EXAMPLE
+        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
+        print("  Only crawl pages containing 'core' in the URL")
+
+        # Create a filter that only allows URLs with 'core' in them
+        url_filter = URLPatternFilter(patterns=["*core*"])
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=1,
+                include_external=False,
+                filter_chain=FilterChain([url_filter]),  # Single filter
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            cache_mode=CacheMode.BYPASS,
+            verbose=True,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
+        for result in results[:3]:  # Show first 3 results
+            print(f"  → {result.url}")
+        if len(results) > 3:
+            print(f"  ... and {len(results) - 3} more")
+
+        # MULTIPLE FILTERS EXAMPLE
+        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
+        print("  Only crawl pages that:")
+        print("  1. Contain '2024' in the URL")
+        print("  2. Are from 'techcrunch.com'")
+        print("  3. Are of text/html or application/javascript content type")
+
+        # Create a chain of filters
+        filter_chain = FilterChain(
+            [
+                URLPatternFilter(patterns=["*2024*"]),
+                DomainFilter(
+                    allowed_domains=["techcrunch.com"],
+                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
+                ),
+                ContentTypeFilter(
+                    allowed_types=["text/html", "application/javascript"]
+                ),
+            ]
+        )
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=1, include_external=False, filter_chain=filter_chain
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+        )
+
+        results = await crawler.arun(url="https://techcrunch.com", config=config)
+
+        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
+        for result in results[:3]:
+            print(f"  → {result.url}")
+        if len(results) > 3:
+            print(f"  ... and {len(results) - 3} more")
+
+        # SCORERS EXAMPLE
+        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
+        print(
+            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
+        )
+
+        # Create a keyword relevance scorer
+        keyword_scorer = KeywordRelevanceScorer(
+            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
+        )
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BestFirstCrawlingStrategy(  
+                max_depth=1, include_external=False, url_scorer=keyword_scorer
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            cache_mode=CacheMode.BYPASS,
+            verbose=True,
+            stream=True,
+        )
+
+        # stream=True: consume results one by one as they are yielded
+        results = []
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            # Default to 0 so the ":.2f" format never receives None when a
+            # result carries no "score" metadata entry.
+            score = result.metadata.get("score", 0)
+            print(f"  → Score: {score:.2f} | {result.url}")
+
+        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
+        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
+
+
+# 4️⃣ Wrap-Up and Key Takeaways
+async def wrap_up():
+    """
+    PART 4: Wrap-Up and Key Takeaways
+
+    Runs one streamed crawl that combines a filter chain, a keyword scorer and
+    BestFirstCrawlingStrategy, then prints a summary: page count, duration,
+    average score (only when at least one page was returned) and the number
+    of pages per depth.
+    """
+    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
+    print("Combining filters, scorers, and streaming for an optimized crawl")
+
+    # Create a sophisticated filter chain
+    filter_chain = FilterChain(
+        [
+            DomainFilter(
+                allowed_domains=["docs.crawl4ai.com"],
+                blocked_domains=["old.docs.crawl4ai.com"],
+            ),
+            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
+            ContentTypeFilter(allowed_types=["text/html"]),
+        ]
+    )
+
+    # Create a keyword relevance scorer (handed to the strategy as url_scorer)
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["crawl", "example", "async", "configuration"], weight=0.7
+    )
+    # Set up the configuration
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=1,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=keyword_scorer,
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        stream=True,
+        verbose=True,
+    )
+
+    # Execute the crawl
+    results = []
+    start_time = time.perf_counter()
+
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+    duration = time.perf_counter() - start_time
+
+    # Summarize the results
+    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+    # Skip the average when nothing was crawled: dividing by len(results)
+    # would raise ZeroDivisionError on an empty result list.
+    if results:
+        avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
+        print(f"✅ Average score: {avg_score:.2f}")
+
+    # Group by depth
+    depth_counts = {}
+    for result in results:
+        depth = result.metadata.get("depth", 0)
+        depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+    print("\n📊 Pages crawled by depth:")
+    for depth, count in sorted(depth_counts.items()):
+        print(f"  Depth {depth}: {count} pages")
+
+
+# 5️⃣ Advanced Filters
+async def advanced_filters():
+    """
+    PART 5: Advanced filters - two single-filter BFS crawls of the docs site.
+
+    Sections:
+    - SEOFilter built from a threshold and a keyword list
+    - ContentRelevanceFilter built from a free-text query and a threshold
+
+    Each filter is wrapped in its own FilterChain and run with max_depth=1
+    and the cache bypassed.
+    """
+    print("\n===== ADVANCED FILTERS =====")
+
+    target_url = "https://docs.crawl4ai.com"
+
+    async with AsyncWebCrawler() as crawler:
+        # --- Example 1: SEO filter ---
+        print("\n📊 EXAMPLE 1: SEO FILTERS")
+        print(
+            "Quantitative SEO quality assessment filter based searching keywords in the head section"
+        )
+
+        seo_chain = FilterChain(
+            [SEOFilter(threshold=0.5, keywords=["dynamic", "interaction", "javascript"])]
+        )
+        seo_config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, filter_chain=seo_chain),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        seo_pages = await crawler.arun(url=target_url, config=seo_config)
+
+        print(f"  ✅ Found {len(seo_pages)} pages with relevant keywords")
+        for page in seo_pages:
+            print(f"  → {page.url}")
+
+        # --- Example 2: content relevance filter ---
+        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
+
+        # Filter driven by a natural-language query instead of URL patterns
+        relevance_chain = FilterChain(
+            [
+                ContentRelevanceFilter(
+                    query="Interact with the web using your authentic digital identity",
+                    threshold=0.7,
+                )
+            ]
+        )
+        relevance_config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=1, filter_chain=relevance_chain
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        relevant_pages = await crawler.arun(url=target_url, config=relevance_config)
+
+        print(f"  ✅ Found {len(relevant_pages)} pages")
+        for page in relevant_pages:
+            # Pages without a "relevance_score" metadata entry print as 0.00
+            page_score = page.metadata.get("relevance_score", 0)
+            print(f"  → Score: {page_score:.2f} | {page.url}")
+
+
+# 6️⃣ Max Pages and Score Thresholds
+async def max_pages_and_thresholds():
+    """
+    PART 6: Demonstrates the max_pages and score_threshold parameters.
+
+    This function shows:
+    - BFS with a max_pages limit
+    - DFS with a score_threshold plus a max_pages limit
+    - Best-First with a max_pages limit (no score threshold is passed to it)
+
+    All three examples share one KeywordRelevanceScorer and crawl
+    https://docs.crawl4ai.com with the cache bypassed. The limits are held in
+    local variables so the printed descriptions always match the values
+    actually passed to the strategies.
+    """
+    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
+
+    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+
+    async with AsyncWebCrawler() as crawler:
+        # Define a common keyword scorer for all examples
+        keyword_scorer = KeywordRelevanceScorer(
+            keywords=["browser", "crawler", "web", "automation"],
+            weight=1.0,
+        )
+
+        # EXAMPLE 1: BFS WITH MAX PAGES
+        bfs_max_pages = 5
+        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
+        print(f"  Limit the crawler to a maximum of {bfs_max_pages} pages")
+
+        bfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=bfs_max_pages,
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
+
+        print(f"  ✅ Crawled exactly {len(results)} pages as specified by max_pages")
+        for result in results:
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | {result.url}")
+
+        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
+        # The message is derived from the configured value; previously the
+        # text said 0.5 while the strategy was given 0.7.
+        dfs_score_threshold = 0.7
+        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
+        print(f"  Only crawl pages with a relevance score above {dfs_score_threshold}")
+
+        dfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=DFSDeepCrawlStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                score_threshold=dfs_score_threshold,
+                max_pages=10,
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
+
+        print(f"  ✅ Crawled {len(results)} pages with scores above threshold")
+        for result in results:
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
+        # Only max_pages is passed to this strategy, so the output no longer
+        # claims a 0.3 score cutoff that was never configured.
+        bf_max_pages = 7
+        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
+        print(f"  Limit to {bf_max_pages} pages, prioritizing highest scores")
+
+        bf_config = CrawlerRunConfig(
+            deep_crawl_strategy=BestFirstCrawlingStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=bf_max_pages,  # Limit on total pages
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+            stream=True,
+        )
+
+        # stream=True: consume results one by one as they are yielded
+        results = []
+        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+        print(f"  ✅ Crawled {len(results)} high-value pages")
+        if results:  # avoid dividing by zero when nothing was crawled
+            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
+            print(f"  ✅ Average score: {avg_score:.2f}")
+            print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
+
+async def run_tutorial():
+    """
+    Run the enabled tutorial sections one after another.
+
+    Prints an intro banner, awaits each section coroutine sequentially in the
+    order listed below (commented-out entries are skipped), then prints a
+    closing message.
+    """
+    intro_lines = (
+        "\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀",
+        "======================================",
+        "This tutorial will walk you through deep crawling techniques,",
+        "from basic to advanced, using the Crawl4AI library.",
+    )
+    for line in intro_lines:
+        print(line)
+
+    # Enabled sections, in execution order - uncomment entries to run
+    # specific parts during development
+    enabled_sections = (
+        # basic_deep_crawl,
+        # stream_vs_nonstream,
+        # filters_and_scorers,
+        max_pages_and_thresholds,  # Added new section
+        wrap_up,
+        advanced_filters,
+    )
+
+    # Sections are awaited one at a time, never concurrently
+    for run_section in enabled_sections:
+        await run_section()
+
+    outro_lines = (
+        "\n🎉 TUTORIAL COMPLETE! 🎉",
+        "You now have a comprehensive understanding of deep crawling with Crawl4AI.",
+        "For more information, check out https://docs.crawl4ai.com",
+    )
+    for line in outro_lines:
+        print(line)
+
+
+# Script entry point: when this file is executed directly (not imported),
+# run the run_tutorial() coroutine to completion with asyncio.run().
+if __name__ == "__main__":
+    asyncio.run(run_tutorial())