Compare commits
1 commit

feature/do ... fix/deep-c

| Author | SHA1 | Date |
|---|---|---|
| | 88a9fbbb7e | |
@@ -116,11 +116,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     valid_links.append(base_url)
 
-            # If we have more valid links than capacity, limit them
-            if len(valid_links) > remaining_capacity:
-                valid_links = valid_links[:remaining_capacity]
-                self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
-
             # Record the new depths and add to next_links
             for url in valid_links:
                 depths[url] = new_depth
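The block removed above capped `valid_links` to the remaining `max_pages` capacity during link discovery, before any scoring had happened, so high-value links could be dropped purely by discovery order. The companion test below describes the intended behaviour as "links are scored before truncation". A minimal sketch of that idea, using an illustrative scorer and helper that are not part of the library's API:

```python
from typing import Callable, List

def pick_links(links: List[str], remaining_capacity: int,
               score: Callable[[str], float]) -> List[str]:
    # Score every discovered link first, then keep the highest-scoring ones
    # instead of truncating the raw list in discovery order.
    ranked = sorted(links, key=score, reverse=True)
    return ranked[:remaining_capacity]

links = ["/about", "/tutorial/intro", "/contact", "/guide/setup"]
best = pick_links(links, 2, lambda url: float("tutorial" in url or "guide" in url))
print(best)  # ['/tutorial/intro', '/guide/setup']
```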
@@ -140,7 +135,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         """
         queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
         # Push the initial URL with score 0 and depth 0.
-        await queue.put((0, 0, start_url, None))
+        initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
+        await queue.put((-initial_score, 0, start_url, None))
         visited: Set[str] = set()
         depths: Dict[str, int] = {start_url: 0}
 
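`asyncio.PriorityQueue` is a min-heap: `get()` returns the entry whose first tuple element is smallest, so pushing the raw score meant low-scoring URLs were dequeued first. Storing `-score` inverts the order so the highest-scoring URL comes out first, which is what this hunk (and the identical change further down) does. A small self-contained sketch of that ordering, with made-up URLs and scores:

```python
import asyncio

async def main() -> None:
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
    # Hypothetical (url, score) pairs; a higher score means more relevant.
    for url, score in [("https://example.com/a", 0.2),
                       ("https://example.com/tutorial", 0.9),
                       ("https://example.com/b", 0.5)]:
        # Negate the score so the min-heap yields the highest score first.
        await queue.put((-score, 0, url, None))

    while not queue.empty():
        neg_score, depth, url, parent = await queue.get()
        print(f"{-neg_score:.1f}  {url}")  # prints 0.9, 0.5, 0.2 in that order

asyncio.run(main())
```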
@@ -187,7 +183,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
             result.metadata = result.metadata or {}
             result.metadata["depth"] = depth
             result.metadata["parent_url"] = parent_url
-            result.metadata["score"] = score
+            result.metadata["score"] = -score
 
             # Count only successful crawls toward max_pages limit
             if result.success:
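Because the queued tuples now carry negated scores, the `score` unpacked from a queue entry is presumably the negated value; writing `-score` into the metadata restores the original non-negative score for consumers such as the test below. A tiny round-trip sketch, assuming a plain float score:

```python
score = 0.8                                                 # value produced by the URL scorer
entry = (-score, 1, "https://example.com/tutorial", None)   # as pushed onto the priority queue
queued_score, depth, url, parent = entry                    # as popped from the queue
metadata = {"score": -queued_score, "depth": depth, "parent_url": parent}
assert metadata["score"] == 0.8                             # original positive score is restored
```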
@@ -208,7 +204,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 for new_url, new_parent in new_links:
                     new_depth = depths.get(new_url, depth + 1)
                     new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
-                    await queue.put((new_score, new_depth, new_url, new_parent))
+                    await queue.put((-new_score, new_depth, new_url, new_parent))
 
         # End of crawl.
tests/general/test_bff_scoring.py (new file, 117 lines)

@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Simple test to verify BestFirstCrawlingStrategy fixes.
+This test crawls a real website and shows that:
+1. Higher-scoring pages are crawled first (priority queue fix)
+2. Links are scored before truncation (link discovery fix)
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+async def test_best_first_strategy():
+    """Test BestFirstCrawlingStrategy with keyword scoring"""
+
+    print("=" * 70)
+    print("Testing BestFirstCrawlingStrategy with Real URL")
+    print("=" * 70)
+    print("\nThis test will:")
+    print("1. Crawl Python.org documentation")
+    print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
+    print("3. Show that higher-scoring pages are crawled first")
+    print("-" * 70)
+
+    # Create a keyword scorer that prioritizes tutorial/guide pages
+    scorer = KeywordRelevanceScorer(
+        keywords=["tutorial", "guide", "reference", "documentation"],
+        weight=1.0,
+        case_sensitive=False
+    )
+
+    # Create the strategy with scoring
+    strategy = BestFirstCrawlingStrategy(
+        max_depth=2,  # Crawl 2 levels deep
+        max_pages=10,  # Limit to 10 pages total
+        url_scorer=scorer,  # Use keyword scoring
+        include_external=False  # Only internal links
+    )
+
+    # Configure browser and crawler
+    browser_config = BrowserConfig(
+        headless=True,  # Run in background
+        verbose=False  # Reduce output noise
+    )
+
+    crawler_config = CrawlerRunConfig(
+        deep_crawl_strategy=strategy,
+        verbose=False
+    )
+
+    print("\nStarting crawl of https://docs.python.org/3/")
+    print("Looking for pages with keywords: tutorial, guide, reference, documentation")
+    print("-" * 70)
+
+    crawled_urls = []
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Crawl and collect results
+        results = await crawler.arun(
+            url="https://docs.python.org/3/",
+            config=crawler_config
+        )
+
+        # Process results
+        if isinstance(results, list):
+            for result in results:
+                score = result.metadata.get('score', 0) if result.metadata else 0
+                depth = result.metadata.get('depth', 0) if result.metadata else 0
+                crawled_urls.append({
+                    'url': result.url,
+                    'score': score,
+                    'depth': depth,
+                    'success': result.success
+                })
+
+    print("\n" + "=" * 70)
+    print("CRAWL RESULTS (in order of crawling)")
+    print("=" * 70)
+
+    for i, item in enumerate(crawled_urls, 1):
+        status = "✓" if item['success'] else "✗"
+        # Highlight high-scoring pages
+        if item['score'] > 0.5:
+            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
+            print(f"    ^ HIGH SCORE - Contains keywords!")
+        else:
+            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
+
+    print("\n" + "=" * 70)
+    print("ANALYSIS")
+    print("=" * 70)
+
+    # Check if higher scores appear early in the crawl
+    scores = [item['score'] for item in crawled_urls[1:]]  # Skip initial URL
+    high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]
+
+    if high_score_indices and high_score_indices[0] < len(scores) / 2:
+        print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
+        print("   This confirms the priority queue fix is working.")
+    else:
+        print("⚠️ Check the crawl order above - higher scores should appear early")
+
+    # Show score distribution
+    print(f"\nScore Statistics:")
+    print(f"  - Total pages crawled: {len(crawled_urls)}")
+    print(f"  - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
+    print(f"  - Max score: {max(item['score'] for item in crawled_urls):.2f}")
+    print(f"  - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")
+
+    print("\n" + "=" * 70)
+    print("TEST COMPLETE")
+    print("=" * 70)
+
+if __name__ == "__main__":
+    print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
+    asyncio.run(test_best_first_strategy())