From 88a9fbbb7e507006bd5492c4bb83e0b4efa0614f Mon Sep 17 00:00:00 2001
From: ntohidi
Date: Mon, 11 Aug 2025 18:16:57 +0800
Subject: [PATCH] fix(deep-crawl): BestFirst priority inversion; remove pre-scoring
 truncation. ref #1253

Use negative scores in PQ to visit high-score URLs first and drop link cap
prior to scoring; add test for ordering.
---
Reviewer notes (this section is ignored by `git am`):
 - The line structure of this patch was rebuilt from a copy whose newlines
   had been collapsed. Indentation of the bff_strategy.py context lines and
   the content of whitespace-only lines are inferred; verify against the
   tree (or apply with --ignore-whitespace).
 - The post-image blob ids on the `index` lines predate the edits below.
 - bff_strategy.py: the comment above the initial queue.put() still said
   "score 0"; it now describes the negated-score push.
 - test_bff_scoring.py: the score statistics no longer raise
   ZeroDivisionError/ValueError when the crawl returns no results, the
   duplicated per-result print is emitted once, placeholder-less f-strings
   are plain strings, and the file ends with a newline.

 crawl4ai/deep_crawling/bff_strategy.py |  14 ++-
 tests/general/test_bff_scoring.py      | 118 +++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 9 deletions(-)
 create mode 100644 tests/general/test_bff_scoring.py

diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py
index 7779c9f4..82f8e184 100644
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -116,11 +116,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
 
             valid_links.append(base_url)
 
-        # If we have more valid links than capacity, limit them
-        if len(valid_links) > remaining_capacity:
-            valid_links = valid_links[:remaining_capacity]
-            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
-
         # Record the new depths and add to next_links
         for url in valid_links:
             depths[url] = new_depth
@@ -140,7 +135,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         """
         queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
-        # Push the initial URL with score 0 and depth 0.
-        await queue.put((0, 0, start_url, None))
+        # Push the initial URL with its negated score (lowest entry pops first) and depth 0.
+        initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
+        await queue.put((-initial_score, 0, start_url, None))
         visited: Set[str] = set()
         depths: Dict[str, int] = {start_url: 0}
 
@@ -187,7 +183,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 result.metadata = result.metadata or {}
                 result.metadata["depth"] = depth
                 result.metadata["parent_url"] = parent_url
-                result.metadata["score"] = score
+                result.metadata["score"] = -score
 
                 # Count only successful crawls toward max_pages limit
                 if result.success:
@@ -208,7 +204,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     for new_url, new_parent in new_links:
                         new_depth = depths.get(new_url, depth + 1)
                         new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
-                        await queue.put((new_score, new_depth, new_url, new_parent))
+                        await queue.put((-new_score, new_depth, new_url, new_parent))
 
         # End of crawl.
 
diff --git a/tests/general/test_bff_scoring.py b/tests/general/test_bff_scoring.py
new file mode 100644
index 00000000..d663d944
--- /dev/null
+++ b/tests/general/test_bff_scoring.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+Simple test to verify BestFirstCrawlingStrategy fixes.
+This test crawls a real website and shows that:
+1. Higher-scoring pages are crawled first (priority queue fix)
+2. Links are scored before truncation (link discovery fix)
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+async def test_best_first_strategy():
+    """Test BestFirstCrawlingStrategy with keyword scoring"""
+
+    print("=" * 70)
+    print("Testing BestFirstCrawlingStrategy with Real URL")
+    print("=" * 70)
+    print("\nThis test will:")
+    print("1. Crawl Python.org documentation")
+    print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
+    print("3. Show that higher-scoring pages are crawled first")
+    print("-" * 70)
+
+    # Create a keyword scorer that prioritizes tutorial/guide pages
+    scorer = KeywordRelevanceScorer(
+        keywords=["tutorial", "guide", "reference", "documentation"],
+        weight=1.0,
+        case_sensitive=False
+    )
+
+    # Create the strategy with scoring
+    strategy = BestFirstCrawlingStrategy(
+        max_depth=2,  # Crawl 2 levels deep
+        max_pages=10,  # Limit to 10 pages total
+        url_scorer=scorer,  # Use keyword scoring
+        include_external=False  # Only internal links
+    )
+
+    # Configure browser and crawler
+    browser_config = BrowserConfig(
+        headless=True,  # Run in background
+        verbose=False  # Reduce output noise
+    )
+
+    crawler_config = CrawlerRunConfig(
+        deep_crawl_strategy=strategy,
+        verbose=False
+    )
+
+    print("\nStarting crawl of https://docs.python.org/3/")
+    print("Looking for pages with keywords: tutorial, guide, reference, documentation")
+    print("-" * 70)
+
+    crawled_urls = []
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Crawl and collect results
+        results = await crawler.arun(
+            url="https://docs.python.org/3/",
+            config=crawler_config
+        )
+
+        # Process results
+        if isinstance(results, list):
+            for result in results:
+                score = result.metadata.get('score', 0) if result.metadata else 0
+                depth = result.metadata.get('depth', 0) if result.metadata else 0
+                crawled_urls.append({
+                    'url': result.url,
+                    'score': score,
+                    'depth': depth,
+                    'success': result.success
+                })
+
+    print("\n" + "=" * 70)
+    print("CRAWL RESULTS (in order of crawling)")
+    print("=" * 70)
+
+    for i, item in enumerate(crawled_urls, 1):
+        status = "✓" if item['success'] else "✗"
+        print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
+        # Highlight high-scoring pages
+        if item['score'] > 0.5:
+            print(" ^ HIGH SCORE - Contains keywords!")
+
+    print("\n" + "=" * 70)
+    print("ANALYSIS")
+    print("=" * 70)
+
+    # Check if higher scores appear early in the crawl
+    scores = [item['score'] for item in crawled_urls[1:]]  # Skip initial URL
+    high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]
+
+    if high_score_indices and high_score_indices[0] < len(scores) / 2:
+        print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
+        print(" This confirms the priority queue fix is working.")
+    else:
+        print("⚠️ Check the crawl order above - higher scores should appear early")
+
+    # Show score distribution. Guard the aggregates: an empty result list
+    # would otherwise raise ZeroDivisionError (mean) or ValueError (max).
+    all_scores = [item['score'] for item in crawled_urls]
+    print("\nScore Statistics:")
+    print(f" - Total pages crawled: {len(crawled_urls)}")
+    if all_scores:
+        print(f" - Average score: {sum(all_scores) / len(all_scores):.2f}")
+        print(f" - Max score: {max(all_scores):.2f}")
+    print(f" - Pages with keywords: {sum(1 for s in all_scores if s > 0.3)}")
+
+    print("\n" + "=" * 70)
+    print("TEST COMPLETE")
+    print("=" * 70)
+
+if __name__ == "__main__":
+    print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
+    asyncio.run(test_best_first_strategy())