Compare commits
1 commit

feature/do ... fix/deep-c

| Author | SHA1 | Date |
|---|---|---|
| | 88a9fbbb7e | |
@@ -116,11 +116,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     valid_links.append(base_url)
 
-            # If we have more valid links than capacity, limit them
-            if len(valid_links) > remaining_capacity:
-                valid_links = valid_links[:remaining_capacity]
-                self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
-
             # Record the new depths and add to next_links
             for url in valid_links:
                 depths[url] = new_depth
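The block removed above capped `valid_links` to the remaining `max_pages` capacity during link discovery, before any scoring had happened, so high-value links could be dropped purely by discovery order. The companion test below describes the intended behaviour as "links are scored before truncation". A minimal sketch of that idea, using an illustrative scorer and helper that are not part of the library's API:

```python
from typing import Callable, List

def pick_links(links: List[str], remaining_capacity: int,
               score: Callable[[str], float]) -> List[str]:
    # Score every discovered link first, then keep the highest-scoring ones
    # instead of truncating the raw list in discovery order.
    ranked = sorted(links, key=score, reverse=True)
    return ranked[:remaining_capacity]

links = ["/about", "/tutorial/intro", "/contact", "/guide/setup"]
best = pick_links(links, 2, lambda url: float("tutorial" in url or "guide" in url))
print(best)  # ['/tutorial/intro', '/guide/setup']
```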
@@ -140,7 +135,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         """
         queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
         # Push the initial URL with score 0 and depth 0.
-        await queue.put((0, 0, start_url, None))
+        initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
+        await queue.put((-initial_score, 0, start_url, None))
         visited: Set[str] = set()
         depths: Dict[str, int] = {start_url: 0}
 
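`asyncio.PriorityQueue` is a min-heap: `get()` returns the entry whose first tuple element is smallest, so pushing the raw score meant low-scoring URLs were dequeued first. Storing `-score` inverts the order so the highest-scoring URL comes out first, which is what this hunk (and the identical change further down) does. A small self-contained sketch of that ordering, with made-up URLs and scores:

```python
import asyncio

async def main() -> None:
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
    # Hypothetical (url, score) pairs; a higher score means more relevant.
    for url, score in [("https://example.com/a", 0.2),
                       ("https://example.com/tutorial", 0.9),
                       ("https://example.com/b", 0.5)]:
        # Negate the score so the min-heap yields the highest score first.
        await queue.put((-score, 0, url, None))

    while not queue.empty():
        neg_score, depth, url, parent = await queue.get()
        print(f"{-neg_score:.1f}  {url}")  # prints 0.9, 0.5, 0.2 in that order

asyncio.run(main())
```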
@@ -187,7 +183,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
             result.metadata = result.metadata or {}
             result.metadata["depth"] = depth
             result.metadata["parent_url"] = parent_url
-            result.metadata["score"] = score
+            result.metadata["score"] = -score
 
             # Count only successful crawls toward max_pages limit
             if result.success:
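Because the queued tuples now carry negated scores, the `score` unpacked from a queue entry is presumably the negated value; writing `-score` into the metadata restores the original non-negative score for consumers such as the test below. A tiny round-trip sketch, assuming a plain float score:

```python
score = 0.8                                                 # value produced by the URL scorer
entry = (-score, 1, "https://example.com/tutorial", None)   # as pushed onto the priority queue
queued_score, depth, url, parent = entry                    # as popped from the queue
metadata = {"score": -queued_score, "depth": depth, "parent_url": parent}
assert metadata["score"] == 0.8                             # original positive score is restored
```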
@@ -208,7 +204,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 for new_url, new_parent in new_links:
                     new_depth = depths.get(new_url, depth + 1)
                     new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
-                    await queue.put((new_score, new_depth, new_url, new_parent))
+                    await queue.put((-new_score, new_depth, new_url, new_parent))
 
         # End of crawl.
tests/general/test_bff_scoring.py (new file, 117 lines)

@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Simple test to verify BestFirstCrawlingStrategy fixes.
+This test crawls a real website and shows that:
+1. Higher-scoring pages are crawled first (priority queue fix)
+2. Links are scored before truncation (link discovery fix)
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+async def test_best_first_strategy():
+    """Test BestFirstCrawlingStrategy with keyword scoring"""
+
+    print("=" * 70)
+    print("Testing BestFirstCrawlingStrategy with Real URL")
+    print("=" * 70)
+    print("\nThis test will:")
+    print("1. Crawl Python.org documentation")
+    print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
+    print("3. Show that higher-scoring pages are crawled first")
+    print("-" * 70)
+
+    # Create a keyword scorer that prioritizes tutorial/guide pages
+    scorer = KeywordRelevanceScorer(
+        keywords=["tutorial", "guide", "reference", "documentation"],
+        weight=1.0,
+        case_sensitive=False
+    )
+
+    # Create the strategy with scoring
+    strategy = BestFirstCrawlingStrategy(
+        max_depth=2,  # Crawl 2 levels deep
+        max_pages=10,  # Limit to 10 pages total
+        url_scorer=scorer,  # Use keyword scoring
+        include_external=False  # Only internal links
+    )
+
+    # Configure browser and crawler
+    browser_config = BrowserConfig(
+        headless=True,  # Run in background
+        verbose=False  # Reduce output noise
+    )
+
+    crawler_config = CrawlerRunConfig(
+        deep_crawl_strategy=strategy,
+        verbose=False
+    )
+
+    print("\nStarting crawl of https://docs.python.org/3/")
+    print("Looking for pages with keywords: tutorial, guide, reference, documentation")
+    print("-" * 70)
+
+    crawled_urls = []
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Crawl and collect results
+        results = await crawler.arun(
+            url="https://docs.python.org/3/",
+            config=crawler_config
+        )
+
+        # Process results
+        if isinstance(results, list):
+            for result in results:
+                score = result.metadata.get('score', 0) if result.metadata else 0
+                depth = result.metadata.get('depth', 0) if result.metadata else 0
+                crawled_urls.append({
+                    'url': result.url,
+                    'score': score,
+                    'depth': depth,
+                    'success': result.success
+                })
+
+    print("\n" + "=" * 70)
+    print("CRAWL RESULTS (in order of crawling)")
+    print("=" * 70)
+
+    for i, item in enumerate(crawled_urls, 1):
+        status = "✓" if item['success'] else "✗"
+        # Highlight high-scoring pages
+        if item['score'] > 0.5:
+            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
+            print(f"    ^ HIGH SCORE - Contains keywords!")
+        else:
+            print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
+
+    print("\n" + "=" * 70)
+    print("ANALYSIS")
+    print("=" * 70)
+
+    # Check if higher scores appear early in the crawl
+    scores = [item['score'] for item in crawled_urls[1:]]  # Skip initial URL
+    high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]
+
+    if high_score_indices and high_score_indices[0] < len(scores) / 2:
+        print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
+        print("   This confirms the priority queue fix is working.")
+    else:
+        print("⚠️ Check the crawl order above - higher scores should appear early")
+
+    # Show score distribution
+    print(f"\nScore Statistics:")
+    print(f"  - Total pages crawled: {len(crawled_urls)}")
+    print(f"  - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
+    print(f"  - Max score: {max(item['score'] for item in crawled_urls):.2f}")
+    print(f"  - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")
+
+    print("\n" + "=" * 70)
+    print("TEST COMPLETE")
+    print("=" * 70)
+
+if __name__ == "__main__":
+    print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
+    asyncio.run(test_best_first_strategy())