refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
This commit is contained in:
UncleCode
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions

View File

@@ -80,7 +80,7 @@ async def stream_vs_nonstream():
base_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
verbose=False,
)
async with AsyncWebCrawler() as crawler:
@@ -212,11 +212,11 @@ async def filters_and_scorers():
# Create a keyword relevance scorer
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
)
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1, include_external=False, url_scorer=keyword_scorer
),
scraping_strategy=LXMLWebScrapingStrategy(),
@@ -373,6 +373,104 @@ async def advanced_filters():
# Main function to run the entire tutorial
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled (BFS with max_pages)
    - How to set a score threshold for more targeted crawling (DFS with score_threshold)
    - How Best-First ordering behaves under a max_pages limit

    Each example crawls https://docs.crawl4ai.com with the cache bypassed and
    prints the depth (and, where relevant, the score) read from each result's
    metadata. Nothing is returned; all output goes to stdout.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    # Each limit is bound to a single name so that the configuration and the
    # messages printed about it cannot drift apart.
    bfs_max_pages = 5
    dfs_score_threshold = 0.7
    dfs_max_pages = 10
    bf_max_pages = 7

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0,
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit the crawler to a maximum of {bfs_max_pages} pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=bfs_max_pages,  # Crawl at most this many pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        # max_pages is an upper bound, so report the actual count next to the limit
        # rather than claiming the crawl hit it exactly.
        print(f" ✅ Crawled {len(results)} pages (max_pages limit: {bfs_max_pages})")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print(f" Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,  # Only process URLs scoring above this
                max_pages=dfs_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f" ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # No score threshold is configured here; only the page limit applies,
        # so the messages below do not claim any score cut-off.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit to {bf_max_pages} pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=bf_max_pages,  # Limit to this many pages total
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        # With stream=True the awaited arun() result is consumed with `async for`,
        # so each page is printed as it arrives.
        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawled {len(results)} high-value pages (max_pages limit: {bf_max_pages})")
        if results:
            # Guarded by `if results` to avoid dividing by zero on an empty crawl.
            avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
            print(f" ✅ Average score: {avg_score:.2f}")
            print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
async def run_tutorial():
"""
Executes all tutorial sections in sequence.
@@ -384,9 +482,10 @@ async def run_tutorial():
# Define sections - uncomment to run specific parts during development
tutorial_sections = [
basic_deep_crawl,
stream_vs_nonstream,
filters_and_scorers,
# basic_deep_crawl,
# stream_vs_nonstream,
# filters_and_scorers,
max_pages_and_thresholds, # Added new section
wrap_up,
advanced_filters,
]