refactor(deep-crawl): add max_pages limit and improve crawl control
Add max_pages parameter to all deep crawling strategies to limit total pages crawled. Add score_threshold parameter to BFS/DFS strategies for quality control. Remove legacy parameter handling in AsyncWebCrawler. Improve error handling and logging in crawl strategies. BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
This commit is contained in:
@@ -80,7 +80,7 @@ async def stream_vs_nonstream():
|
||||
base_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -212,11 +212,11 @@ async def filters_and_scorers():
|
||||
|
||||
# Create a keyword relevance scorer
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
|
||||
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=1, include_external=False, url_scorer=keyword_scorer
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
@@ -373,6 +373,104 @@ async def advanced_filters():
|
||||
|
||||
|
||||
# PART 6 demo follows; the main function that runs the entire tutorial (run_tutorial) is defined after it
|
||||
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled (BFS, Example 1)
    - How to set a score threshold for more targeted crawling (DFS, Example 2)
    - How Best-First crawling behaves with a max_pages limit (Example 3)

    Each example prints its own summary to stdout; nothing is returned.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    # Local import of the DFS strategy used in Example 2.
    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    start_url = "https://docs.crawl4ai.com"

    # Limits are named once so the printed descriptions cannot drift away
    # from the values actually handed to the strategies.
    bfs_max_pages = 5
    dfs_score_threshold = 0.7
    dfs_max_pages = 10
    best_first_max_pages = 7

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0,
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit the crawler to a maximum of {bfs_max_pages} pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=bfs_max_pages,  # Stop after this many pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url=start_url, config=bfs_config)

        # max_pages is described as a limit, so report the actual count next
        # to it instead of claiming the crawl hit the limit exactly.
        print(f" ✅ Crawled {len(results)} pages (max_pages={bfs_max_pages})")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print(f" Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,  # Threshold announced above
                max_pages=dfs_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url=start_url, config=dfs_config)

        print(f" ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # No score threshold is passed to this strategy, so the messages only
        # describe the max_pages limit and the score-based ordering.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit to {best_first_max_pages} pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=best_first_max_pages,  # Limit on total pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        # With stream=True the awaited arun() result is consumed with
        # `async for`, so each page is printed as it arrives.
        results = []
        async for result in await crawler.arun(url=start_url, config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawled {len(results)} pages (max_pages={best_first_max_pages})")
        if results:  # Guard against division by zero when nothing was crawled
            avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
            print(f" ✅ Average score: {avg_score:.2f}")
            print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
||||
|
||||
async def run_tutorial():
|
||||
"""
|
||||
Executes all tutorial sections in sequence.
|
||||
@@ -384,9 +482,10 @@ async def run_tutorial():
|
||||
|
||||
# Define sections - uncomment to run specific parts during development
|
||||
tutorial_sections = [
|
||||
basic_deep_crawl,
|
||||
stream_vs_nonstream,
|
||||
filters_and_scorers,
|
||||
# basic_deep_crawl,
|
||||
# stream_vs_nonstream,
|
||||
# filters_and_scorers,
|
||||
max_pages_and_thresholds, # Added new section
|
||||
wrap_up,
|
||||
advanced_filters,
|
||||
]
|
||||
Reference in New Issue
Block a user