refactor(deep-crawl): add max_pages limit and improve crawl control
Add max_pages parameter to all deep crawling strategies to limit the total number of pages crawled. Add score_threshold parameter to the BFS and DFS strategies for quality control. Remove legacy parameter handling in AsyncWebCrawler. Improve error handling and logging in the crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
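For orientation, a minimal sketch of the two new knobs this commit introduces; the target URL, keywords, and values are illustrative, and the interfaces follow the diff below:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            url_scorer=KeywordRelevanceScorer(keywords=["crawler", "api"]),
            max_pages=20,         # new: hard cap on total pages crawled
            score_threshold=0.3,  # new (BFS/DFS): skip URLs scoring below this
        ),
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
        print(f"Crawled {len(results)} pages")

asyncio.run(main())
```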
@@ -80,7 +80,7 @@ async def stream_vs_nonstream():
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
-       verbose=True,
+       verbose=False,
    )

    async with AsyncWebCrawler() as crawler:
@@ -212,11 +212,11 @@ async def filters_and_scorers():

    # Create a keyword relevance scorer
    keyword_scorer = KeywordRelevanceScorer(
-       keywords=["crawl", "example", "async", "configuration", "javascript", "css"], weight=0.3
+       keywords=["crawl", "example", "async", "configuration", "javascript", "css"], weight=1
    )

    config = CrawlerRunConfig(
-       deep_crawl_strategy=BestFirstCrawlingStrategy(  # Note: Changed to BestFirst
+       deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1, include_external=False, url_scorer=keyword_scorer
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
@@ -373,6 +373,104 @@ async def advanced_filters():

# Main function to run the entire tutorial
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled
    - How to set score thresholds for more targeted crawling
    - Comparing BFS, DFS, and Best-First strategies with these parameters
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print("   Limit the crawler to a maximum of 5 pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=5  # Crawl at most 5 pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        print(f"   ✅ Crawled {len(results)} pages (capped by max_pages=5)")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f"   → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print("   Only crawl pages with a relevance score above 0.7")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=0.7,  # Only process URLs with scores above 0.7
                max_pages=10
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f"   ✅ Crawled {len(results)} pages with scores above the threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"   → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH A PAGE LIMIT
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH A PAGE LIMIT")
        print("   Limit to 7 pages, prioritizing the highest-scoring ones")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=7,  # Limit to 7 pages total
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"   → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f"   ✅ Crawled {len(results)} high-value pages, highest scores first")
        if results:
            avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
            print(f"   ✅ Average score: {avg_score:.2f}")
        print("   🔍 Note: BestFirstCrawlingStrategy visits the highest-scoring pages first")

async def run_tutorial():
    """
    Executes all tutorial sections in sequence.
@@ -384,9 +482,10 @@ async def run_tutorial():

    # Define sections - uncomment to run specific parts during development
    tutorial_sections = [
-       basic_deep_crawl,
-       stream_vs_nonstream,
-       filters_and_scorers,
+       # basic_deep_crawl,
+       # stream_vs_nonstream,
+       # filters_and_scorers,
+       max_pages_and_thresholds,  # Added new section
        wrap_up,
        advanced_filters,
    ]
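The body of run_tutorial is not shown in this hunk; presumably it awaits each enabled section in order, along the lines of this hypothetical sketch:

```python
# Hypothetical sketch (not part of the diff): run_tutorial presumably
# dispatches each enabled section in sequence.
for section in tutorial_sections:
    await section()
```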
@@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
strategy = BFSDeepCrawlStrategy(
    max_depth=2,               # Crawl initial page + 2 levels deep
    include_external=False,    # Stay within the same domain
+   max_pages=50,              # Maximum number of pages to crawl (optional)
+   score_threshold=0.3,       # Minimum score for URLs to be crawled (optional)
)
```

**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs
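The last two parameters are listed but not exercised in the snippet above; a minimal sketch combining them (a sketch using the import paths from the crawl4ai docs, with an illustrative domain and keywords, not the canonical example):

```python
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import DomainFilter, FilterChain
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# Filters prune URLs outright; the scorer ranks whatever remains.
strategy = BFSDeepCrawlStrategy(
    max_depth=2,
    max_pages=50,
    filter_chain=FilterChain([DomainFilter(allowed_domains=["docs.crawl4ai.com"])]),
    url_scorer=KeywordRelevanceScorer(keywords=["guide", "api", "reference"]),
)
```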
### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
@@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
strategy = DFSDeepCrawlStrategy(
    max_depth=2,               # Crawl initial page + 2 levels deep
    include_external=False,    # Stay within the same domain
+   max_pages=30,              # Maximum number of pages to crawl (optional)
+   score_threshold=0.5,       # Minimum score for URLs to be crawled (optional)
)
```

**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs

### 2.3 BestFirstCrawlingStrategy (⭐️ Recommended deep crawl strategy)
@@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer(
strategy = BestFirstCrawlingStrategy(
    max_depth=2,
    include_external=False,
-   url_scorer=scorer
+   url_scorer=scorer,
+   max_pages=25,  # Maximum number of pages to crawl (optional)
)
```
@@ -124,6 +137,8 @@ This crawling approach:
- Evaluates each discovered URL based on scorer criteria
- Visits higher-scoring pages first
- Helps focus crawl resources on the most relevant content
+- Can limit the total pages crawled with `max_pages`
+- Does not need `score_threshold`, as it naturally prioritizes by score

---
@@ -410,27 +425,64 @@ if __name__ == "__main__":
---

-## 8. Common Pitfalls & Tips
+## 8. Limiting and Controlling Crawl Size

-1. **Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size.

### 8.1 Using max_pages

You can limit the total number of pages crawled with the `max_pages` parameter:

```python
# Crawl at most 20 pages, regardless of depth
strategy = BFSDeepCrawlStrategy(
    max_depth=3,
    max_pages=20
)
```

This feature is useful for:
- Controlling API costs
- Setting predictable execution times
- Focusing on the most important content
- Testing crawl configurations before full execution
### 8.2 Using score_threshold

### 8.2 Using score_threshold

For the BFS and DFS strategies, you can set a minimum score threshold so that only sufficiently relevant pages are crawled:

```python
# Only follow links with scores above 0.4
strategy = DFSDeepCrawlStrategy(
    max_depth=2,
    url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
    score_threshold=0.4  # Skip URLs with scores below this value
)
```

Note that `score_threshold` is not needed for BestFirstCrawlingStrategy, since it already processes pages in order of highest score first.
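To illustrate that note, a minimal sketch (illustrative keywords and page count; import paths as used earlier in this tutorial) where `max_pages` alone plays the role a threshold plays for BFS/DFS, keeping roughly the N most relevant pages:

```python
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# No score_threshold: Best-First dequeues the highest-scoring URLs first,
# so capping the crawl keeps approximately the 10 most relevant pages.
strategy = BestFirstCrawlingStrategy(
    max_depth=2,
    url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
    max_pages=10,
)
```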

## 9. Common Pitfalls & Tips

1. **Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.

2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.

3. **Be a good web citizen.** Respect robots.txt (note that robots.txt checking is disabled by default).

-4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.status` when processing results.
+4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results, as sketched below.

5. **Balance breadth vs. depth.** Choose your strategy wisely: BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
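A minimal sketch of tip 4 (the URL and config are illustrative; `success` and `error_message` are the result fields named above):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

async def crawl_with_error_report():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, max_pages=10),
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
    # Report failures individually instead of letting them pass silently.
    failed = [r for r in results if not r.success]
    for r in failed:
        print(f"✗ {r.url}: {r.error_message}")
    print(f"✓ {len(results) - len(failed)} of {len(results)} pages succeeded")

asyncio.run(crawl_with_error_report())
```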
---

-## 9. Summary & Next Steps
+## 10. Summary & Next Steps

In this **Deep Crawling with Crawl4AI** tutorial, you learned to:

-- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy**
+- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
- Process results in streaming or non-streaming mode
- Apply filters to target specific content
- Use scorers to prioritize the most relevant pages
- Limit crawls with the `max_pages` and `score_threshold` parameters
- Build a complete advanced crawler with combined techniques

With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.