## Deep Crawling
Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies.
### Basic Deep Crawl Setup
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
# Basic breadth-first deep crawling
async def basic_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,  # Initial page + 2 levels
            include_external=False  # Stay within same domain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://docs.crawl4ai.com", config=config)
        # Group results by depth
        pages_by_depth = {}
        for result in results:
            depth = result.metadata.get("depth", 0)
            if depth not in pages_by_depth:
                pages_by_depth[depth] = []
            pages_by_depth[depth].append(result.url)
        print(f"Crawled {len(results)} pages total")
        for depth, urls in sorted(pages_by_depth.items()):
            print(f"Depth {depth}: {len(urls)} pages")
```
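Each snippet in this section defines an `async` coroutine; to run one, wrap it in an event loop. A minimal entry point (assuming the `basic_deep_crawl` function defined above; nothing here is Crawl4AI-specific, just standard asyncio) might look like:
```python
import asyncio

if __name__ == "__main__":
    # Run the coroutine defined above in a fresh event loop
    asyncio.run(basic_deep_crawl())
```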
### Deep Crawl Strategies
```python
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Breadth-First Search - explores all links at one depth before going deeper
bfs_strategy = BFSDeepCrawlStrategy(
    max_depth=2,
    include_external=False,
    max_pages=50,  # Limit total pages
    score_threshold=0.3  # Minimum score for URLs
)
# Depth-First Search - explores as deep as possible before backtracking
dfs_strategy = DFSDeepCrawlStrategy(
    max_depth=2,
    include_external=False,
    max_pages=30,
    score_threshold=0.5
)
# Best-First - prioritizes highest scoring pages (recommended)
keyword_scorer = KeywordRelevanceScorer(
    keywords=["crawl", "example", "async", "configuration"],
    weight=0.7
)
best_first_strategy = BestFirstCrawlingStrategy(
    max_depth=2,
    include_external=False,
    url_scorer=keyword_scorer,
    max_pages=25  # No score_threshold needed - naturally prioritizes
)
# Usage
config = CrawlerRunConfig(
    deep_crawl_strategy=best_first_strategy,  # Choose your strategy
    scraping_strategy=LXMLWebScrapingStrategy()
)
```
### Streaming vs Batch Processing
```python
# Batch mode - wait for all results
async def batch_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
        stream=False  # Default - collect all results first
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://example.com", config=config)
        # Process all results at once
        for result in results:
            print(f"Batch processed: {result.url}")

# Streaming mode - process results as they arrive
async def streaming_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
        stream=True  # Process results immediately
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://example.com", config=config):
            depth = result.metadata.get("depth", 0)
            print(f"Stream processed depth {depth}: {result.url}")
```
### Filtering with Filter Chains
```python
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
    SEOFilter,
    ContentRelevanceFilter
)
# Single URL pattern filter
url_filter = URLPatternFilter(patterns=["*core*", "*guide*"])
config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(
        max_depth=1,
        filter_chain=FilterChain([url_filter])
    )
)
# Multiple filters in chain
advanced_filter_chain = FilterChain([
    # Domain filtering
    DomainFilter(
        allowed_domains=["docs.example.com"],
        blocked_domains=["old.docs.example.com", "staging.example.com"]
    ),
    # URL pattern matching
    URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]),
    # Content type filtering
    ContentTypeFilter(allowed_types=["text/html"]),
    # SEO quality filter
    SEOFilter(
        threshold=0.5,
        keywords=["tutorial", "guide", "documentation"]
    ),
    # Content relevance filter
    ContentRelevanceFilter(
        query="Web crawling and data extraction with Python",
        threshold=0.7
    )
])
config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=advanced_filter_chain
    )
)
```
### Intelligent Crawling with Scorers
```python
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Keyword relevance scoring
async def scored_deep_crawl():
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["browser", "crawler", "web", "automation"],
        weight=1.0
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            url_scorer=keyword_scorer
        ),
        stream=True,  # Recommended with BestFirst
        verbose=True
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
```
### Limiting Crawl Size
```python
# Max pages limitation across strategies
async def limited_crawls():
    # BFS with page limit
    bfs_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,  # Only crawl 5 pages total
            url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0)
        )
    )
    # DFS with score threshold
    dfs_config = CrawlerRunConfig(
        deep_crawl_strategy=DFSDeepCrawlStrategy(
            max_depth=2,
            score_threshold=0.7,  # Only URLs with scores above 0.7
            max_pages=10,
            url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0)
        )
    )
    # Best-First with both constraints
    bf_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=7,  # Automatically gets highest scored pages
            url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0)
        ),
        stream=True
    )
    async with AsyncWebCrawler() as crawler:
        # Use any of the configs
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config):
            score = result.metadata.get("score", 0)
            print(f"Score: {score:.2f} | {result.url}")
```
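As a rule of thumb from the configs above: `max_pages` hard-caps the total number of pages fetched regardless of depth, while `score_threshold` skips URLs whose relevance score falls below the cutoff before they are crawled; combining both keeps the crawl small and focused on the most relevant links.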
### Complete Advanced Deep Crawler
```python
import time
from crawl4ai import CacheMode

async def comprehensive_deep_crawl():
    # Sophisticated filter chain
    filter_chain = FilterChain([
        DomainFilter(
            allowed_domains=["docs.crawl4ai.com"],
            blocked_domains=["old.docs.crawl4ai.com"]
        ),
        URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
        ContentTypeFilter(allowed_types=["text/html"]),
        SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"])
    ])
    # Multi-keyword scorer
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration", "browser"],
        weight=0.8
    )
    # Complete configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
            max_pages=20
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        cache_mode=CacheMode.BYPASS
    )
    # Execute and analyze
    results = []
    start_time = time.time()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    # Performance analysis
    duration = time.time() - start_time
    avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
    print(f"✅ Crawled {len(results)} pages in {duration:.2f}s")
    print(f"✅ Average relevance score: {avg_score:.2f}")
    # Depth distribution
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    for depth, count in sorted(depth_counts.items()):
        print(f"📊 Depth {depth}: {count} pages")
```
### Error Handling and Robustness
```python
async def robust_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=15,
            url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"])
        ),
        stream=True,
        page_timeout=30000  # 30 second timeout per page
    )
    successful_pages = []
    failed_pages = []
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            if result.success:
                successful_pages.append(result)
                depth = result.metadata.get("depth", 0)
                score = result.metadata.get("score", 0)
                print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
            else:
                failed_pages.append({
                    'url': result.url,
                    'error': result.error_message,
                    'depth': result.metadata.get("depth", 0)
                })
                print(f"❌ Failed: {result.url} - {result.error_message}")
    print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed")
    # Analyze failures by depth
    if failed_pages:
        failure_by_depth = {}
        for failure in failed_pages:
            depth = failure['depth']
            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
        print("❌ Failures by depth:")
        for depth, count in sorted(failure_by_depth.items()):
            print(f" Depth {depth}: {count} failures")
```
**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/)