Compare commits
5 Commits
fix/playwr
...
fix/releas
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
dd5ee752cf |
17
README.md
17
README.md
@@ -523,15 +523,18 @@ async def test_news_crawl():
|
|||||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||||
```python
|
```python
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7,
|
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||||
max_history=100,
|
max_depth=5, # Maximum crawl depth
|
||||||
learning_rate=0.2
|
max_pages=20, # Maximum number of pages to crawl
|
||||||
|
strategy="statistical"
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(
|
async with AsyncWebCrawler() as crawler:
|
||||||
"https://news.example.com",
|
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
state = await adaptive_crawler.digest(
|
||||||
)
|
start_url="https://news.example.com",
|
||||||
|
query="latest news content"
|
||||||
|
)
|
||||||
# Crawler learns patterns and improves extraction over time
|
# Crawler learns patterns and improves extraction over time
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
|||||||
|
|
||||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
- **Link Preview with Intelligent Scoring**: Link analysis and prioritization
|
||||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||||
- **PDF Parsing**: Extract data from PDF documents
|
|
||||||
- **Performance Optimizations**: Significant speed and memory improvements
|
- **Performance Optimizations**: Significant speed and memory improvements
|
||||||
|
|
||||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||||
@@ -30,44 +29,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
- Extraction confidence scores
|
- Extraction confidence scores
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
|
||||||
# Initialize with custom learning parameters
|
# Initialize with custom adaptive parameters
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||||
max_history=100, # Remember last 100 crawls per domain
|
max_depth=5, # Maximum crawl depth
|
||||||
learning_rate=0.2, # How quickly to adapt to changes
|
max_pages=20, # Maximum number of pages to crawl
|
||||||
patterns_per_page=3, # Patterns to learn per page type
|
top_k_links=3, # Number of top links to follow per page
|
||||||
extraction_strategy='css' # 'css' or 'xpath'
|
strategy="statistical", # 'statistical' or 'embedding'
|
||||||
|
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
||||||
|
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
||||||
|
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
||||||
)
|
)
|
||||||
|
|
||||||
adaptive_crawler = AdaptiveCrawler(config)
|
# Initialize adaptive crawler with web crawler
|
||||||
|
|
||||||
# First crawl - crawler learns the structure
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||||
"https://news.example.com/article/12345",
|
|
||||||
config=CrawlerRunConfig(
|
# Crawl and learn patterns
|
||||||
adaptive_config=config,
|
state = await adaptive_crawler.digest(
|
||||||
extraction_hints={ # Optional hints to speed up learning
|
start_url="https://news.example.com/article/12345",
|
||||||
"title": "article h1",
|
query="latest news articles and content"
|
||||||
"content": "article .body-content"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Crawler identifies and stores patterns
|
# Access results and confidence
|
||||||
if result.success:
|
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
||||||
state = adaptive_crawler.get_state("news.example.com")
|
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
||||||
print(f"Learned {len(state.patterns)} patterns")
|
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
||||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
|
||||||
|
|
||||||
# Subsequent crawls - uses learned patterns
|
|
||||||
result2 = await crawler.arun(
|
|
||||||
"https://news.example.com/article/67890",
|
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
|
||||||
)
|
|
||||||
# Automatically extracts using learned patterns!
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -92,9 +81,7 @@ twitter_config = VirtualScrollConfig(
|
|||||||
container_selector="[data-testid='primaryColumn']",
|
container_selector="[data-testid='primaryColumn']",
|
||||||
scroll_count=20, # Number of scrolls
|
scroll_count=20, # Number of scrolls
|
||||||
scroll_by="container_height", # Smart scrolling by container size
|
scroll_by="container_height", # Smart scrolling by container size
|
||||||
wait_after_scroll=1.0, # Let content load
|
wait_after_scroll=1.0 # Let content load
|
||||||
capture_method="incremental", # Capture new content on each scroll
|
|
||||||
deduplicate=True # Remove duplicate elements
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For e-commerce product grids (Instagram style)
|
# For e-commerce product grids (Instagram style)
|
||||||
@@ -102,8 +89,7 @@ grid_config = VirtualScrollConfig(
|
|||||||
container_selector="main .product-grid",
|
container_selector="main .product-grid",
|
||||||
scroll_count=30,
|
scroll_count=30,
|
||||||
scroll_by=800, # Fixed pixel scrolling
|
scroll_by=800, # Fixed pixel scrolling
|
||||||
wait_after_scroll=1.5, # Images need time
|
wait_after_scroll=1.5 # Images need time
|
||||||
stop_on_no_change=True # Smart stopping
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For news feeds with lazy loading
|
# For news feeds with lazy loading
|
||||||
@@ -111,9 +97,7 @@ news_config = VirtualScrollConfig(
|
|||||||
container_selector=".article-feed",
|
container_selector=".article-feed",
|
||||||
scroll_count=50,
|
scroll_count=50,
|
||||||
scroll_by="page_height", # Viewport-based scrolling
|
scroll_by="page_height", # Viewport-based scrolling
|
||||||
wait_after_scroll=0.5,
|
wait_after_scroll=0.5 # Wait for content to load
|
||||||
wait_for_selector=".article-card", # Wait for specific elements
|
|
||||||
timeout=30000 # Max 30 seconds total
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use it in your crawl
|
# Use it in your crawl
|
||||||
@@ -160,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
### The Three-Layer Scoring System
|
### The Three-Layer Scoring System
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
# Configure intelligent link analysis
|
||||||
link_config = LinkPreviewConfig(
|
link_config = LinkPreviewConfig(
|
||||||
# What to analyze
|
|
||||||
include_internal=True,
|
include_internal=True,
|
||||||
include_external=True,
|
include_external=False,
|
||||||
max_links=100, # Analyze top 100 links
|
max_links=10,
|
||||||
|
concurrency=5,
|
||||||
# Relevance scoring
|
query="python tutorial", # For contextual scoring
|
||||||
query="machine learning tutorials", # Your interest
|
score_threshold=0.3,
|
||||||
score_threshold=0.3, # Minimum relevance score
|
verbose=True
|
||||||
|
|
||||||
# Performance
|
|
||||||
concurrent_requests=10, # Parallel processing
|
|
||||||
timeout_per_link=5000, # 5s per link
|
|
||||||
|
|
||||||
# Advanced scoring weights
|
|
||||||
scoring_weights={
|
|
||||||
"intrinsic": 0.3, # Link quality indicators
|
|
||||||
"contextual": 0.5, # Relevance to query
|
|
||||||
"popularity": 0.2 # Link prominence
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use in your crawl
|
# Use in your crawl
|
||||||
@@ -190,35 +162,51 @@ result = await crawler.arun(
|
|||||||
"https://tech-blog.example.com",
|
"https://tech-blog.example.com",
|
||||||
config=CrawlerRunConfig(
|
config=CrawlerRunConfig(
|
||||||
link_preview_config=link_config,
|
link_preview_config=link_config,
|
||||||
score_links=True
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
if result.success and result.links:
|
||||||
print(f"Score: {link['total_score']:.3f}")
|
# Get scored links
|
||||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
internal_links = result.links.get("internal", [])
|
||||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
scored_links = [l for l in internal_links if l.get("total_score")]
|
||||||
print(f" URL: {link['href']}")
|
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
||||||
print(f" Title: {link['head_data']['title']}")
|
|
||||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
# Create a scoring table
|
||||||
|
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
||||||
|
table.add_column("Link Text", style="cyan", width=40)
|
||||||
|
table.add_column("Intrinsic Score", justify="center")
|
||||||
|
table.add_column("Contextual Score", justify="center")
|
||||||
|
table.add_column("Total Score", justify="center", style="bold green")
|
||||||
|
|
||||||
|
for link in scored_links[:5]:
|
||||||
|
text = link.get('text', 'No text')[:40]
|
||||||
|
table.add_row(
|
||||||
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
console.print(table)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
|
|
||||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
1. **Intrinsic Score**: Based on link quality indicators
|
||||||
- Position on page (navigation, content, footer)
|
- Position on page (navigation, content, footer)
|
||||||
- Link attributes (rel, title, class names)
|
- Link attributes (rel, title, class names)
|
||||||
- Anchor text quality and length
|
- Anchor text quality and length
|
||||||
- URL structure and depth
|
- URL structure and depth
|
||||||
|
|
||||||
2. **Contextual Score (0-1)**: Relevance to your query
|
2. **Contextual Score**: Relevance to your query using the BM25 algorithm
|
||||||
- Semantic similarity using embeddings
|
|
||||||
- Keyword matching in link text and title
|
- Keyword matching in link text and title
|
||||||
- Meta description analysis
|
- Meta description analysis
|
||||||
- Content preview scoring
|
- Content preview scoring
|
||||||
|
|
||||||
3. **Total Score**: Weighted combination for final ranking
|
3. **Total Score**: Combined score for final ranking
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||||
@@ -240,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
|||||||
# Basic discovery - find all product pages
|
# Basic discovery - find all product pages
|
||||||
seeder_config = SeedingConfig(
|
seeder_config = SeedingConfig(
|
||||||
# Discovery sources
|
# Discovery sources
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
source="cc+sitemap", # Common Crawl + Sitemap
|
||||||
|
|
||||||
# Filtering
|
# Filtering
|
||||||
pattern="*/product/*", # URL pattern matching
|
pattern="*/product/*", # URL pattern matching
|
||||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
|
||||||
|
|
||||||
# Validation
|
# Validation
|
||||||
live_check=True, # Verify URLs are alive
|
live_check=True, # Verify URLs are alive
|
||||||
max_urls=5000, # Stop at 5000 URLs
|
max_urls=50, # Stop at 50 URLs
|
||||||
|
|
||||||
# Performance
|
# Performance
|
||||||
concurrency=100, # Parallel requests
|
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
||||||
hits_per_sec=10 # Rate limiting
|
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
||||||
)
|
)
|
||||||
|
|
||||||
seeder = AsyncUrlSeeder(seeder_config)
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.discover("https://shop.example.com")
|
console.print("Discovering URLs from Python docs...")
|
||||||
|
urls = await seeder.urls("docs.python.org", seeder_config)
|
||||||
|
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
# Advanced: Relevance-based discovery
|
||||||
research_config = SeedingConfig(
|
research_config = SeedingConfig(
|
||||||
source="crawl+sitemap", # Deep crawl + sitemap
|
source="sitemap+cc", # Sitemap + Common Crawl
|
||||||
pattern="*/blog/*", # Blog posts only
|
pattern="*/blog/*", # Blog posts only
|
||||||
|
|
||||||
# Content relevance
|
# Content relevance
|
||||||
extract_head=True, # Get meta tags
|
extract_head=True, # Get meta tags
|
||||||
query="quantum computing tutorials",
|
query="quantum computing tutorials",
|
||||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
scoring_method="bm25", # BM25 scoring method
|
||||||
score_threshold=0.4, # High relevance only
|
score_threshold=0.4, # High relevance only
|
||||||
|
|
||||||
# Smart filtering
|
# Smart filtering
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||||
min_content_length=500, # Skip thin content
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
force=True # Bypass cache
|
||||||
)
|
)
|
||||||
|
|
||||||
# Discover with progress tracking
|
# Discover with progress tracking
|
||||||
discovered = []
|
discovered = []
|
||||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
async with AsyncUrlSeeder() as seeder:
|
||||||
discovered.extend(batch)
|
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
||||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
||||||
|
|
||||||
# Results include scores and metadata
|
# Results include scores and metadata
|
||||||
for url_data in discovered[:5]:
|
for url_data in discovered[:5]:
|
||||||
print(f"URL: {url_data['url']}")
|
print(f"URL: {url_data['url']}")
|
||||||
print(f"Score: {url_data['score']:.3f}")
|
print(f"Score: {url_data['relevance_score']:.3f}")
|
||||||
print(f"Title: {url_data['title']}")
|
print(f"Title: {url_data['head_data']['title']}")
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
@@ -309,35 +297,18 @@ This release includes significant performance improvements through optimized res
|
|||||||
### What We Optimized
|
### What We Optimized
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Before v0.7.0 (slow)
|
# Optimized crawling with v0.7.0 improvements
|
||||||
results = []
|
results = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
result = await crawler.arun(url)
|
result = await crawler.arun(
|
||||||
results.append(result)
|
url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
# After v0.7.0 (fast)
|
# Performance optimizations
|
||||||
# Automatic batching and connection pooling
|
wait_until="domcontentloaded", # Faster than networkidle
|
||||||
results = await crawler.arun_batch(
|
cache_mode=CacheMode.ENABLED # Enable caching
|
||||||
urls,
|
)
|
||||||
config=CrawlerRunConfig(
|
|
||||||
# New performance options
|
|
||||||
batch_size=10, # Process 10 URLs concurrently
|
|
||||||
reuse_browser=True, # Keep browser warm
|
|
||||||
eager_loading=False, # Load only what's needed
|
|
||||||
streaming_extraction=True, # Stream large extractions
|
|
||||||
|
|
||||||
# Optimized defaults
|
|
||||||
wait_until="domcontentloaded", # Faster than networkidle
|
|
||||||
exclude_external_resources=True, # Skip third-party assets
|
|
||||||
block_ads=True # Ad blocking built-in
|
|
||||||
)
|
)
|
||||||
)
|
results.append(result)
|
||||||
|
|
||||||
# Memory-efficient streaming for large crawls
|
|
||||||
async for result in crawler.arun_stream(large_url_list):
|
|
||||||
# Process results as they complete
|
|
||||||
await process_result(result)
|
|
||||||
# Memory is freed after each iteration
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Performance Gains:**
|
**Performance Gains:**
|
||||||
@@ -347,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
|
|||||||
- **Memory Usage**: 60% reduction with streaming processing
|
- **Memory Usage**: 60% reduction with streaming processing
|
||||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||||
|
|
||||||
## 📄 PDF Support
|
|
||||||
|
|
||||||
PDF extraction is now natively supported in Crawl4AI.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Extract data from PDF documents
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://example.com/report.pdf",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
pdf_extraction=True,
|
|
||||||
extraction_strategy=JsonCssExtractionStrategy({
|
|
||||||
# Works on converted PDF structure
|
|
||||||
"title": {"selector": "h1", "type": "text"},
|
|
||||||
"sections": {"selector": "h2", "type": "list"}
|
|
||||||
})
|
|
||||||
)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Important Changes
|
## 🔧 Important Changes
|
||||||
|
|
||||||
|
|||||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
|||||||
from crawl4ai.cache_context import CacheMode
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
async def crawl_dynamic_content():
|
async def crawl_dynamic_content():
|
||||||
async with AsyncWebCrawler() as crawler:
|
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||||
session_id = "github_commits_session"
|
session_id = "wait_for_session"
|
||||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
all_commits = []
|
||||||
all_commits = []
|
|
||||||
|
|
||||||
# Define extraction schema
|
js_next_page = """
|
||||||
schema = {
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
"name": "Commit Extractor",
|
if (commits.length > 0) {
|
||||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
window.lastCommit = commits[0].textContent.trim();
|
||||||
"fields": [{
|
}
|
||||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||||
}],
|
if (button) {button.click(); console.log('button clicked') }
|
||||||
}
|
"""
|
||||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
|
||||||
|
|
||||||
# JavaScript and wait configurations
|
wait_for = """() => {
|
||||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
if (commits.length === 0) return false;
|
||||||
|
const firstCommit = commits[0].textContent.trim();
|
||||||
# Crawl multiple pages
|
return firstCommit !== window.lastCommit;
|
||||||
|
}"""
|
||||||
|
|
||||||
|
schema = {
|
||||||
|
"name": "Commit Extractor",
|
||||||
|
"baseSelector": "li[data-testid='commit-row-item']",
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"name": "title",
|
||||||
|
"selector": "h4 a",
|
||||||
|
"type": "text",
|
||||||
|
"transform": "strip",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
verbose=True,
|
||||||
|
headless=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
for page in range(3):
|
for page in range(3):
|
||||||
config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
url=url,
|
|
||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
|
css_selector="li[data-testid='commit-row-item']",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
cache_mode=CacheMode.BYPASS
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
capture_console_messages=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(config=config)
|
result = await crawler.arun(url=url, config=crawler_config)
|
||||||
if result.success:
|
|
||||||
|
if result.console_messages:
|
||||||
|
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||||
|
|
||||||
|
if result.extracted_content:
|
||||||
|
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||||
commits = json.loads(result.extracted_content)
|
commits = json.loads(result.extracted_content)
|
||||||
all_commits.extend(commits)
|
all_commits.extend(commits)
|
||||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||||
|
else:
|
||||||
|
print(f"Page {page + 1}: No content extracted")
|
||||||
|
|
||||||
|
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||||
# Clean up session
|
# Clean up session
|
||||||
await crawler.crawler_strategy.kill_session(session_id)
|
await crawler.crawler_strategy.kill_session(session_id)
|
||||||
return all_commits
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
|||||||
wait_after_scroll=1.0 # Twitter needs time to load
|
wait_after_scroll=1.0 # Twitter needs time to load
|
||||||
)
|
)
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
virtual_scroll_config=virtual_config,
|
virtual_scroll_config=virtual_config
|
||||||
# Optional: Set headless=False to watch it work
|
|
||||||
# browser_config=BrowserConfig(headless=False)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://twitter.com/search?q=AI",
|
url="https://twitter.com/search?q=AI",
|
||||||
config=config
|
config=config
|
||||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
|||||||
Virtual Scroll works seamlessly with extraction strategies:
|
Virtual Scroll works seamlessly with extraction strategies:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||||
|
|
||||||
# Define extraction schema
|
# Define extraction schema
|
||||||
schema = {
|
schema = {
|
||||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
|||||||
scroll_count=20
|
scroll_count=20
|
||||||
),
|
),
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o-mini",
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||||
schema=schema
|
schema=schema
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
|||||||
|
|
||||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
- **Link Preview with Intelligent Scoring**: Link analysis and prioritization
|
||||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||||
- **PDF Parsing**: Extract data from PDF documents
|
|
||||||
- **Performance Optimizations**: Significant speed and memory improvements
|
- **Performance Optimizations**: Significant speed and memory improvements
|
||||||
|
|
||||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||||
@@ -30,44 +29,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
- Extraction confidence scores
|
- Extraction confidence scores
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
|
||||||
# Initialize with custom learning parameters
|
# Initialize with custom adaptive parameters
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||||
max_history=100, # Remember last 100 crawls per domain
|
max_depth=5, # Maximum crawl depth
|
||||||
learning_rate=0.2, # How quickly to adapt to changes
|
max_pages=20, # Maximum number of pages to crawl
|
||||||
patterns_per_page=3, # Patterns to learn per page type
|
top_k_links=3, # Number of top links to follow per page
|
||||||
extraction_strategy='css' # 'css' or 'xpath'
|
strategy="statistical", # 'statistical' or 'embedding'
|
||||||
|
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
||||||
|
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
||||||
|
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
||||||
)
|
)
|
||||||
|
|
||||||
adaptive_crawler = AdaptiveCrawler(config)
|
# Initialize adaptive crawler with web crawler
|
||||||
|
|
||||||
# First crawl - crawler learns the structure
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||||
"https://news.example.com/article/12345",
|
|
||||||
config=CrawlerRunConfig(
|
# Crawl and learn patterns
|
||||||
adaptive_config=config,
|
state = await adaptive_crawler.digest(
|
||||||
extraction_hints={ # Optional hints to speed up learning
|
start_url="https://news.example.com/article/12345",
|
||||||
"title": "article h1",
|
query="latest news articles and content"
|
||||||
"content": "article .body-content"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Crawler identifies and stores patterns
|
# Access results and confidence
|
||||||
if result.success:
|
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
||||||
state = adaptive_crawler.get_state("news.example.com")
|
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
||||||
print(f"Learned {len(state.patterns)} patterns")
|
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
||||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
|
||||||
|
|
||||||
# Subsequent crawls - uses learned patterns
|
|
||||||
result2 = await crawler.arun(
|
|
||||||
"https://news.example.com/article/67890",
|
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
|
||||||
)
|
|
||||||
# Automatically extracts using learned patterns!
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -92,9 +81,7 @@ twitter_config = VirtualScrollConfig(
|
|||||||
container_selector="[data-testid='primaryColumn']",
|
container_selector="[data-testid='primaryColumn']",
|
||||||
scroll_count=20, # Number of scrolls
|
scroll_count=20, # Number of scrolls
|
||||||
scroll_by="container_height", # Smart scrolling by container size
|
scroll_by="container_height", # Smart scrolling by container size
|
||||||
wait_after_scroll=1.0, # Let content load
|
wait_after_scroll=1.0 # Let content load
|
||||||
capture_method="incremental", # Capture new content on each scroll
|
|
||||||
deduplicate=True # Remove duplicate elements
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For e-commerce product grids (Instagram style)
|
# For e-commerce product grids (Instagram style)
|
||||||
@@ -102,8 +89,7 @@ grid_config = VirtualScrollConfig(
|
|||||||
container_selector="main .product-grid",
|
container_selector="main .product-grid",
|
||||||
scroll_count=30,
|
scroll_count=30,
|
||||||
scroll_by=800, # Fixed pixel scrolling
|
scroll_by=800, # Fixed pixel scrolling
|
||||||
wait_after_scroll=1.5, # Images need time
|
wait_after_scroll=1.5 # Images need time
|
||||||
stop_on_no_change=True # Smart stopping
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For news feeds with lazy loading
|
# For news feeds with lazy loading
|
||||||
@@ -111,9 +97,7 @@ news_config = VirtualScrollConfig(
|
|||||||
container_selector=".article-feed",
|
container_selector=".article-feed",
|
||||||
scroll_count=50,
|
scroll_count=50,
|
||||||
scroll_by="page_height", # Viewport-based scrolling
|
scroll_by="page_height", # Viewport-based scrolling
|
||||||
wait_after_scroll=0.5,
|
wait_after_scroll=0.5 # Wait for content to load
|
||||||
wait_for_selector=".article-card", # Wait for specific elements
|
|
||||||
timeout=30000 # Max 30 seconds total
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use it in your crawl
|
# Use it in your crawl
|
||||||
@@ -160,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
### The Three-Layer Scoring System
|
### The Three-Layer Scoring System
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
# Configure intelligent link analysis
|
||||||
link_config = LinkPreviewConfig(
|
link_config = LinkPreviewConfig(
|
||||||
# What to analyze
|
|
||||||
include_internal=True,
|
include_internal=True,
|
||||||
include_external=True,
|
include_external=False,
|
||||||
max_links=100, # Analyze top 100 links
|
max_links=10,
|
||||||
|
concurrency=5,
|
||||||
# Relevance scoring
|
query="python tutorial", # For contextual scoring
|
||||||
query="machine learning tutorials", # Your interest
|
score_threshold=0.3,
|
||||||
score_threshold=0.3, # Minimum relevance score
|
verbose=True
|
||||||
|
|
||||||
# Performance
|
|
||||||
concurrent_requests=10, # Parallel processing
|
|
||||||
timeout_per_link=5000, # 5s per link
|
|
||||||
|
|
||||||
# Advanced scoring weights
|
|
||||||
scoring_weights={
|
|
||||||
"intrinsic": 0.3, # Link quality indicators
|
|
||||||
"contextual": 0.5, # Relevance to query
|
|
||||||
"popularity": 0.2 # Link prominence
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use in your crawl
|
# Use in your crawl
|
||||||
@@ -190,35 +162,51 @@ result = await crawler.arun(
|
|||||||
"https://tech-blog.example.com",
|
"https://tech-blog.example.com",
|
||||||
config=CrawlerRunConfig(
|
config=CrawlerRunConfig(
|
||||||
link_preview_config=link_config,
|
link_preview_config=link_config,
|
||||||
score_links=True
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
if result.success and result.links:
|
||||||
print(f"Score: {link['total_score']:.3f}")
|
# Get scored links
|
||||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
internal_links = result.links.get("internal", [])
|
||||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
scored_links = [l for l in internal_links if l.get("total_score")]
|
||||||
print(f" URL: {link['href']}")
|
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
||||||
print(f" Title: {link['head_data']['title']}")
|
|
||||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
# Create a scoring table
|
||||||
|
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
||||||
|
table.add_column("Link Text", style="cyan", width=40)
|
||||||
|
table.add_column("Intrinsic Score", justify="center")
|
||||||
|
table.add_column("Contextual Score", justify="center")
|
||||||
|
table.add_column("Total Score", justify="center", style="bold green")
|
||||||
|
|
||||||
|
for link in scored_links[:5]:
|
||||||
|
text = link.get('text', 'No text')[:40]
|
||||||
|
table.add_row(
|
||||||
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
console.print(table)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
|
|
||||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
1. **Intrinsic Score**: Based on link quality indicators
|
||||||
- Position on page (navigation, content, footer)
|
- Position on page (navigation, content, footer)
|
||||||
- Link attributes (rel, title, class names)
|
- Link attributes (rel, title, class names)
|
||||||
- Anchor text quality and length
|
- Anchor text quality and length
|
||||||
- URL structure and depth
|
- URL structure and depth
|
||||||
|
|
||||||
2. **Contextual Score (0-1)**: Relevance to your query
|
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||||
- Semantic similarity using embeddings
|
|
||||||
- Keyword matching in link text and title
|
- Keyword matching in link text and title
|
||||||
- Meta description analysis
|
- Meta description analysis
|
||||||
- Content preview scoring
|
- Content preview scoring
|
||||||
|
|
||||||
3. **Total Score**: Weighted combination for final ranking
|
3. **Total Score**: Combined score for final ranking
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||||
@@ -240,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
|||||||
# Basic discovery - find all product pages
|
# Basic discovery - find all product pages
|
||||||
seeder_config = SeedingConfig(
|
seeder_config = SeedingConfig(
|
||||||
# Discovery sources
|
# Discovery sources
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
source="cc+sitemap", # Common Crawl + Sitemap
|
||||||
|
|
||||||
# Filtering
|
# Filtering
|
||||||
pattern="*/product/*", # URL pattern matching
|
pattern="*/product/*", # URL pattern matching
|
||||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
|
||||||
|
|
||||||
# Validation
|
# Validation
|
||||||
live_check=True, # Verify URLs are alive
|
live_check=True, # Verify URLs are alive
|
||||||
max_urls=5000, # Stop at 5000 URLs
|
max_urls=50, # Stop at 50 URLs
|
||||||
|
|
||||||
# Performance
|
# Performance
|
||||||
concurrency=100, # Parallel requests
|
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
||||||
hits_per_sec=10 # Rate limiting
|
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
||||||
)
|
)
|
||||||
|
|
||||||
seeder = AsyncUrlSeeder(seeder_config)
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.discover("https://shop.example.com")
|
console.print("Discovering URLs from Python docs...")
|
||||||
|
urls = await seeder.urls("docs.python.org", seeder_config)
|
||||||
|
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
# Advanced: Relevance-based discovery
|
||||||
research_config = SeedingConfig(
|
research_config = SeedingConfig(
|
||||||
source="crawl+sitemap", # Deep crawl + sitemap
|
source="sitemap+cc", # Sitemap + Common Crawl
|
||||||
pattern="*/blog/*", # Blog posts only
|
pattern="*/blog/*", # Blog posts only
|
||||||
|
|
||||||
# Content relevance
|
# Content relevance
|
||||||
extract_head=True, # Get meta tags
|
extract_head=True, # Get meta tags
|
||||||
query="quantum computing tutorials",
|
query="quantum computing tutorials",
|
||||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
scoring_method="bm25", # BM25 scoring method
|
||||||
score_threshold=0.4, # High relevance only
|
score_threshold=0.4, # High relevance only
|
||||||
|
|
||||||
# Smart filtering
|
# Smart filtering
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||||
min_content_length=500, # Skip thin content
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
force=True # Bypass cache
|
||||||
)
|
)
|
||||||
|
|
||||||
# Discover with progress tracking
|
# Discover relevant URLs in a single call
|
||||||
discovered = []
|
discovered = []
|
||||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
async with AsyncUrlSeeder() as seeder:
|
||||||
discovered.extend(batch)
|
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
||||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
||||||
|
|
||||||
# Results include scores and metadata
|
# Results include scores and metadata
|
||||||
for url_data in discovered[:5]:
|
for url_data in discovered[:5]:
|
||||||
print(f"URL: {url_data['url']}")
|
print(f"URL: {url_data['url']}")
|
||||||
print(f"Score: {url_data['score']:.3f}")
|
print(f"Score: {url_data['relevance_score']:.3f}")
|
||||||
print(f"Title: {url_data['title']}")
|
print(f"Title: {url_data['head_data']['title']}")
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
@@ -309,35 +297,18 @@ This release includes significant performance improvements through optimized res
|
|||||||
### What We Optimized
|
### What We Optimized
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Before v0.7.0 (slow)
|
# Optimized crawling with v0.7.0 improvements
|
||||||
results = []
|
results = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
result = await crawler.arun(url)
|
result = await crawler.arun(
|
||||||
results.append(result)
|
url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
# After v0.7.0 (fast)
|
# Performance optimizations
|
||||||
# Automatic batching and connection pooling
|
wait_until="domcontentloaded", # Faster than networkidle
|
||||||
results = await crawler.arun_batch(
|
cache_mode=CacheMode.ENABLED # Enable caching
|
||||||
urls,
|
)
|
||||||
config=CrawlerRunConfig(
|
|
||||||
# New performance options
|
|
||||||
batch_size=10, # Process 10 URLs concurrently
|
|
||||||
reuse_browser=True, # Keep browser warm
|
|
||||||
eager_loading=False, # Load only what's needed
|
|
||||||
streaming_extraction=True, # Stream large extractions
|
|
||||||
|
|
||||||
# Optimized defaults
|
|
||||||
wait_until="domcontentloaded", # Faster than networkidle
|
|
||||||
exclude_external_resources=True, # Skip third-party assets
|
|
||||||
block_ads=True # Ad blocking built-in
|
|
||||||
)
|
)
|
||||||
)
|
results.append(result)
|
||||||
|
|
||||||
# Memory-efficient streaming for large crawls
|
|
||||||
async for result in crawler.arun_stream(large_url_list):
|
|
||||||
# Process results as they complete
|
|
||||||
await process_result(result)
|
|
||||||
# Memory is freed after each iteration
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Performance Gains:**
|
**Performance Gains:**
|
||||||
@@ -347,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
|
|||||||
- **Memory Usage**: 60% reduction with streaming processing
|
- **Memory Usage**: 60% reduction with streaming processing
|
||||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||||
|
|
||||||
## 📄 PDF Support
|
|
||||||
|
|
||||||
PDF extraction is now natively supported in Crawl4AI.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Extract data from PDF documents
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://example.com/report.pdf",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
pdf_extraction=True,
|
|
||||||
extraction_strategy=JsonCssExtractionStrategy({
|
|
||||||
# Works on converted PDF structure
|
|
||||||
"title": {"selector": "h1", "type": "text"},
|
|
||||||
"sections": {"selector": "h2", "type": "list"}
|
|
||||||
})
|
|
||||||
)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Important Changes
|
## 🔧 Important Changes
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
|||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Create an adaptive crawler
|
# Create an adaptive crawler (config is optional)
|
||||||
adaptive = AdaptiveCrawler(crawler)
|
adaptive = AdaptiveCrawler(crawler)
|
||||||
|
|
||||||
# Start crawling with a query
|
# Start crawling with a query
|
||||||
@@ -59,13 +59,13 @@ async def main():
|
|||||||
from crawl4ai import AdaptiveConfig
|
from crawl4ai import AdaptiveConfig
|
||||||
|
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||||
top_k_links=3, # Links to follow per page (default: 5)
|
top_k_links=5, # Links to follow per page (default: 3)
|
||||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||||
)
|
)
|
||||||
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Crawling Strategies
|
## Crawling Strategies
|
||||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
|||||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||||
- **0.3-0.6**: Partial information, may answer basic queries
|
- **0.3-0.6**: Partial information, may answer basic queries
|
||||||
- **0.6-0.8**: Good coverage, can answer most queries
|
- **0.6-0.7**: Good coverage, can answer most queries
|
||||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||||
|
|
||||||
### Statistics Display
|
### Statistics Display
|
||||||
|
|
||||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
|||||||
- Avoid overly broad queries
|
- Avoid overly broad queries
|
||||||
|
|
||||||
### 2. Threshold Tuning
|
### 2. Threshold Tuning
|
||||||
- Start with default (0.8) for general use
|
- Start with default (0.7) for general use
|
||||||
- Lower to 0.6-0.7 for exploratory crawling
|
- Lower to 0.5-0.6 for exploratory crawling
|
||||||
- Raise to 0.9+ for exhaustive coverage
|
- Raise to 0.8+ for exhaustive coverage
|
||||||
|
|
||||||
### 3. Performance Optimization
|
### 3. Performance Optimization
|
||||||
- Use appropriate `max_pages` limits
|
- Use appropriate `max_pages` limits
|
||||||
|
|||||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
|||||||
word_count_threshold=300 # Only substantial articles
|
word_count_threshold=300 # Only substantial articles
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract URLs and stream results as they come
|
# Extract URLs and crawl them
|
||||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||||
|
|
||||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
# Use both sources
|
# Use both sources
|
||||||
config = SeedingConfig(source="cc+sitemap")
|
config = SeedingConfig(source="sitemap+cc")
|
||||||
urls = await seeder.urls("example.com", config)
|
urls = await seeder.urls("example.com", config)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
|||||||
|
|
||||||
| Parameter | Type | Default | Description |
|
| Parameter | Type | Default | Description |
|
||||||
|-----------|------|---------|-------------|
|
|-----------|------|---------|-------------|
|
||||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||||
| `live_check` | bool | False | Verify URLs are accessible |
|
| `live_check` | bool | False | Verify URLs are accessible |
|
||||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||||
| `verbose` | bool | False | Show detailed progress |
|
| `verbose` | bool | False | Show detailed progress |
|
||||||
| `query` | str | None | Search query for BM25 scoring |
|
| `query` | str | None | Search query for BM25 scoring |
|
||||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
|||||||
```python
|
```python
|
||||||
# Find specific products
|
# Find specific products
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap", # Use both sources
|
source="sitemap+cc", # Use both sources
|
||||||
extract_head=True,
|
extract_head=True,
|
||||||
query="wireless headphones noise canceling",
|
query="wireless headphones noise canceling",
|
||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
|||||||
|
|
||||||
# Step 1: Discover relevant URLs
|
# Step 1: Discover relevant URLs
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap", # Maximum coverage
|
source="sitemap+cc", # Maximum coverage
|
||||||
extract_head=True, # Get metadata
|
extract_head=True, # Get metadata
|
||||||
query=topic, # Research topic
|
query=topic, # Research topic
|
||||||
scoring_method="bm25", # Smart scoring
|
scoring_method="bm25", # Smart scoring
|
||||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
|||||||
# Extract URLs and crawl all articles
|
# Extract URLs and crawl all articles
|
||||||
article_urls = [article['url'] for article in top_articles]
|
article_urls = [article['url'] for article in top_articles]
|
||||||
results = []
|
results = []
|
||||||
async for result in await crawler.arun_many(article_urls, config=config):
|
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||||
|
async for result in crawl_results:
|
||||||
if result.success:
|
if result.success:
|
||||||
results.append({
|
results.append({
|
||||||
'url': result.url,
|
'url': result.url,
|
||||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
|||||||
# When crawling many URLs
|
# When crawling many URLs
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Assuming urls is a list of URL strings
|
# Assuming urls is a list of URL strings
|
||||||
results = await crawler.arun_many(urls, config=config)
|
crawl_results = await crawler.arun_many(urls, config=config)
|
||||||
|
|
||||||
# Process as they arrive
|
# Process as they arrive
|
||||||
async for result in results:
|
async for result in crawl_results:
|
||||||
process_immediately(result) # Don't wait for all
|
process_immediately(result) # Don't wait for all
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
|||||||
|
|
||||||
# E-commerce product discovery
|
# E-commerce product discovery
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap",
|
source="sitemap+cc",
|
||||||
pattern="*/product/*",
|
pattern="*/product/*",
|
||||||
extract_head=True,
|
extract_head=True,
|
||||||
live_check=True
|
live_check=True
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ nav:
|
|||||||
- "Command Line Interface": "core/cli.md"
|
- "Command Line Interface": "core/cli.md"
|
||||||
- "Simple Crawling": "core/simple-crawling.md"
|
- "Simple Crawling": "core/simple-crawling.md"
|
||||||
- "Deep Crawling": "core/deep-crawling.md"
|
- "Deep Crawling": "core/deep-crawling.md"
|
||||||
|
- "Adaptive Crawling": "core/adaptive-crawling.md"
|
||||||
|
- "URL Seeding": "core/url-seeding.md"
|
||||||
- "C4A-Script": "core/c4a-script.md"
|
- "C4A-Script": "core/c4a-script.md"
|
||||||
- "Crawler Result": "core/crawler-result.md"
|
- "Crawler Result": "core/crawler-result.md"
|
||||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||||
@@ -37,6 +39,7 @@ nav:
|
|||||||
- "Link & Media": "core/link-media.md"
|
- "Link & Media": "core/link-media.md"
|
||||||
- Advanced:
|
- Advanced:
|
||||||
- "Overview": "advanced/advanced-features.md"
|
- "Overview": "advanced/advanced-features.md"
|
||||||
|
- "Adaptive Strategies": "advanced/adaptive-strategies.md"
|
||||||
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
||||||
- "File Downloading": "advanced/file-downloading.md"
|
- "File Downloading": "advanced/file-downloading.md"
|
||||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||||
|
|||||||
Reference in New Issue
Block a user