docs: Update release notes and docs for v0.7.0 with the correct parameters and explanations
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel

- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements

## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
### The Three-Layer Scoring System

```python
from crawl4ai import LinkPreviewConfig
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode

# Configure intelligent link analysis
link_config = LinkPreviewConfig(
    # What to analyze
    include_internal=True,
    include_external=True,
    max_links=100,  # Analyze top 100 links

    # Relevance scoring
    query="machine learning tutorials",  # Your interest
    score_threshold=0.3,  # Minimum relevance score

    # Performance
    concurrent_requests=10,  # Parallel processing
    timeout_per_link=5000,  # 5s per link

    # Advanced scoring weights
    scoring_weights={
        "intrinsic": 0.3,   # Link quality indicators
        "contextual": 0.5,  # Relevance to query
        "popularity": 0.2   # Link prominence
    }
    include_external=False,
    max_links=10,
    concurrency=5,
    query="python tutorial",  # For contextual scoring
    score_threshold=0.3,
    verbose=True
)

# Use in your crawl
@@ -175,35 +162,51 @@ result = await crawler.arun(
    "https://tech-blog.example.com",
    config=CrawlerRunConfig(
        link_preview_config=link_config,
        score_links=True
        score_links=True,  # Enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )
)

# Access scored and sorted links
for link in result.links["internal"][:10]:  # Top 10 internal links
    print(f"Score: {link['total_score']:.3f}")
    print(f" Intrinsic: {link['intrinsic_score']:.1f}/10")  # Position, attributes
    print(f" Contextual: {link['contextual_score']:.1f}/1")  # Relevance to query
    print(f" URL: {link['href']}")
    print(f" Title: {link['head_data']['title']}")
    print(f" Description: {link['head_data']['meta']['description'][:100]}...")
if result.success and result.links:
    # Get scored links
    internal_links = result.links.get("internal", [])
    scored_links = [l for l in internal_links if l.get("total_score")]
    scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

    # Create a scoring table
    table = Table(title="Link Scoring Results", box=box.ROUNDED)
    table.add_column("Link Text", style="cyan", width=40)
    table.add_column("Intrinsic Score", justify="center")
    table.add_column("Contextual Score", justify="center")
    table.add_column("Total Score", justify="center", style="bold green")

    for link in scored_links[:5]:
        text = link.get('text', 'No text')[:40]
        table.add_row(
            text,
            f"{link.get('intrinsic_score', 0):.1f}/10",
            f"{link.get('contextual_score', 0):.2f}/1",
            f"{link.get('total_score', 0):.3f}"
        )

    console.print(table)
```

**Scoring Components:**

1. **Intrinsic Score (0-10)**: Based on link quality indicators
1. **Intrinsic Score**: Based on link quality indicators
   - Position on page (navigation, content, footer)
   - Link attributes (rel, title, class names)
   - Anchor text quality and length
   - URL structure and depth

2. **Contextual Score (0-1)**: Relevance to your query
   - Semantic similarity using embeddings
2. **Contextual Score**: Relevance to your query using the BM25 algorithm
   - Keyword matching in link text and title
   - Meta description analysis
   - Content preview scoring

3. **Total Score**: Weighted combination for final ranking
3. **Total Score**: Combined score for final ranking

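To make the combination concrete, here is a small, hypothetical re-ranking helper. It relies only on the per-link fields shown in the example above (`intrinsic_score`, `contextual_score`, `total_score`, `href`), and the blending weights are illustrative post-processing choices on the caller's side, not the library's internal formula.

```python
# Hypothetical re-ranking on top of link preview scores.
# The weights below are illustrative, not Crawl4AI defaults.
def rerank_links(links, intrinsic_weight=0.5, contextual_weight=0.5):
    """Blend the intrinsic (0-10) and contextual (0-1) scores into one ranking key."""
    scored = []
    for link in links:
        intrinsic = link.get("intrinsic_score") or 0.0    # 0-10 scale
        contextual = link.get("contextual_score") or 0.0  # 0-1 scale
        # Normalize intrinsic to 0-1 so both layers are comparable before blending.
        blended = intrinsic_weight * (intrinsic / 10) + contextual_weight * contextual
        scored.append((blended, link))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [link for _, link in scored]

# Example: favor query relevance over page placement.
top_links = rerank_links(result.links.get("internal", []), intrinsic_weight=0.3, contextual_weight=0.7)
for link in top_links[:5]:
    print(f"{link.get('total_score', 0):.3f}  {link['href']}")
```

A custom blend like this sits on top of the built-in `total_score`, for cases where a crawl should weight one layer more heavily.
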
**Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-scoring links
@@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages
seeder_config = SeedingConfig(
    # Discovery sources
    source="sitemap+cc",  # Sitemap + Common Crawl
    source="cc+sitemap",  # Sitemap + Common Crawl

    # Filtering
    pattern="*/product/*",  # URL pattern matching
    ignore_patterns=["*/reviews/*", "*/questions/*"],

    # Validation
    live_check=True,  # Verify URLs are alive
    max_urls=5000,  # Stop at 5000 URLs
    max_urls=50,  # Stop at 50 URLs

    # Performance
    concurrency=100,  # Parallel requests
    hits_per_sec=10  # Rate limiting
    concurrency=100,  # Maximum concurrent requests for live checks/head extraction
    hits_per_sec=10  # Rate limit in requests per second to avoid overwhelming servers
)

seeder = AsyncUrlSeeder(seeder_config)
urls = await seeder.discover("https://shop.example.com")
async with AsyncUrlSeeder() as seeder:
    console.print("Discovering URLs from Python docs...")
    urls = await seeder.urls("docs.python.org", seeder_config)
    console.print(f"\n✓ Discovered {len(urls)} URLs")

# Advanced: Relevance-based discovery
research_config = SeedingConfig(
    source="crawl+sitemap",  # Deep crawl + sitemap
    source="sitemap+cc",  # Sitemap + Common Crawl
    pattern="*/blog/*",  # Blog posts only

    # Content relevance
    extract_head=True,  # Get meta tags
    query="quantum computing tutorials",
    scoring_method="bm25",  # Or "semantic" (coming soon)
    scoring_method="bm25",  # BM25 scoring method
    score_threshold=0.4,  # High relevance only

    # Smart filtering
    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
    min_content_length=500,  # Skip thin content

    force=True  # Bypass cache
)

# Discover with progress tracking
discovered = []
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
    discovered.extend(batch)
    print(f"Found {len(discovered)} relevant URLs so far...")
async with AsyncUrlSeeder() as seeder:
    discovered = await seeder.urls("https://physics-blog.com", research_config)
    console.print(f"\n✓ Discovered {len(discovered)} URLs")

# Results include scores and metadata
for url_data in discovered[:5]:
    print(f"URL: {url_data['url']}")
    print(f"Score: {url_data['score']:.3f}")
    print(f"Title: {url_data['title']}")
    print(f"Score: {url_data['relevance_score']:.3f}")
    print(f"Title: {url_data['head_data']['title']}")
```
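
A natural follow-up, sketched below, is to feed the highest-scoring discovered URLs straight back into the crawler. The sketch assumes the `discovered` list from the example above and the `AsyncWebCrawler` usage shown earlier; the 0.5 cutoff and the top-20 cap are illustrative values, not library defaults.

```python
# Hypothetical bridge from URL seeding to crawling: keep only the most
# relevant discovered URLs, then crawl them one by one.
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

relevant = sorted(
    (u for u in discovered if u.get("relevance_score", 0) >= 0.5),  # illustrative cutoff
    key=lambda u: u["relevance_score"],
    reverse=True,
)[:20]  # illustrative cap

async with AsyncWebCrawler() as crawler:
    for url_data in relevant:
        result = await crawler.arun(url_data["url"], config=CrawlerRunConfig())
        print(f"{url_data['relevance_score']:.2f}  {url_data['url']}  ok={result.success}")
```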

**Discovery Methods:**
@@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res
### What We Optimized

```python
# Before v0.7.0 (slow)
# Optimized crawling with v0.7.0 improvements
results = []
for url in urls:
    result = await crawler.arun(url)
    results.append(result)

# After v0.7.0 (fast)
# Automatic batching and connection pooling
results = await crawler.arun_batch(
    urls,
    config=CrawlerRunConfig(
        # New performance options
        batch_size=10,              # Process 10 URLs concurrently
        reuse_browser=True,         # Keep browser warm
        eager_loading=False,        # Load only what's needed
        streaming_extraction=True,  # Stream large extractions

        # Optimized defaults
        wait_until="domcontentloaded",    # Faster than networkidle
        exclude_external_resources=True,  # Skip third-party assets
        block_ads=True                    # Ad blocking built-in
    result = await crawler.arun(
        url,
        config=CrawlerRunConfig(
            # Performance optimizations
            wait_until="domcontentloaded",  # Faster than networkidle
            cache_mode=CacheMode.ENABLED    # Enable caching
        )
    )
)

# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
    # Process results as they complete
    await process_result(result)
    # Memory is freed after each iteration
    results.append(result)
```

**Performance Gains:**
@@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
- **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests

## 📄 PDF Support

PDF extraction is now natively supported in Crawl4AI.

```python
# Extract data from PDF documents
result = await crawler.arun(
    "https://example.com/report.pdf",
    config=CrawlerRunConfig(
        pdf_extraction=True,
        extraction_strategy=JsonCssExtractionStrategy({
            # Works on converted PDF structure
            "title": {"selector": "h1", "type": "text"},
            "sections": {"selector": "h2", "type": "list"}
        })
    )
)
```

## 🔧 Important Changes