docs: Update release notes and docs for v0.7.0 with the correct parameters and explanations

This commit is contained in:
ntohidi
2025-07-15 11:32:04 +02:00
parent 205df1e330
commit 1d1970ae69
5 changed files with 146 additions and 210 deletions

View File

@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization - **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements - **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
### The Three-Layer Scoring System ### The Three-Layer Scoring System
```python ```python
from crawl4ai import LinkPreviewConfig from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
# Configure intelligent link analysis # Configure intelligent link analysis
link_config = LinkPreviewConfig( link_config = LinkPreviewConfig(
# What to analyze
include_internal=True, include_internal=True,
include_external=True, include_external=False,
max_links=100, # Analyze top 100 links max_links=10,
concurrency=5,
# Relevance scoring query="python tutorial", # For contextual scoring
query="machine learning tutorials", # Your interest score_threshold=0.3,
score_threshold=0.3, # Minimum relevance score verbose=True
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
) )
# Use in your crawl # Use in your crawl
@@ -175,35 +162,51 @@ result = await crawler.arun(
"https://tech-blog.example.com", "https://tech-blog.example.com",
config=CrawlerRunConfig( config=CrawlerRunConfig(
link_preview_config=link_config, link_preview_config=link_config,
score_links=True score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
) )
) )
# Access scored and sorted links # Access scored and sorted links
for link in result.links["internal"][:10]: # Top 10 internal links if result.success and result.links:
print(f"Score: {link['total_score']:.3f}") # Get scored links
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes internal_links = result.links.get("internal", [])
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query scored_links = [l for l in internal_links if l.get("total_score")]
print(f" URL: {link['href']}") scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
print(f" Title: {link['head_data']['title']}")
print(f" Description: {link['head_data']['meta']['description'][:100]}...") # Create a scoring table
table = Table(title="Link Scoring Results", box=box.ROUNDED)
table.add_column("Link Text", style="cyan", width=40)
table.add_column("Intrinsic Score", justify="center")
table.add_column("Contextual Score", justify="center")
table.add_column("Total Score", justify="center", style="bold green")
for link in scored_links[:5]:
text = link.get('text', 'No text')[:40]
table.add_row(
text,
f"{link.get('intrinsic_score', 0):.1f}/10",
f"{link.get('contextual_score', 0):.2f}/1",
f"{link.get('total_score', 0):.3f}"
)
console.print(table)
``` ```
**Scoring Components:** **Scoring Components:**
1. **Intrinsic Score (0-10)**: Based on link quality indicators 1. **Intrinsic Score**: Based on link quality indicators
- Position on page (navigation, content, footer) - Position on page (navigation, content, footer)
- Link attributes (rel, title, class names) - Link attributes (rel, title, class names)
- Anchor text quality and length - Anchor text quality and length
- URL structure and depth - URL structure and depth
2. **Contextual Score (0-1)**: Relevance to your query 2. **Contextual Score**: Relevance to your query using BM25 algorithm
- Semantic similarity using embeddings
- Keyword matching in link text and title - Keyword matching in link text and title
- Meta description analysis - Meta description analysis
- Content preview scoring - Content preview scoring
3. **Total Score**: Weighted combination for final ranking 3. **Total Score**: Combined score for final ranking
**Expected Real-World Impact:** **Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages # Basic discovery - find all product pages
seeder_config = SeedingConfig( seeder_config = SeedingConfig(
# Discovery sources # Discovery sources
source="sitemap+cc", # Sitemap + Common Crawl source="cc+sitemap", # Sitemap + Common Crawl
# Filtering # Filtering
pattern="*/product/*", # URL pattern matching pattern="*/product/*", # URL pattern matching
ignore_patterns=["*/reviews/*", "*/questions/*"],
# Validation # Validation
live_check=True, # Verify URLs are alive live_check=True, # Verify URLs are alive
max_urls=5000, # Stop at 5000 URLs max_urls=50, # Stop at 50 URLs
# Performance # Performance
concurrency=100, # Parallel requests concurrency=100, # Maximum concurrent requests for live checks/head extraction
hits_per_sec=10 # Rate limiting hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
) )
seeder = AsyncUrlSeeder(seeder_config) async with AsyncUrlSeeder() as seeder:
urls = await seeder.discover("https://shop.example.com") console.print("Discovering URLs from Python docs...")
urls = await seeder.urls("docs.python.org", seeder_config)
console.print(f"\n✓ Discovered {len(urls)} URLs")
# Advanced: Relevance-based discovery # Advanced: Relevance-based discovery
research_config = SeedingConfig( research_config = SeedingConfig(
source="crawl+sitemap", # Deep crawl + sitemap source="sitemap+cc", # Sitemap + Common Crawl
pattern="*/blog/*", # Blog posts only pattern="*/blog/*", # Blog posts only
# Content relevance # Content relevance
extract_head=True, # Get meta tags extract_head=True, # Get meta tags
query="quantum computing tutorials", query="quantum computing tutorials",
scoring_method="bm25", # Or "semantic" (coming soon) scoring_method="bm25", # BM25 scoring method
score_threshold=0.4, # High relevance only score_threshold=0.4, # High relevance only
# Smart filtering # Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc. filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache force=True # Bypass cache
) )
# Discover with progress tracking # Discover with progress tracking
discovered = [] discovered = []
async for batch in seeder.discover_iter("https://physics-blog.com", research_config): async with AsyncUrlSeeder() as seeder:
discovered.extend(batch) discovered = await seeder.urls("https://physics-blog.com", research_config)
print(f"Found {len(discovered)} relevant URLs so far...") console.print(f"\n✓ Discovered {len(discovered)} URLs")
# Results include scores and metadata # Results include scores and metadata
for url_data in discovered[:5]: for url_data in discovered[:5]:
print(f"URL: {url_data['url']}") print(f"URL: {url_data['url']}")
print(f"Score: {url_data['score']:.3f}") print(f"Score: {url_data['relevance_score']:.3f}")
print(f"Title: {url_data['title']}") print(f"Title: {url_data['head_data']['title']}")
``` ```
**Discovery Methods:** **Discovery Methods:**
@@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res
### What We Optimized ### What We Optimized
```python ```python
# Before v0.7.0 (slow) # Optimized crawling with v0.7.0 improvements
results = [] results = []
for url in urls: for url in urls:
result = await crawler.arun(url) result = await crawler.arun(
results.append(result) url,
config=CrawlerRunConfig(
# After v0.7.0 (fast) # Performance optimizations
# Automatic batching and connection pooling wait_until="domcontentloaded", # Faster than networkidle
results = await crawler.arun_batch( cache_mode=CacheMode.ENABLED # Enable caching
urls, )
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
) )
) results.append(result)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
``` ```
**Performance Gains:** **Performance Gains:**
@@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
- **Memory Usage**: 60% reduction with streaming processing - **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests - **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes ## 🔧 Important Changes

View File

@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
wait_after_scroll=1.0 # Twitter needs time to load wait_after_scroll=1.0 # Twitter needs time to load
) )
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
config = CrawlerRunConfig( config = CrawlerRunConfig(
virtual_scroll_config=virtual_config, virtual_scroll_config=virtual_config
# Optional: Set headless=False to watch it work
# browser_config=BrowserConfig(headless=False)
) )
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun( result = await crawler.arun(
url="https://twitter.com/search?q=AI", url="https://twitter.com/search?q=AI",
config=config config=config
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
Virtual Scroll works seamlessly with extraction strategies: Virtual Scroll works seamlessly with extraction strategies:
```python ```python
from crawl4ai import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy, LLMConfig
# Define extraction schema # Define extraction schema
schema = { schema = {
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
scroll_count=20 scroll_count=20
), ),
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o-mini", llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
schema=schema schema=schema
) )
) )

View File

@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization - **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements - **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
### The Three-Layer Scoring System ### The Three-Layer Scoring System
```python ```python
from crawl4ai import LinkPreviewConfig from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
# Configure intelligent link analysis # Configure intelligent link analysis
link_config = LinkPreviewConfig( link_config = LinkPreviewConfig(
# What to analyze
include_internal=True, include_internal=True,
include_external=True, include_external=False,
max_links=100, # Analyze top 100 links max_links=10,
concurrency=5,
# Relevance scoring query="python tutorial", # For contextual scoring
query="machine learning tutorials", # Your interest score_threshold=0.3,
score_threshold=0.3, # Minimum relevance score verbose=True
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
) )
# Use in your crawl # Use in your crawl
@@ -175,35 +162,51 @@ result = await crawler.arun(
"https://tech-blog.example.com", "https://tech-blog.example.com",
config=CrawlerRunConfig( config=CrawlerRunConfig(
link_preview_config=link_config, link_preview_config=link_config,
score_links=True score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
) )
) )
# Access scored and sorted links # Access scored and sorted links
for link in result.links["internal"][:10]: # Top 10 internal links if result.success and result.links:
print(f"Score: {link['total_score']:.3f}") # Get scored links
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes internal_links = result.links.get("internal", [])
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query scored_links = [l for l in internal_links if l.get("total_score")]
print(f" URL: {link['href']}") scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
print(f" Title: {link['head_data']['title']}")
print(f" Description: {link['head_data']['meta']['description'][:100]}...") # Create a scoring table
table = Table(title="Link Scoring Results", box=box.ROUNDED)
table.add_column("Link Text", style="cyan", width=40)
table.add_column("Intrinsic Score", justify="center")
table.add_column("Contextual Score", justify="center")
table.add_column("Total Score", justify="center", style="bold green")
for link in scored_links[:5]:
text = link.get('text', 'No text')[:40]
table.add_row(
text,
f"{link.get('intrinsic_score', 0):.1f}/10",
f"{link.get('contextual_score', 0):.2f}/1",
f"{link.get('total_score', 0):.3f}"
)
console.print(table)
``` ```
**Scoring Components:** **Scoring Components:**
1. **Intrinsic Score (0-10)**: Based on link quality indicators 1. **Intrinsic Score**: Based on link quality indicators
- Position on page (navigation, content, footer) - Position on page (navigation, content, footer)
- Link attributes (rel, title, class names) - Link attributes (rel, title, class names)
- Anchor text quality and length - Anchor text quality and length
- URL structure and depth - URL structure and depth
2. **Contextual Score (0-1)**: Relevance to your query 2. **Contextual Score**: Relevance to your query using BM25 algorithm
- Semantic similarity using embeddings
- Keyword matching in link text and title - Keyword matching in link text and title
- Meta description analysis - Meta description analysis
- Content preview scoring - Content preview scoring
3. **Total Score**: Weighted combination for final ranking 3. **Total Score**: Combined score for final ranking
**Expected Real-World Impact:** **Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages # Basic discovery - find all product pages
seeder_config = SeedingConfig( seeder_config = SeedingConfig(
# Discovery sources # Discovery sources
source="sitemap+cc", # Sitemap + Common Crawl source="cc+sitemap", # Sitemap + Common Crawl
# Filtering # Filtering
pattern="*/product/*", # URL pattern matching pattern="*/product/*", # URL pattern matching
ignore_patterns=["*/reviews/*", "*/questions/*"],
# Validation # Validation
live_check=True, # Verify URLs are alive live_check=True, # Verify URLs are alive
max_urls=5000, # Stop at 5000 URLs max_urls=50, # Stop at 50 URLs
# Performance # Performance
concurrency=100, # Parallel requests concurrency=100, # Maximum concurrent requests for live checks/head extraction
hits_per_sec=10 # Rate limiting hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
) )
seeder = AsyncUrlSeeder(seeder_config) async with AsyncUrlSeeder() as seeder:
urls = await seeder.discover("https://shop.example.com") console.print("Discovering URLs from Python docs...")
urls = await seeder.urls("docs.python.org", seeder_config)
console.print(f"\n✓ Discovered {len(urls)} URLs")
# Advanced: Relevance-based discovery # Advanced: Relevance-based discovery
research_config = SeedingConfig( research_config = SeedingConfig(
source="crawl+sitemap", # Deep crawl + sitemap source="sitemap+cc", # Sitemap + Common Crawl
pattern="*/blog/*", # Blog posts only pattern="*/blog/*", # Blog posts only
# Content relevance # Content relevance
extract_head=True, # Get meta tags extract_head=True, # Get meta tags
query="quantum computing tutorials", query="quantum computing tutorials",
scoring_method="bm25", # Or "semantic" (coming soon) scoring_method="bm25", # BM25 scoring method
score_threshold=0.4, # High relevance only score_threshold=0.4, # High relevance only
# Smart filtering # Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc. filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache force=True # Bypass cache
) )
# Discover with progress tracking # Discover with progress tracking
discovered = [] discovered = []
async for batch in seeder.discover_iter("https://physics-blog.com", research_config): async with AsyncUrlSeeder() as seeder:
discovered.extend(batch) discovered = await seeder.urls("https://physics-blog.com", research_config)
print(f"Found {len(discovered)} relevant URLs so far...") console.print(f"\n✓ Discovered {len(discovered)} URLs")
# Results include scores and metadata # Results include scores and metadata
for url_data in discovered[:5]: for url_data in discovered[:5]:
print(f"URL: {url_data['url']}") print(f"URL: {url_data['url']}")
print(f"Score: {url_data['score']:.3f}") print(f"Score: {url_data['relevance_score']:.3f}")
print(f"Title: {url_data['title']}") print(f"Title: {url_data['head_data']['title']}")
``` ```
**Discovery Methods:** **Discovery Methods:**
@@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res
### What We Optimized ### What We Optimized
```python ```python
# Before v0.7.0 (slow) # Optimized crawling with v0.7.0 improvements
results = [] results = []
for url in urls: for url in urls:
result = await crawler.arun(url) result = await crawler.arun(
results.append(result) url,
config=CrawlerRunConfig(
# After v0.7.0 (fast) # Performance optimizations
# Automatic batching and connection pooling wait_until="domcontentloaded", # Faster than networkidle
results = await crawler.arun_batch( cache_mode=CacheMode.ENABLED # Enable caching
urls, )
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
) )
) results.append(result)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
``` ```
**Performance Gains:** **Performance Gains:**
@@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
- **Memory Usage**: 60% reduction with streaming processing - **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests - **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes ## 🔧 Important Changes

View File

@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
async def main(): async def main():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
# Create an adaptive crawler # Create an adaptive crawler (config is optional)
adaptive = AdaptiveCrawler(crawler) adaptive = AdaptiveCrawler(crawler)
# Start crawling with a query # Start crawling with a query
@@ -59,13 +59,13 @@ async def main():
from crawl4ai import AdaptiveConfig from crawl4ai import AdaptiveConfig
config = AdaptiveConfig( config = AdaptiveConfig(
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8) confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
max_pages=20, # Maximum pages to crawl (default: 50) max_pages=30, # Maximum pages to crawl (default: 20)
top_k_links=3, # Links to follow per page (default: 5) top_k_links=5, # Links to follow per page (default: 3)
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1) min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
) )
adaptive = AdaptiveCrawler(crawler, config=config) adaptive = AdaptiveCrawler(crawler, config)
``` ```
## Crawling Strategies ## Crawling Strategies
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
The confidence score (0-1) indicates how sufficient the gathered information is: The confidence score (0-1) indicates how sufficient the gathered information is:
- **0.0-0.3**: Insufficient information, needs more crawling - **0.0-0.3**: Insufficient information, needs more crawling
- **0.3-0.6**: Partial information, may answer basic queries - **0.3-0.6**: Partial information, may answer basic queries
- **0.6-0.8**: Good coverage, can answer most queries - **0.6-0.7**: Good coverage, can answer most queries
- **0.8-1.0**: Excellent coverage, comprehensive information - **0.7-1.0**: Excellent coverage, comprehensive information
### Statistics Display ### Statistics Display
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
- Avoid overly broad queries - Avoid overly broad queries
### 2. Threshold Tuning ### 2. Threshold Tuning
- Start with default (0.8) for general use - Start with default (0.7) for general use
- Lower to 0.6-0.7 for exploratory crawling - Lower to 0.5-0.6 for exploratory crawling
- Raise to 0.9+ for exhaustive coverage - Raise to 0.8+ for exhaustive coverage
### 3. Performance Optimization ### 3. Performance Optimization
- Use appropriate `max_pages` limits - Use appropriate `max_pages` limits

View File

@@ -137,7 +137,7 @@ async def smart_blog_crawler():
word_count_threshold=300 # Only substantial articles word_count_threshold=300 # Only substantial articles
) )
# Extract URLs and stream results as they come # Extract URLs and crawl them
tutorial_urls = [t["url"] for t in tutorials[:10]] tutorial_urls = [t["url"] for t in tutorials[:10]]
results = await crawler.arun_many(tutorial_urls, config=config) results = await crawler.arun_many(tutorial_urls, config=config)
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
```python ```python
# Use both sources # Use both sources
config = SeedingConfig(source="cc+sitemap") config = SeedingConfig(source="sitemap+cc")
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
``` ```
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
| Parameter | Type | Default | Description | | Parameter | Type | Default | Description |
|-----------|------|---------|-------------| |-----------|------|---------|-------------|
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" | | `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
| `extract_head` | bool | False | Extract metadata from page `<head>` | | `extract_head` | bool | False | Extract metadata from page `<head>` |
| `live_check` | bool | False | Verify URLs are accessible | | `live_check` | bool | False | Verify URLs are accessible |
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
| `concurrency` | int | 10 | Parallel workers for fetching | | `concurrency` | int | 10 | Parallel workers for fetching |
| `hits_per_sec` | int | None | Rate limit for requests | | `hits_per_sec` | int | 5 | Rate limit for requests |
| `force` | bool | False | Bypass cache, fetch fresh data | | `force` | bool | False | Bypass cache, fetch fresh data |
| `verbose` | bool | False | Show detailed progress | | `verbose` | bool | False | Show detailed progress |
| `query` | str | None | Search query for BM25 scoring | | `query` | str | None | Search query for BM25 scoring |
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
```python ```python
# Find specific products # Find specific products
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", # Use both sources source="sitemap+cc", # Use both sources
extract_head=True, extract_head=True,
query="wireless headphones noise canceling", query="wireless headphones noise canceling",
scoring_method="bm25", scoring_method="bm25",
@@ -782,7 +782,7 @@ class ResearchAssistant:
# Step 1: Discover relevant URLs # Step 1: Discover relevant URLs
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", # Maximum coverage source="sitemap+cc", # Maximum coverage
extract_head=True, # Get metadata extract_head=True, # Get metadata
query=topic, # Research topic query=topic, # Research topic
scoring_method="bm25", # Smart scoring scoring_method="bm25", # Smart scoring
@@ -832,7 +832,8 @@ class ResearchAssistant:
# Extract URLs and crawl all articles # Extract URLs and crawl all articles
article_urls = [article['url'] for article in top_articles] article_urls = [article['url'] for article in top_articles]
results = [] results = []
async for result in await crawler.arun_many(article_urls, config=config): crawl_results = await crawler.arun_many(article_urls, config=config)
async for result in crawl_results:
if result.success: if result.success:
results.append({ results.append({
'url': result.url, 'url': result.url,
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
# When crawling many URLs # When crawling many URLs
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
# Assuming urls is a list of URL strings # Assuming urls is a list of URL strings
results = await crawler.arun_many(urls, config=config) crawl_results = await crawler.arun_many(urls, config=config)
# Process as they arrive # Process as they arrive
async for result in results: async for result in crawl_results:
process_immediately(result) # Don't wait for all process_immediately(result) # Don't wait for all
``` ```
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
# E-commerce product discovery # E-commerce product discovery
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", source="sitemap+cc",
pattern="*/product/*", pattern="*/product/*",
extract_head=True, extract_head=True,
live_check=True live_check=True