docs: Update release notes and docs for v0.7.0 with teh correct parameters and explanations

2025-07-15 11:32:04 +02:00
parent 205df1e330
commit 1d1970ae69
5 changed files with 146 additions and 210 deletions
--- a/docs/blog/release-v0.7.0.md
+++ b/docs/blog/release-v0.7.0.md
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel

 - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
 - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
+- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
 - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
 - **Performance Optimizations**: Significant speed and memory improvements

 ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
 ### The Three-Layer Scoring System

 ```python
-from crawl4ai import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode

 # Configure intelligent link analysis
 link_config = LinkPreviewConfig(
-    # What to analyze
    include_internal=True,
-    include_external=True,
-    max_links=100,              # Analyze top 100 links
-    
-    # Relevance scoring
-    query="machine learning tutorials",  # Your interest
-    score_threshold=0.3,        # Minimum relevance score
-    
-    # Performance
-    concurrent_requests=10,     # Parallel processing
-    timeout_per_link=5000,      # 5s per link
-    
-    # Advanced scoring weights
-    scoring_weights={
-        "intrinsic": 0.3,       # Link quality indicators
-        "contextual": 0.5,      # Relevance to query
-        "popularity": 0.2       # Link prominence
-    }
+    include_external=False,
+    max_links=10,
+    concurrency=5,
+    query="python tutorial",  # For contextual scoring
+    score_threshold=0.3,
+    verbose=True
 )

 # Use in your crawl
@@ -175,35 +162,51 @@ result = await crawler.arun(
    "https://tech-blog.example.com",
    config=CrawlerRunConfig(
        link_preview_config=link_config,
-        score_links=True
+        score_links=True,  # Enable intrinsic scoring
+        cache_mode=CacheMode.BYPASS
    )
 )

 # Access scored and sorted links
-for link in result.links["internal"][:10]:  # Top 10 internal links
-    print(f"Score: {link['total_score']:.3f}")
-    print(f"  Intrinsic: {link['intrinsic_score']:.1f}/10")  # Position, attributes
-    print(f"  Contextual: {link['contextual_score']:.1f}/1")  # Relevance to query
-    print(f"  URL: {link['href']}")
-    print(f"  Title: {link['head_data']['title']}")
-    print(f"  Description: {link['head_data']['meta']['description'][:100]}...")
+if result.success and result.links:
+# Get scored links
+internal_links = result.links.get("internal", [])
+scored_links = [l for l in internal_links if l.get("total_score")]
+scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
+
+# Create a scoring table
+table = Table(title="Link Scoring Results", box=box.ROUNDED)
+table.add_column("Link Text", style="cyan", width=40)
+table.add_column("Intrinsic Score", justify="center")
+table.add_column("Contextual Score", justify="center")
+table.add_column("Total Score", justify="center", style="bold green")
+
+for link in scored_links[:5]:
+    text = link.get('text', 'No text')[:40]
+    table.add_row(
+        text,
+        f"{link.get('intrinsic_score', 0):.1f}/10",
+        f"{link.get('contextual_score', 0):.2f}/1",
+        f"{link.get('total_score', 0):.3f}"
+    )
+
+console.print(table)
 ```

 **Scoring Components:**

-1. **Intrinsic Score (0-10)**: Based on link quality indicators
+1. **Intrinsic Score**: Based on link quality indicators
   - Position on page (navigation, content, footer)
   - Link attributes (rel, title, class names)
   - Anchor text quality and length
   - URL structure and depth

-2. **Contextual Score (0-1)**: Relevance to your query
-   - Semantic similarity using embeddings
+2. **Contextual Score**: Relevance to your query using BM25 algorithm
   - Keyword matching in link text and title
   - Meta description analysis
   - Content preview scoring

-3. **Total Score**: Weighted combination for final ranking
+3. **Total Score**: Combined score for final ranking

 **Expected Real-World Impact:**
 - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
 # Basic discovery - find all product pages
 seeder_config = SeedingConfig(
    # Discovery sources
-    source="sitemap+cc",        # Sitemap + Common Crawl
+    source="cc+sitemap",        # Sitemap + Common Crawl
    
    # Filtering
    pattern="*/product/*",      # URL pattern matching
-    ignore_patterns=["*/reviews/*", "*/questions/*"],
    
    # Validation
    live_check=True,           # Verify URLs are alive
-    max_urls=5000,             # Stop at 5000 URLs
+    max_urls=50,             # Stop at 50 URLs
    
    # Performance  
-    concurrency=100,           # Parallel requests
-    hits_per_sec=10           # Rate limiting
+    concurrency=100,           # Maximum concurrent requests for live checks/head extraction
+    hits_per_sec=10           # Rate limit in requests per second to avoid overwhelming servers
 )

-seeder = AsyncUrlSeeder(seeder_config)
-urls = await seeder.discover("https://shop.example.com")
+async with AsyncUrlSeeder() as seeder:
+    console.print("Discovering URLs from Python docs...")
+    urls = await seeder.urls("docs.python.org", seeding_config)
+    console.print(f"\n✓ Discovered {len(urls)} URLs")

 # Advanced: Relevance-based discovery
 research_config = SeedingConfig(
-    source="crawl+sitemap",    # Deep crawl + sitemap
+    source="sitemap+cc",       # Sitemap + Common Crawl
    pattern="*/blog/*",        # Blog posts only
    
    # Content relevance
    extract_head=True,         # Get meta tags
    query="quantum computing tutorials",
-    scoring_method="bm25",     # Or "semantic" (coming soon)
+    scoring_method="bm25",     # BM25 scoring method
    score_threshold=0.4,       # High relevance only
    
    # Smart filtering
    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
-    min_content_length=500,     # Skip thin content
    
    force=True                 # Bypass cache
 )

 # Discover with progress tracking
 discovered = []
-async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
-    discovered.extend(batch)
-    print(f"Found {len(discovered)} relevant URLs so far...")
+async with AsyncUrlSeeder() as seeder:
+    discovered = await seeder.urls("https://physics-blog.com", research_config)
+    console.print(f"\n✓ Discovered {len(discovered)} URLs")

 # Results include scores and metadata
 for url_data in discovered[:5]:
    print(f"URL: {url_data['url']}")
-    print(f"Score: {url_data['score']:.3f}")
-    print(f"Title: {url_data['title']}")
+    print(f"Score: {url_data['relevance_score']:.3f}")
+    print(f"Title: {url_data['head_data']['title']}")
 ```

 **Discovery Methods:**
@@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res
 ### What We Optimized

 ```python
-# Before v0.7.0 (slow)
+# Optimized crawling with v0.7.0 improvements
 results = []
 for url in urls:
-    result = await crawler.arun(url)
-    results.append(result)
-
-# After v0.7.0 (fast)
-# Automatic batching and connection pooling
-results = await crawler.arun_batch(
-    urls,
-    config=CrawlerRunConfig(
-        # New performance options
-        batch_size=10,              # Process 10 URLs concurrently
-        reuse_browser=True,         # Keep browser warm
-        eager_loading=False,        # Load only what's needed
-        streaming_extraction=True,  # Stream large extractions
-        
-        # Optimized defaults
-        wait_until="domcontentloaded",  # Faster than networkidle
-        exclude_external_resources=True, # Skip third-party assets
-        block_ads=True                  # Ad blocking built-in
+    result = await crawler.arun(
+        url,
+        config=CrawlerRunConfig(
+            # Performance optimizations
+            wait_until="domcontentloaded",  # Faster than networkidle
+            cache_mode=CacheMode.ENABLED    # Enable caching
+        )
    )
-)
-
-# Memory-efficient streaming for large crawls
-async for result in crawler.arun_stream(large_url_list):
-    # Process results as they complete
-    await process_result(result)
-    # Memory is freed after each iteration
+    results.append(result)
 ```

 **Performance Gains:**
@@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
 - **Memory Usage**: 60% reduction with streaming processing
 - **Concurrent Crawls**: Handle 5x more parallel requests

-## 📄 PDF Support
-
-PDF extraction is now natively supported in Crawl4AI.
-
-```python
-# Extract data from PDF documents
-result = await crawler.arun(
-    "https://example.com/report.pdf",
-    config=CrawlerRunConfig(
-        pdf_extraction=True,
-        extraction_strategy=JsonCssExtractionStrategy({
-            # Works on converted PDF structure
-            "title": {"selector": "h1", "type": "text"},
-            "sections": {"selector": "h2", "type": "list"}
-        })
-    )
-)
-```

 ## 🔧 Important Changes

--- a/docs/md_v2/advanced/virtual-scroll.md
+++ b/docs/md_v2/advanced/virtual-scroll.md
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
        wait_after_scroll=1.0  # Twitter needs time to load
    )
    
+    browser_config = BrowserConfig(headless=True)  # Set to False to watch it work
    config = CrawlerRunConfig(
-        virtual_scroll_config=virtual_config,
-        # Optional: Set headless=False to watch it work
-        # browser_config=BrowserConfig(headless=False)
+        virtual_scroll_config=virtual_config
    )
    
-    async with AsyncWebCrawler() as crawler:
+    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://twitter.com/search?q=AI",
            config=config
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
 Virtual Scroll works seamlessly with extraction strategies:

 ```python
-from crawl4ai import LLMExtractionStrategy
+from crawl4ai import LLMExtractionStrategy, LLMConfig

 # Define extraction schema
 schema = {
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
        scroll_count=20
    ),
    extraction_strategy=LLMExtractionStrategy(
-        provider="openai/gpt-4o-mini",
+        llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
        schema=schema
    )
 )
--- a/docs/md_v2/blog/releases/0.7.0.md
+++ b/docs/md_v2/blog/releases/0.7.0.md
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel

 - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
 - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
+- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
 - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
 - **Performance Optimizations**: Significant speed and memory improvements

 ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler:
 ### The Three-Layer Scoring System

 ```python
-from crawl4ai import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode

 # Configure intelligent link analysis
 link_config = LinkPreviewConfig(
-    # What to analyze
    include_internal=True,
-    include_external=True,
-    max_links=100,              # Analyze top 100 links
-    
-    # Relevance scoring
-    query="machine learning tutorials",  # Your interest
-    score_threshold=0.3,        # Minimum relevance score
-    
-    # Performance
-    concurrent_requests=10,     # Parallel processing
-    timeout_per_link=5000,      # 5s per link
-    
-    # Advanced scoring weights
-    scoring_weights={
-        "intrinsic": 0.3,       # Link quality indicators
-        "contextual": 0.5,      # Relevance to query
-        "popularity": 0.2       # Link prominence
-    }
+    include_external=False,
+    max_links=10,
+    concurrency=5,
+    query="python tutorial",  # For contextual scoring
+    score_threshold=0.3,
+    verbose=True
 )

 # Use in your crawl
@@ -175,35 +162,51 @@ result = await crawler.arun(
    "https://tech-blog.example.com",
    config=CrawlerRunConfig(
        link_preview_config=link_config,
-        score_links=True
+        score_links=True,  # Enable intrinsic scoring
+        cache_mode=CacheMode.BYPASS
    )
 )

 # Access scored and sorted links
-for link in result.links["internal"][:10]:  # Top 10 internal links
-    print(f"Score: {link['total_score']:.3f}")
-    print(f"  Intrinsic: {link['intrinsic_score']:.1f}/10")  # Position, attributes
-    print(f"  Contextual: {link['contextual_score']:.1f}/1")  # Relevance to query
-    print(f"  URL: {link['href']}")
-    print(f"  Title: {link['head_data']['title']}")
-    print(f"  Description: {link['head_data']['meta']['description'][:100]}...")
+if result.success and result.links:
+# Get scored links
+internal_links = result.links.get("internal", [])
+scored_links = [l for l in internal_links if l.get("total_score")]
+scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
+
+# Create a scoring table
+table = Table(title="Link Scoring Results", box=box.ROUNDED)
+table.add_column("Link Text", style="cyan", width=40)
+table.add_column("Intrinsic Score", justify="center")
+table.add_column("Contextual Score", justify="center")
+table.add_column("Total Score", justify="center", style="bold green")
+
+for link in scored_links[:5]:
+    text = link.get('text', 'No text')[:40]
+    table.add_row(
+        text,
+        f"{link.get('intrinsic_score', 0):.1f}/10",
+        f"{link.get('contextual_score', 0):.2f}/1",
+        f"{link.get('total_score', 0):.3f}"
+    )
+
+console.print(table)
 ```

 **Scoring Components:**

-1. **Intrinsic Score (0-10)**: Based on link quality indicators
+1. **Intrinsic Score**: Based on link quality indicators
   - Position on page (navigation, content, footer)
   - Link attributes (rel, title, class names)
   - Anchor text quality and length
   - URL structure and depth

-2. **Contextual Score (0-1)**: Relevance to your query
-   - Semantic similarity using embeddings
+2. **Contextual Score**: Relevance to your query using BM25 algorithm
   - Keyword matching in link text and title
   - Meta description analysis
   - Content preview scoring

-3. **Total Score**: Weighted combination for final ranking
+3. **Total Score**: Combined score for final ranking

 **Expected Real-World Impact:**
 - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
 # Basic discovery - find all product pages
 seeder_config = SeedingConfig(
    # Discovery sources
-    source="sitemap+cc",        # Sitemap + Common Crawl
+    source="cc+sitemap",        # Sitemap + Common Crawl
    
    # Filtering
    pattern="*/product/*",      # URL pattern matching
-    ignore_patterns=["*/reviews/*", "*/questions/*"],
    
    # Validation
    live_check=True,           # Verify URLs are alive
-    max_urls=5000,             # Stop at 5000 URLs
+    max_urls=50,             # Stop at 50 URLs
    
    # Performance  
-    concurrency=100,           # Parallel requests
-    hits_per_sec=10           # Rate limiting
+    concurrency=100,           # Maximum concurrent requests for live checks/head extraction
+    hits_per_sec=10           # Rate limit in requests per second to avoid overwhelming servers
 )

-seeder = AsyncUrlSeeder(seeder_config)
-urls = await seeder.discover("https://shop.example.com")
+async with AsyncUrlSeeder() as seeder:
+    console.print("Discovering URLs from Python docs...")
+    urls = await seeder.urls("docs.python.org", seeding_config)
+    console.print(f"\n✓ Discovered {len(urls)} URLs")

 # Advanced: Relevance-based discovery
 research_config = SeedingConfig(
-    source="crawl+sitemap",    # Deep crawl + sitemap
+    source="sitemap+cc",       # Sitemap + Common Crawl
    pattern="*/blog/*",        # Blog posts only
    
    # Content relevance
    extract_head=True,         # Get meta tags
    query="quantum computing tutorials",
-    scoring_method="bm25",     # Or "semantic" (coming soon)
+    scoring_method="bm25",     # BM25 scoring method
    score_threshold=0.4,       # High relevance only
    
    # Smart filtering
    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
-    min_content_length=500,     # Skip thin content
    
    force=True                 # Bypass cache
 )

 # Discover with progress tracking
 discovered = []
-async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
-    discovered.extend(batch)
-    print(f"Found {len(discovered)} relevant URLs so far...")
+async with AsyncUrlSeeder() as seeder:
+    discovered = await seeder.urls("https://physics-blog.com", research_config)
+    console.print(f"\n✓ Discovered {len(discovered)} URLs")

 # Results include scores and metadata
 for url_data in discovered[:5]:
    print(f"URL: {url_data['url']}")
-    print(f"Score: {url_data['score']:.3f}")
-    print(f"Title: {url_data['title']}")
+    print(f"Score: {url_data['relevance_score']:.3f}")
+    print(f"Title: {url_data['head_data']['title']}")
 ```

 **Discovery Methods:**
@@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res
 ### What We Optimized

 ```python
-# Before v0.7.0 (slow)
+# Optimized crawling with v0.7.0 improvements
 results = []
 for url in urls:
-    result = await crawler.arun(url)
-    results.append(result)
-
-# After v0.7.0 (fast)
-# Automatic batching and connection pooling
-results = await crawler.arun_batch(
-    urls,
-    config=CrawlerRunConfig(
-        # New performance options
-        batch_size=10,              # Process 10 URLs concurrently
-        reuse_browser=True,         # Keep browser warm
-        eager_loading=False,        # Load only what's needed
-        streaming_extraction=True,  # Stream large extractions
-        
-        # Optimized defaults
-        wait_until="domcontentloaded",  # Faster than networkidle
-        exclude_external_resources=True, # Skip third-party assets
-        block_ads=True                  # Ad blocking built-in
+    result = await crawler.arun(
+        url,
+        config=CrawlerRunConfig(
+            # Performance optimizations
+            wait_until="domcontentloaded",  # Faster than networkidle
+            cache_mode=CacheMode.ENABLED    # Enable caching
+        )
    )
-)
-
-# Memory-efficient streaming for large crawls
-async for result in crawler.arun_stream(large_url_list):
-    # Process results as they complete
-    await process_result(result)
-    # Memory is freed after each iteration
+    results.append(result)
 ```

 **Performance Gains:**
@@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list):
 - **Memory Usage**: 60% reduction with streaming processing
 - **Concurrent Crawls**: Handle 5x more parallel requests

-## 📄 PDF Support
-
-PDF extraction is now natively supported in Crawl4AI.
-
-```python
-# Extract data from PDF documents
-result = await crawler.arun(
-    "https://example.com/report.pdf",
-    config=CrawlerRunConfig(
-        pdf_extraction=True,
-        extraction_strategy=JsonCssExtractionStrategy({
-            # Works on converted PDF structure
-            "title": {"selector": "h1", "type": "text"},
-            "sections": {"selector": "h2", "type": "list"}
-        })
-    )
-)
-```

 ## 🔧 Important Changes

--- a/docs/md_v2/core/adaptive-crawling.md
+++ b/docs/md_v2/core/adaptive-crawling.md
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler

 async def main():
    async with AsyncWebCrawler() as crawler:
-        # Create an adaptive crawler
+        # Create an adaptive crawler (config is optional)
        adaptive = AdaptiveCrawler(crawler)
        
        # Start crawling with a query
@@ -59,13 +59,13 @@ async def main():
 from crawl4ai import AdaptiveConfig

 config = AdaptiveConfig(
-    confidence_threshold=0.7,    # Stop when 70% confident (default: 0.8)
-    max_pages=20,               # Maximum pages to crawl (default: 50)
-    top_k_links=3,              # Links to follow per page (default: 5)
+    confidence_threshold=0.8,    # Stop when 80% confident (default: 0.7)
+    max_pages=30,               # Maximum pages to crawl (default: 20)
+    top_k_links=5,              # Links to follow per page (default: 3)
    min_gain_threshold=0.05     # Minimum expected gain to continue (default: 0.1)
 )

-adaptive = AdaptiveCrawler(crawler, config=config)
+adaptive = AdaptiveCrawler(crawler, config)
 ```

 ## Crawling Strategies
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
 The confidence score (0-1) indicates how sufficient the gathered information is:
 - **0.0-0.3**: Insufficient information, needs more crawling
 - **0.3-0.6**: Partial information, may answer basic queries
- **0.6-0.8**: Good coverage, can answer most queries
- **0.8-1.0**: Excellent coverage, comprehensive information
+- **0.6-0.7**: Good coverage, can answer most queries
+- **0.7-1.0**: Excellent coverage, comprehensive information

 ### Statistics Display

@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
 - Avoid overly broad queries

 ### 2. Threshold Tuning
- Start with default (0.8) for general use
- Lower to 0.6-0.7 for exploratory crawling
- Raise to 0.9+ for exhaustive coverage
+- Start with default (0.7) for general use
+- Lower to 0.5-0.6 for exploratory crawling
+- Raise to 0.8+ for exhaustive coverage

 ### 3. Performance Optimization
 - Use appropriate `max_pages` limits
--- a/docs/md_v2/core/url-seeding.md
+++ b/docs/md_v2/core/url-seeding.md
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
            word_count_threshold=300  # Only substantial articles
        )
        
-        # Extract URLs and stream results as they come
+        # Extract URLs and crawl them
        tutorial_urls = [t["url"] for t in tutorials[:10]]
        results = await crawler.arun_many(tutorial_urls, config=config)
        
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I

 ```python
 # Use both sources
-config = SeedingConfig(source="cc+sitemap")
+config = SeedingConfig(source="sitemap+cc")
 urls = await seeder.urls("example.com", config)
 ```

@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf

 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
-| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
+| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
 | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
 | `extract_head` | bool | False | Extract metadata from page `<head>` |
 | `live_check` | bool | False | Verify URLs are accessible |
 | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
 | `concurrency` | int | 10 | Parallel workers for fetching |
-| `hits_per_sec` | int | None | Rate limit for requests |
+| `hits_per_sec` | int | 5 | Rate limit for requests |
 | `force` | bool | False | Bypass cache, fetch fresh data |
 | `verbose` | bool | False | Show detailed progress |
 | `query` | str | None | Search query for BM25 scoring |
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
 ```python
 # Find specific products
 config = SeedingConfig(
-    source="cc+sitemap",  # Use both sources
+    source="sitemap+cc",  # Use both sources
    extract_head=True,
    query="wireless headphones noise canceling",
    scoring_method="bm25",
@@ -782,7 +782,7 @@ class ResearchAssistant:
        
        # Step 1: Discover relevant URLs
        config = SeedingConfig(
-            source="cc+sitemap",     # Maximum coverage
+            source="sitemap+cc",     # Maximum coverage
            extract_head=True,       # Get metadata
            query=topic,             # Research topic
            scoring_method="bm25",   # Smart scoring
@@ -832,7 +832,8 @@ class ResearchAssistant:
            # Extract URLs and crawl all articles
            article_urls = [article['url'] for article in top_articles]
            results = []
-            async for result in await crawler.arun_many(article_urls, config=config):
+            crawl_results = await crawler.arun_many(article_urls, config=config)
+            async for result in crawl_results:
                if result.success:
                    results.append({
                        'url': result.url,
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
 # When crawling many URLs
 async with AsyncWebCrawler() as crawler:
    # Assuming urls is a list of URL strings
-    results = await crawler.arun_many(urls, config=config)
+    crawl_results = await crawler.arun_many(urls, config=config)
    
    # Process as they arrive
-    async for result in results:
+    async for result in crawl_results:
        process_immediately(result)  # Don't wait for all
 ```

@@ -1020,7 +1021,7 @@ config = SeedingConfig(

 # E-commerce product discovery
 config = SeedingConfig(
-    source="cc+sitemap",
+    source="sitemap+cc",
    pattern="*/product/*",
    extract_head=True,
    live_check=True