Add examples for deep crawl crash recovery and prefetch mode in documentation

2026-01-14 12:58:44 +01:00
parent 530cde351f
commit 315eae9e6f
3 changed files with 828 additions and 6 deletions
--- a/docs/examples/deep_crawl_crash_recovery.py
+++ b/docs/examples/deep_crawl_crash_recovery.py
@@ -0,0 +1,297 @@
 #!/usr/bin/env python3
 """
 Deep Crawl Crash Recovery Example
 This example demonstrates how to implement crash recovery for long-running
 deep crawls. The feature is useful for:
 - Cloud deployments with spot/preemptible instances
 - Long-running crawls that may be interrupted
 - Distributed crawling with state coordination
 Key concepts:
 - `on_state_change`: Callback fired after each URL is processed
 - `resume_state`: Pass saved state to continue from a checkpoint
 - `export_state()`: Get the last captured state manually
 Works with all strategies: BFSDeepCrawlStrategy, DFSDeepCrawlStrategy,
 BestFirstCrawlingStrategy
 """
 import asyncio
 import json
 import os
 from pathlib import Path
 from typing import Dict, Any, List
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 # File to store crawl state (in production, use Redis/database).
 # The path is relative, so it resolves against the current working directory
 # of the process running the examples.
 STATE_FILE = Path("crawl_state.json")
 async def save_state_to_file(state: Dict[str, Any]) -> None:
    """
    Callback to save state after each URL is processed.
    The snapshot is written to a temporary sibling file and then moved over
    STATE_FILE, so an interruption in the middle of a write cannot leave a
    truncated, unparseable checkpoint behind.
    In production, you might save to:
    - Redis: await redis.set("crawl_state", json.dumps(state))
    - Database: await db.execute("UPDATE crawls SET state = ?", json.dumps(state))
    - S3: await s3.put_object(Bucket="crawls", Key="state.json", Body=json.dumps(state))
    Args:
        state: Crawl state dictionary. It must be JSON-serializable and contain
            the "pages_crawled" and "pending" keys used by the progress line.
    """
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    with open(tmp_file, "w") as f:
        json.dump(state, f, indent=2)
    # os.replace swaps the file in a single step: a reader sees either the
    # previous checkpoint or the new one, never a half-written file.
    os.replace(tmp_file, STATE_FILE)
    print(f"  [State saved] Pages: {state['pages_crawled']}, Pending: {len(state['pending'])}")
 def load_state_from_file() -> Dict[str, Any] | None:
    """
    Load previously saved state, if it exists.
    Returns:
        The parsed JSON content of STATE_FILE, or None when the file is missing.
    """
    # Open directly instead of exists() + open(): a file removed between the
    # check and the read would otherwise raise instead of returning None.
    try:
        with open(STATE_FILE, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
 async def example_basic_state_persistence():
    """
    Example 1: Basic state persistence with file storage.
    Runs a small BFS deep crawl whose on_state_change callback writes the
    state to STATE_FILE, then reads the file back and prints a summary.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Example 1: Basic State Persistence")
    print(banner)
    # Start from a clean slate so the summary reflects this run only
    if STATE_FILE.exists():
        STATE_FILE.unlink()
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            on_state_change=save_state_to_file,  # Persist after each URL
        ),
        verbose=False,
    )
    print("\nStarting crawl with state persistence...")
    async with AsyncWebCrawler(verbose=False) as crawler:
        pages = await crawler.arun("https://books.toscrape.com", config=run_config)
    # Report what ended up on disk (nothing is printed if no file was written)
    final_state = load_state_from_file()
    if final_state is not None:
        summary_lines = (
            f"\nFinal state saved to {STATE_FILE}:",
            f"  - Strategy: {final_state['strategy_type']}",
            f"  - Pages crawled: {final_state['pages_crawled']}",
            f"  - URLs visited: {len(final_state['visited'])}",
            f"  - URLs pending: {len(final_state['pending'])}",
        )
        for line in summary_lines:
            print(line)
    print(f"\nCrawled {len(pages)} pages total")
 async def example_crash_and_resume():
    """
    Example 2: Simulate a crash and resume from checkpoint.
    This demonstrates the full crash recovery workflow:
    1. Start crawling with state persistence
    2. "Crash" after N pages
    3. Resume from saved state
    4. Verify no duplicate work
    """
    print("\n" + "=" * 60)
    print("Example 2: Crash and Resume")
    print("=" * 60)
    # Clean up any previous state so phase 2 cannot pick up a stale checkpoint
    if STATE_FILE.exists():
        STATE_FILE.unlink()
    crash_after = 3
    crawled_urls_phase1: List[str] = []
    async def save_and_maybe_crash(state: Dict[str, Any]) -> None:
        """Save state, then simulate crash after N pages."""
        # Always save state first: the checkpoint must be on disk before the "crash"
        await save_state_to_file(state)
        # Mirror the latest visited list (contents replaced in place)
        crawled_urls_phase1[:] = state["visited"]
        # Simulate crash after reaching threshold
        if state["pages_crawled"] >= crash_after:
            raise Exception("Simulated crash! (This is intentional)")
    # Phase 1: Start crawl that will "crash"
    print(f"\n--- Phase 1: Crawl until 'crash' after {crash_after} pages ---")
    strategy1 = BFSDeepCrawlStrategy(
        max_depth=2,
        max_pages=10,
        on_state_change=save_and_maybe_crash,
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=strategy1,
        verbose=False,
    )
    try:
        async with AsyncWebCrawler(verbose=False) as crawler:
            await crawler.arun("https://books.toscrape.com", config=config)
    except Exception as e:
        # Broad catch is deliberate here: any failure stands in for the crash
        print(f"\n  Crash occurred: {e}")
        print(f"  URLs crawled before crash: {len(crawled_urls_phase1)}")
    # Phase 2: Resume from checkpoint
    print("\n--- Phase 2: Resume from checkpoint ---")
    saved_state = load_state_from_file()
    if not saved_state:
        print("  ERROR: No saved state found!")
        return
    print(f"  Loaded state: {saved_state['pages_crawled']} pages, {len(saved_state['pending'])} pending")
    # Snapshot the checkpointed URLs once, before saved_state is handed to the
    # strategy, so the comparison below does not depend on what happens to
    # that dictionary afterwards and is not rebuilt on every callback.
    already_crawled = set(saved_state["visited"])
    crawled_urls_phase2 = set()
    async def track_resumed_crawl(state: Dict[str, Any]) -> None:
        """Track new URLs crawled in phase 2."""
        await save_state_to_file(state)
        crawled_urls_phase2.update(set(state["visited"]) - already_crawled)
    strategy2 = BFSDeepCrawlStrategy(
        max_depth=2,
        max_pages=10,
        resume_state=saved_state,  # Resume from checkpoint!
        on_state_change=track_resumed_crawl,
    )
    config2 = CrawlerRunConfig(
        deep_crawl_strategy=strategy2,
        verbose=False,
    )
    async with AsyncWebCrawler(verbose=False) as crawler:
        results = await crawler.arun("https://books.toscrape.com", config=config2)
    # Verify no duplicates
    # NOTE(review): crawled_urls_phase2 only ever receives URLs that are NOT in
    # already_crawled, so this intersection is empty by construction. A stronger
    # check would compare against the URLs actually fetched in phase 2 - confirm
    # what the resumed crawl returns before tightening this.
    duplicates = crawled_urls_phase2 & already_crawled
    print(f"\n--- Results ---")
    print(f"  Phase 1 URLs: {len(crawled_urls_phase1)}")
    print(f"  Phase 2 new URLs: {len(crawled_urls_phase2)}")
    print(f"  Duplicate crawls: {len(duplicates)} (should be 0)")
    print(f"  Total results: {len(results)}")
    if not duplicates:
        print("\n  SUCCESS: No duplicate work after resume!")
    else:
        print(f"\n  WARNING: Found duplicates: {duplicates}")
 async def example_export_state():
    """
    Example 3: Manual state export using export_state().
    If you don't need real-time persistence, you can export
    the state manually after the crawl completes.
    State is only captured when on_state_change is set, so this example
    registers a callback that does nothing and reads the result back with
    strategy.export_state() once the crawl has finished.
    """
    print("\n" + "=" * 60)
    print("Example 3: Manual State Export")
    print("=" * 60)
    async def ignore_state(state: Dict[str, Any]) -> None:
        """Do nothing; registered only so that the strategy captures state."""
    strategy = BFSDeepCrawlStrategy(
        max_depth=1,
        max_pages=3,
        on_state_change=ignore_state,  # Required for state capture
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=strategy,
        verbose=False,
    )
    print("\nCrawling with a no-op callback...")
    async with AsyncWebCrawler(verbose=False) as crawler:
        results = await crawler.arun("https://books.toscrape.com", config=config)
    print(f"\nCrawled {len(results)} pages")
    # Export the last captured state after the crawl completes
    state = strategy.export_state()
    if state:
        print("Exported state:")
        print(f"  - Pages crawled: {state['pages_crawled']}")
        print(f"  - URLs visited: {len(state['visited'])}")
        print(f"  - URLs pending: {len(state['pending'])}")
    else:
        print("(No state was captured - export_state() returned nothing)")
 async def example_state_structure():
    """
    Example 4: Understanding the state structure.
    Keeps the most recent state passed to on_state_change during a tiny crawl
    and prints it, followed by a summary of its key fields.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 4: State Structure")
    print(rule)
    # One-slot holder: each callback invocation overwrites the previous state
    latest: Dict[str, Any] = {}
    async def capture_state(state: Dict[str, Any]) -> None:
        latest["state"] = state
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=1,
            max_pages=2,
            on_state_change=capture_state,
        ),
        verbose=False,
    )
    async with AsyncWebCrawler(verbose=False) as crawler:
        await crawler.arun("https://books.toscrape.com", config=run_config)
    captured_state = latest.get("state")
    if not captured_state:
        return
    print("\nState structure:")
    # Only the first 1000 characters of the dump are shown
    dumped = json.dumps(captured_state, indent=2, default=str)
    print(dumped[:1000] + "...")
    print("\n\nKey fields:")
    print(f"  strategy_type: '{captured_state['strategy_type']}'")
    print(f"  visited: List of {len(captured_state['visited'])} URLs")
    print(f"  pending: List of {len(captured_state['pending'])} queued items")
    print("  depths: Dict mapping URL -> depth level")
    print(f"  pages_crawled: {captured_state['pages_crawled']}")
 async def main():
    """
    Run all examples, in numbered order.
    Every example performs live crawls of https://books.toscrape.com, so
    network access is required.
    """
    print("=" * 60)
    print("Deep Crawl Crash Recovery Examples")
    print("=" * 60)
    await example_basic_state_persistence()
    await example_crash_and_resume()
    # Example 3 was previously skipped even though it is defined in this file
    await example_export_state()
    await example_state_structure()
    # # Cleanup
    # if STATE_FILE.exists():
    #     STATE_FILE.unlink()
    #     print(f"\n[Cleaned up {STATE_FILE}]")
 # Script entry point: run the examples only when executed directly, not on import.
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/examples/prefetch_two_phase_crawl.py
+++ b/docs/examples/prefetch_two_phase_crawl.py
@@ -0,0 +1,279 @@
 #!/usr/bin/env python3
 """
 Prefetch Mode and Two-Phase Crawling Example
 Prefetch mode is a fast path that skips heavy processing and returns
 only HTML + links. This is ideal for:
 - Site mapping: Quickly discover all URLs
 - Selective crawling: Find URLs first, then process only what you need
 - Link validation: Check which pages exist without full processing
 - Crawl planning: Estimate size before committing resources
 Key concept:
 - `prefetch=True` in CrawlerRunConfig enables fast link-only extraction
 - Skips: markdown generation, content scraping, media extraction, LLM extraction
 - Returns: HTML and links dictionary
 Performance benefit: ~5-10x faster than full processing
 """
 import asyncio
 import time
 from typing import List, Dict
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 async def example_basic_prefetch():
    """
    Example 1: Basic prefetch mode.
    Fetches one page with prefetch=True and prints what the result contains:
    HTML size, link counts, the fields expected to be empty, and a few links.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 1: Basic Prefetch Mode")
    print(rule)
    async with AsyncWebCrawler(verbose=False) as crawler:
        print("\nFetching with prefetch=True...")
        result = await crawler.arun(
            "https://books.toscrape.com",
            config=CrawlerRunConfig(prefetch=True),  # Enable prefetch mode
        )
        internal_links = result.links.get("internal", [])
        external_links = result.links.get("external", [])
        html_length = len(result.html) if result.html else 0
        print("\nResult summary:")
        print(f"  Success: {result.success}")
        print(f"  HTML length: {html_length} chars")
        print(f"  Internal links: {len(internal_links)}")
        print(f"  External links: {len(external_links)}")
        # These should be None/empty in prefetch mode
        print("\n  Skipped processing:")
        print(f"    Markdown: {result.markdown}")
        print(f"    Cleaned HTML: {result.cleaned_html}")
        print(f"    Extracted content: {result.extracted_content}")
        # Show the first few discovered links, each cut to 60 characters
        if internal_links:
            print("\n  Sample internal links:")
            for entry in internal_links[:5]:
                print(f"    - {entry['href'][:60]}...")
 async def example_performance_comparison():
    """
    Example 2: Compare prefetch vs full processing performance.
    Crawls the same URL once as a warm-up, then once in prefetch mode and once
    with default settings, and prints the elapsed times and link counts.
    Intervals are measured with time.perf_counter(), a monotonic clock meant
    for timing, rather than the adjustable wall clock.
    """
    print("\n" + "=" * 60)
    print("Example 2: Performance Comparison")
    print("=" * 60)
    url = "https://books.toscrape.com"
    async with AsyncWebCrawler(verbose=False) as crawler:
        # Warm up - first request is slower due to browser startup
        await crawler.arun(url, config=CrawlerRunConfig())
        # Prefetch mode timing
        start = time.perf_counter()
        prefetch_result = await crawler.arun(url, config=CrawlerRunConfig(prefetch=True))
        prefetch_time = time.perf_counter() - start
        # Full processing timing
        start = time.perf_counter()
        full_result = await crawler.arun(url, config=CrawlerRunConfig())
        full_time = time.perf_counter() - start
        print(f"\nTiming comparison:")
        print(f"  Prefetch mode: {prefetch_time:.3f}s")
        print(f"  Full processing: {full_time:.3f}s")
        # Guard the ratio: a zero interval would raise ZeroDivisionError
        if prefetch_time > 0:
            print(f"  Speedup: {full_time / prefetch_time:.1f}x faster")
        else:
            print("  Speedup: n/a (prefetch time too small to measure)")
        print(f"\nOutput comparison:")
        print(f"  Prefetch - Links found: {len(prefetch_result.links.get('internal', []))}")
        print(f"  Full - Links found: {len(full_result.links.get('internal', []))}")
        print(f"  Full - Markdown length: {len(full_result.markdown.raw_markdown) if full_result.markdown else 0}")
 async def example_two_phase_crawl():
    """
    Example 3: Two-phase crawling pattern.
    Phase 1 uses prefetch to collect the internal links of the start page;
    phase 2 runs the full pipeline only on the links that pass a URL filter.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 3: Two-Phase Crawling")
    print(rule)
    def is_book_page(candidate: str) -> bool:
        # On books.toscrape.com, book pages contain "catalogue/" but not "category/"
        return "catalogue/" in candidate and "category/" not in candidate
    async with AsyncWebCrawler(verbose=False) as crawler:
        # ═══════════════════════════════════════════════════════════
        # Phase 1: Fast URL discovery
        # ═══════════════════════════════════════════════════════════
        print("\n--- Phase 1: Fast Discovery ---")
        prefetch_config = CrawlerRunConfig(prefetch=True)
        phase_started = time.time()
        discovery = await crawler.arun("https://books.toscrape.com", config=prefetch_config)
        discovery_time = time.time() - phase_started
        all_urls = [entry["href"] for entry in discovery.links.get("internal", [])]
        print(f"  Discovered {len(all_urls)} URLs in {discovery_time:.2f}s")
        # Keep only the URLs we care about, limited to 5 for the demo
        book_urls = list(filter(is_book_page, all_urls))[:5]
        print(f"  Filtered to {len(book_urls)} book pages")
        # ═══════════════════════════════════════════════════════════
        # Phase 2: Full processing on selected URLs
        # ═══════════════════════════════════════════════════════════
        print("\n--- Phase 2: Full Processing ---")
        full_config = CrawlerRunConfig(
            word_count_threshold=10,
            remove_overlay_elements=True,
        )
        processed = []
        phase_started = time.time()
        for url in book_urls:
            page = await crawler.arun(url, config=full_config)
            if not page.success:
                continue
            processed.append(page)
            # Derive a readable title from the second-to-last URL path segment
            slug = page.url.split("/")[-2]
            title = slug.replace("-", " ").title()[:40]
            md_len = len(page.markdown.raw_markdown) if page.markdown else 0
            print(f"    Processed: {title}... ({md_len} chars)")
        processing_time = time.time() - phase_started
        print(f"\n  Processed {len(processed)} pages in {processing_time:.2f}s")
        # ═══════════════════════════════════════════════════════════
        # Summary
        # ═══════════════════════════════════════════════════════════
        skipped = len(all_urls) - len(book_urls)
        print("\n--- Summary ---")
        print(f"  Discovery phase: {discovery_time:.2f}s ({len(all_urls)} URLs)")
        print(f"  Processing phase: {processing_time:.2f}s ({len(processed)} pages)")
        print(f"  Total time: {discovery_time + processing_time:.2f}s")
        print(f"  URLs skipped: {skipped} (not matching filter)")
 async def example_prefetch_with_deep_crawl():
    """
    Example 4: Combine prefetch with deep crawl strategy.
    Runs a shallow BFS deep crawl with prefetch=True and reports how many
    pages were crawled and how many distinct links they exposed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 4: Prefetch with Deep Crawl")
    print(rule)
    from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
    async with AsyncWebCrawler(verbose=False) as crawler:
        # Deep crawl with prefetch - maximum discovery speed
        bfs = BFSDeepCrawlStrategy(
            max_depth=1,
            max_pages=10,
        )
        config = CrawlerRunConfig(
            prefetch=True,  # Fast mode
            deep_crawl_strategy=bfs,
        )
        print("\nDeep crawling with prefetch mode...")
        started = time.time()
        result_container = await crawler.arun("https://books.toscrape.com", config=config)
        # A deep crawl may hand back an iterable of results; normalize to a list
        results = (
            list(result_container)
            if hasattr(result_container, '__iter__')
            else [result_container]
        )
        elapsed = time.time() - started
        # Gather the distinct hrefs seen across every crawled page
        all_internal_links = set()
        all_external_links = set()
        for page in results:
            all_internal_links.update(entry["href"] for entry in page.links.get("internal", []))
            all_external_links.update(entry["href"] for entry in page.links.get("external", []))
        print("\nResults:")
        print(f"  Pages crawled: {len(results)}")
        print(f"  Total internal links discovered: {len(all_internal_links)}")
        print(f"  Total external links discovered: {len(all_external_links)}")
        print(f"  Time: {elapsed:.2f}s")
 async def example_prefetch_with_raw_html():
    """
    Example 5: Prefetch with raw HTML input.
    Passes an in-memory HTML document through the crawler using a raw: URL
    with prefetch=True, then lists the internal and external links found.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 5: Prefetch with Raw HTML")
    print(rule)
    sample_html = """
    <html>
        <head><title>Sample Page</title></head>
        <body>
            <h1>Hello World</h1>
            <nav>
                <a href="/page1">Internal Page 1</a>
                <a href="/page2">Internal Page 2</a>
                <a href="https://example.com/external">External Link</a>
            </nav>
            <main>
                <p>This is the main content with <a href="/page3">another link</a>.</p>
            </main>
        </body>
    </html>
    """
    async with AsyncWebCrawler(verbose=False) as crawler:
        raw_config = CrawlerRunConfig(
            prefetch=True,
            base_url="https://mysite.com",  # For resolving relative links
        )
        result = await crawler.arun("raw:" + sample_html, config=raw_config)
        internal = result.links.get("internal", [])
        external = result.links.get("external", [])
        print("\nExtracted from raw HTML:")
        print(f"  Internal links: {len(internal)}")
        for entry in internal:
            print(f"    - {entry['href']} ({entry['text']})")
        print(f"\n  External links: {len(external)}")
        for entry in external:
            print(f"    - {entry['href']} ({entry['text']})")
 async def main():
    """Run every prefetch example, one after another, in numbered order."""
    rule = "=" * 60
    print(rule)
    print("Prefetch Mode and Two-Phase Crawling Examples")
    print(rule)
    examples = (
        example_basic_prefetch,
        example_performance_comparison,
        example_two_phase_crawl,
        example_prefetch_with_deep_crawl,
        example_prefetch_with_raw_html,
    )
    # Sequential on purpose: each example prints its own section of output
    for run_example in examples:
        await run_example()
 # Script entry point: run the examples only when executed directly, not on import.
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/md_v2/core/deep-crawling.md
+++ b/docs/md_v2/core/deep-crawling.md
@@ -4,11 +4,13 @@ One of Crawl4AI's most powerful features is its ability to perform **configurabl
 In this tutorial, you'll learn:
-1. How to set up a **Basic Deep Crawler** with BFS strategy  
+1. How to set up a **Basic Deep Crawler** with BFS strategy
-2. Understanding the difference between **streamed and non-streamed** output  
+2. Understanding the difference between **streamed and non-streamed** output
-3. Implementing **filters and scorers** to target specific content  
+3. Implementing **filters and scorers** to target specific content
-4. Creating **advanced filtering chains** for sophisticated crawls  
+4. Creating **advanced filtering chains** for sophisticated crawls
-5. Using **BestFirstCrawling** for intelligent exploration prioritization  
+5. Using **BestFirstCrawling** for intelligent exploration prioritization
 6. **Crash recovery** for long-running production crawls
 7. **Prefetch mode** for fast URL discovery  
 > **Prerequisites**  
 > - You’ve completed or read [AsyncWebCrawler Basics](../core/simple-crawling.md) to understand how to run a simple crawl.  
@@ -485,7 +487,249 @@ This is especially useful for security-conscious crawling or when dealing with s
 ---
-## 10. Summary & Next Steps
+## 10. Crash Recovery for Long-Running Crawls
 For production deployments, especially in cloud environments where instances can be terminated unexpectedly, Crawl4AI provides built-in crash recovery support for all deep crawl strategies.
 ### 10.1 Enabling State Persistence
 All deep crawl strategies (BFS, DFS, Best-First) support two optional parameters:
 - **`resume_state`**: Pass a previously saved state to resume from a checkpoint
 - **`on_state_change`**: Async callback fired after each URL is processed
 ```python
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 import json
 # Callback to save state after each URL
 async def save_state_to_redis(state: dict):
    await redis.set("crawl_state", json.dumps(state))
 strategy = BFSDeepCrawlStrategy(
    max_depth=3,
    on_state_change=save_state_to_redis,  # Called after each URL
 )
 ```
 ### 10.2 State Structure
 The state dictionary is JSON-serializable and contains:
 ```python
 {
    "strategy_type": "bfs",  # or "dfs", "best_first"
    "visited": ["url1", "url2", ...],  # Already crawled URLs
    "pending": [{"url": "...", "parent_url": "..."}],  # Queue/stack
    "depths": {"url1": 0, "url2": 1},  # Depth tracking
    "pages_crawled": 42  # Counter
 }
 ```
 ### 10.3 Resuming from a Checkpoint
 ```python
 import json
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 # Load saved state (e.g., from Redis, database, or file)
 saved_state = json.loads(await redis.get("crawl_state"))
 # Resume crawling from where we left off
 strategy = BFSDeepCrawlStrategy(
    max_depth=3,
    resume_state=saved_state,  # Continue from checkpoint
    on_state_change=save_state_to_redis,  # Keep saving progress
 )
 config = CrawlerRunConfig(deep_crawl_strategy=strategy)
 async with AsyncWebCrawler() as crawler:
    # Will skip already-visited URLs and continue from pending queue
    results = await crawler.arun(start_url, config=config)
 ```
 ### 10.4 Manual State Export
 You can retrieve the last captured state with `export_state()`. State is only captured while an `on_state_change` callback is set, so the strategy must be created with one for `export_state()` to return a state:
 ```python
 import json
 captured_state = None
 async def capture_state(state: dict):
    global captured_state
    captured_state = state
 strategy = BFSDeepCrawlStrategy(
    max_depth=2,
    on_state_change=capture_state,  # Required for state capture
 )
 config = CrawlerRunConfig(deep_crawl_strategy=strategy)
 async with AsyncWebCrawler() as crawler:
    results = await crawler.arun(start_url, config=config)
 # Get the last captured state
 state = strategy.export_state()
 if state:
    # Save to your preferred storage
    with open("crawl_checkpoint.json", "w") as f:
        json.dump(state, f)
 ```
 ### 10.5 Complete Example: Redis-Based Recovery
 ```python
 import asyncio
 import json
 import redis.asyncio as redis
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 REDIS_KEY = "crawl4ai:crawl_state"
 async def main():
    redis_client = redis.Redis(host='localhost', port=6379, db=0)
    # Check for existing state
    saved_state = None
    existing = await redis_client.get(REDIS_KEY)
    if existing:
        saved_state = json.loads(existing)
        print(f"Resuming from checkpoint: {saved_state['pages_crawled']} pages already crawled")
    # State persistence callback
    async def persist_state(state: dict):
        await redis_client.set(REDIS_KEY, json.dumps(state))
    # Create strategy with recovery support
    strategy = BFSDeepCrawlStrategy(
        max_depth=3,
        max_pages=100,
        resume_state=saved_state,
        on_state_change=persist_state,
    )
    config = CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True)
    try:
        async with AsyncWebCrawler() as crawler:
            async for result in await crawler.arun("https://example.com", config=config):
                print(f"Crawled: {result.url}")
    except Exception as e:
        print(f"Crawl interrupted: {e}")
        print("State saved - restart to resume")
    finally:
        await redis_client.close()
 if __name__ == "__main__":
    asyncio.run(main())
 ```
 ### 10.6 Zero Overhead
 When `resume_state=None` and `on_state_change=None` (the defaults), there is no performance impact. State tracking only activates when you enable these features.
 ---
 ## 11. Prefetch Mode for Fast URL Discovery
 When you need to quickly discover URLs without full page processing, use **prefetch mode**. This is ideal for two-phase crawling where you first map the site, then selectively process specific pages.
 ### 11.1 Enabling Prefetch Mode
 ```python
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 config = CrawlerRunConfig(prefetch=True)
 async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://example.com", config=config)
    # Result contains only HTML and links - no markdown, no extraction
    print(f"Found {len(result.links['internal'])} internal links")
    print(f"Found {len(result.links['external'])} external links")
 ```
 ### 11.2 What Gets Skipped
 Prefetch mode uses a fast path that bypasses heavy processing:
 | Processing Step | Normal Mode | Prefetch Mode |
 |----------------|-------------|---------------|
 | Fetch HTML | ✅ | ✅ |
 | Extract links | ✅ | ✅ (fast `quick_extract_links()`) |
 | Generate markdown | ✅ | ❌ Skipped |
 | Content scraping | ✅ | ❌ Skipped |
 | Media extraction | ✅ | ❌ Skipped |
 | LLM extraction | ✅ | ❌ Skipped |
 ### 11.3 Performance Benefit
 - **Normal mode**: Full pipeline (~2-5 seconds per page)
 - **Prefetch mode**: HTML + links only (~200-500ms per page)
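 This makes prefetch mode roughly **5-10x faster** for URL discovery. The timings above are indicative; actual numbers vary with the target site and network conditions.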
 ### 11.4 Two-Phase Crawling Pattern
 The most common use case is two-phase crawling:
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 async def two_phase_crawl(start_url: str):
    async with AsyncWebCrawler() as crawler:
        # ═══════════════════════════════════════════════
        # Phase 1: Fast discovery (prefetch mode)
        # ═══════════════════════════════════════════════
        prefetch_config = CrawlerRunConfig(prefetch=True)
        discovery = await crawler.arun(start_url, config=prefetch_config)
        all_urls = [link["href"] for link in discovery.links.get("internal", [])]
        print(f"Discovered {len(all_urls)} URLs")
        # Filter to URLs you care about
        blog_urls = [url for url in all_urls if "/blog/" in url]
        print(f"Found {len(blog_urls)} blog posts to process")
        # ═══════════════════════════════════════════════
        # Phase 2: Full processing on selected URLs only
        # ═══════════════════════════════════════════════
        full_config = CrawlerRunConfig(
            # Your normal extraction settings
            word_count_threshold=100,
            remove_overlay_elements=True,
        )
        results = []
        for url in blog_urls:
            result = await crawler.arun(url, config=full_config)
            if result.success:
                results.append(result)
                print(f"Processed: {url}")
        return results
 if __name__ == "__main__":
    results = asyncio.run(two_phase_crawl("https://example.com"))
    print(f"Fully processed {len(results)} pages")
 ```
 ### 11.5 Use Cases
 - **Site mapping**: Quickly discover all URLs before deciding what to process
 - **Link validation**: Check which pages exist without heavy processing
 - **Selective deep crawl**: Prefetch to find URLs, filter by pattern, then full crawl
 - **Crawl planning**: Estimate crawl size before committing resources
 ---
 ## 12. Summary & Next Steps
 In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
@@ -495,5 +739,7 @@ In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
 - Use scorers to prioritize the most relevant pages
 - Limit crawls with `max_pages` and `score_threshold` parameters
 - Build a complete advanced crawler with combined techniques
 - **Implement crash recovery** with `resume_state` and `on_state_change` for production deployments
 - **Use prefetch mode** for fast URL discovery and two-phase crawling
 With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.