crawl4ai/docs/examples/smart_cache.py

"""
SMART Cache Mode Example for Crawl4AI

This example demonstrates how to use the SMART cache mode to intelligently
validate cached content before using it. SMART mode can save 70-95% bandwidth
on unchanged content while ensuring you always get fresh data when it changes.

SMART Cache Mode: Only Crawl When Changes
"""

import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

import asyncio
import time
from crawl4ai import AsyncWebCrawler
from crawl4ai.cache_context import CacheMode
from crawl4ai.async_configs import CrawlerRunConfig


async def basic_smart_cache_example():
    """Basic example showing SMART cache mode in action"""
    print("=== Basic SMART Cache Example ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://example.com"

        # First crawl: Cache the content
        print("1. Initial crawl to cache the content:")
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result1 = await crawler.arun(url=url, config=config)
        print(f"   Initial crawl: {len(result1.html)} bytes\n")

        # Second crawl: Use SMART mode
        print("2. SMART mode crawl (should use cache for static content):")
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        start_time = time.time()
        result2 = await crawler.arun(url=url, config=smart_config)
        elapsed = time.time() - start_time
        print(f"   SMART crawl: {len(result2.html)} bytes in {elapsed:.2f}s")
        print(f"   Content identical: {result1.html == result2.html}\n")


async def news_site_monitoring():
    """Monitor a news site for changes using SMART cache mode"""
    print("=== News Site Monitoring Example ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        url = "https://news.ycombinator.com"

        print("Monitoring Hacker News for changes...\n")

        previous_length = 0
        for i in range(3):
            result = await crawler.arun(url=url, config=config)
            current_length = len(result.html)

            if i == 0:
                print(f"Check {i+1}: Initial crawl - {current_length} bytes")
            else:
                if current_length != previous_length:
                    print(f"Check {i+1}: Content changed! {previous_length} -> {current_length} bytes")
                else:
                    print(f"Check {i+1}: Content unchanged - {current_length} bytes")

            previous_length = current_length

            if i < 2:  # Don't wait after last check
                print("   Waiting 10 seconds before next check...")
                await asyncio.sleep(10)

        print()


async def compare_cache_modes():
    """Compare different cache modes to understand SMART mode benefits"""
    print("=== Cache Mode Comparison ===\n")

    async with AsyncWebCrawler(verbose=False) as crawler:
        url = "https://www.wikipedia.org"

        # First, populate the cache
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        await crawler.arun(url=url, config=config)
        print("Cache populated.\n")

        # Test different cache modes
        modes = [
            (CacheMode.ENABLED, "ENABLED (always uses cache if available)"),
            (CacheMode.BYPASS, "BYPASS (never uses cache)"),
            (CacheMode.SMART, "SMART (validates cache before using)")
        ]

        for mode, description in modes:
            config = CrawlerRunConfig(cache_mode=mode)
            start_time = time.time()
            result = await crawler.arun(url=url, config=config)
            elapsed = time.time() - start_time

            print(f"{description}:")
            print(f"  Time: {elapsed:.2f}s")
            print(f"  Size: {len(result.html)} bytes\n")


async def dynamic_content_example():
    """Show how SMART mode handles dynamic content"""
    print("=== Dynamic Content Example ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        # URL that returns different content each time
        dynamic_url = "https://httpbin.org/uuid"

        print("Testing with dynamic content (changes every request):\n")

        # First crawl
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result1 = await crawler.arun(url=dynamic_url, config=config)

        # Extract UUID from the response
        import re
        uuid1 = re.search(r'"uuid":\s*"([^"]+)"', result1.html)
        if uuid1:
            print(f"1. First crawl UUID: {uuid1.group(1)}")

        # SMART mode crawl - should detect change and re-crawl
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        result2 = await crawler.arun(url=dynamic_url, config=smart_config)

        uuid2 = re.search(r'"uuid":\s*"([^"]+)"', result2.html)
        if uuid2:
            print(f"2. SMART crawl UUID: {uuid2.group(1)}")
            print(f"   Different UUIDs: {uuid1.group(1) != uuid2.group(1)} (should be True)")


async def bandwidth_savings_demo():
    """Demonstrate bandwidth savings with SMART mode"""
    print("=== Bandwidth Savings Demo ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://www.python.org",
            "https://docs.python.org/3/",
        ]

        print("Crawling multiple URLs twice to show bandwidth savings:\n")

        # First pass: Cache all URLs
        print("First pass - Caching all URLs:")
        total_bytes_pass1 = 0
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

        for url in urls:
            result = await crawler.arun(url=url, config=config)
            total_bytes_pass1 += len(result.html)
            print(f"  {url}: {len(result.html)} bytes")

        print(f"\nTotal downloaded in first pass: {total_bytes_pass1} bytes")

        # Second pass: Use SMART mode
        print("\nSecond pass - Using SMART mode:")
        total_bytes_pass2 = 0
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

        for url in urls:
            result = await crawler.arun(url=url, config=smart_config)
            # In SMART mode, unchanged content uses cache (minimal bandwidth)
            print(f"  {url}: Using {'cache' if result else 'fresh crawl'}")

        print(f"\nBandwidth saved: ~{total_bytes_pass1} bytes (only HEAD requests sent)")


async def main():
    """Run all examples"""
    examples = [
        basic_smart_cache_example,
        news_site_monitoring,
        compare_cache_modes,
        dynamic_content_example,
        bandwidth_savings_demo
    ]

    for example in examples:
        await example()
        print("\n" + "="*50 + "\n")
        await asyncio.sleep(2)  # Brief pause between examples


if __name__ == "__main__":
    print("""
Crawl4AI SMART Cache Mode Examples
==================================

These examples demonstrate the SMART cache mode that intelligently
validates cached content using HEAD requests before deciding whether
to use cache or perform a fresh crawl.

""")
    asyncio.run(main())