""" SMART Cache Mode Example for Crawl4AI This example demonstrates how to use the SMART cache mode to intelligently validate cached content before using it. SMART mode can save 70-95% bandwidth on unchanged content while ensuring you always get fresh data when it changes. SMART Cache Mode: Only Crawl When Changes """ import sys import os sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) import asyncio import time from crawl4ai import AsyncWebCrawler from crawl4ai.cache_context import CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def basic_smart_cache_example(): """Basic example showing SMART cache mode in action""" print("=== Basic SMART Cache Example ===\n") async with AsyncWebCrawler(verbose=True) as crawler: url = "https://example.com" # First crawl: Cache the content print("1. Initial crawl to cache the content:") config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) result1 = await crawler.arun(url=url, config=config) print(f" Initial crawl: {len(result1.html)} bytes\n") # Second crawl: Use SMART mode print("2. SMART mode crawl (should use cache for static content):") smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART) start_time = time.time() result2 = await crawler.arun(url=url, config=smart_config) elapsed = time.time() - start_time print(f" SMART crawl: {len(result2.html)} bytes in {elapsed:.2f}s") print(f" Content identical: {result1.html == result2.html}\n") async def news_site_monitoring(): """Monitor a news site for changes using SMART cache mode""" print("=== News Site Monitoring Example ===\n") async with AsyncWebCrawler(verbose=True) as crawler: config = CrawlerRunConfig(cache_mode=CacheMode.SMART) url = "https://news.ycombinator.com" print("Monitoring Hacker News for changes...\n") previous_length = 0 for i in range(3): result = await crawler.arun(url=url, config=config) current_length = len(result.html) if i == 0: print(f"Check {i+1}: Initial crawl - {current_length} bytes") else: if current_length != previous_length: print(f"Check {i+1}: Content changed! 

async def compare_cache_modes():
    """Compare different cache modes to understand SMART mode benefits"""
    print("=== Cache Mode Comparison ===\n")

    async with AsyncWebCrawler(verbose=False) as crawler:
        url = "https://www.wikipedia.org"

        # First, populate the cache
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        await crawler.arun(url=url, config=config)
        print("Cache populated.\n")

        # Test different cache modes
        modes = [
            (CacheMode.ENABLED, "ENABLED (always uses cache if available)"),
            (CacheMode.BYPASS, "BYPASS (never uses cache)"),
            (CacheMode.SMART, "SMART (validates cache before using)"),
        ]

        for mode, description in modes:
            config = CrawlerRunConfig(cache_mode=mode)
            start_time = time.time()
            result = await crawler.arun(url=url, config=config)
            elapsed = time.time() - start_time

            print(f"{description}:")
            print(f"   Time: {elapsed:.2f}s")
            print(f"   Size: {len(result.html)} bytes\n")


async def dynamic_content_example():
    """Show how SMART mode handles dynamic content"""
    print("=== Dynamic Content Example ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        # URL that returns different content each time
        dynamic_url = "https://httpbin.org/uuid"

        print("Testing with dynamic content (changes every request):\n")

        # First crawl
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result1 = await crawler.arun(url=dynamic_url, config=config)

        # Extract UUID from the response
        import re
        uuid1 = re.search(r'"uuid":\s*"([^"]+)"', result1.html)
        if uuid1:
            print(f"1. First crawl UUID: {uuid1.group(1)}")

        # SMART mode crawl - should detect change and re-crawl
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        result2 = await crawler.arun(url=dynamic_url, config=smart_config)

        uuid2 = re.search(r'"uuid":\s*"([^"]+)"', result2.html)
        if uuid1 and uuid2:
            print(f"2. SMART crawl UUID: {uuid2.group(1)}")
            print(f"   Different UUIDs: {uuid1.group(1) != uuid2.group(1)} (should be True)")
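
# --- Hedged sketch (not Crawl4AI's internal implementation) ------------------
# SMART mode's bandwidth savings come from validating the cache with cheap HEAD
# requests instead of re-downloading pages. The standard-library helpers below
# show the general idea: compare the ETag / Last-Modified validators returned by
# a HEAD request against values remembered from an earlier crawl. The function
# names, timeout, and fallback behaviour are assumptions for illustration only.
import urllib.request


def fetch_validators(url):
    """Issue an HTTP HEAD request and return the page's cache validators."""
    request = urllib.request.Request(url, method="HEAD")
    with urllib.request.urlopen(request, timeout=10) as response:
        return response.headers.get("ETag"), response.headers.get("Last-Modified")


def probably_unchanged(url, cached_etag=None, cached_last_modified=None):
    """Heuristic check: does the live page still match our cached validators?"""
    etag, last_modified = fetch_validators(url)
    if cached_etag and etag:
        return etag == cached_etag
    if cached_last_modified and last_modified:
        return last_modified == cached_last_modified
    # No validators available, so assume the content may have changed.
    return False

# Usage idea (hypothetical cached values): only re-crawl a URL when
# probably_unchanged(url, cached_etag, cached_last_modified) returns False.
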

async def bandwidth_savings_demo():
    """Demonstrate bandwidth savings with SMART mode"""
    print("=== Bandwidth Savings Demo ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://www.python.org",
            "https://docs.python.org/3/",
        ]

        print("Crawling multiple URLs twice to show bandwidth savings:\n")

        # First pass: Cache all URLs
        print("First pass - Caching all URLs:")
        total_bytes_pass1 = 0
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

        for url in urls:
            result = await crawler.arun(url=url, config=config)
            total_bytes_pass1 += len(result.html)
            print(f"   {url}: {len(result.html)} bytes")

        print(f"\nTotal downloaded in first pass: {total_bytes_pass1} bytes")

        # Second pass: Use SMART mode
        print("\nSecond pass - Using SMART mode:")
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

        for url in urls:
            result = await crawler.arun(url=url, config=smart_config)
            # In SMART mode, unchanged content is served from cache after a
            # cheap validation check, so little or no page data is re-downloaded.
            print(f"   {url}: {len(result.html)} bytes returned (from cache if unchanged)")

        print(f"\nBandwidth saved: ~{total_bytes_pass1} bytes (only HEAD requests sent for unchanged pages)")


async def main():
    """Run all examples"""
    examples = [
        basic_smart_cache_example,
        news_site_monitoring,
        compare_cache_modes,
        dynamic_content_example,
        bandwidth_savings_demo,
    ]

    for example in examples:
        await example()
        print("\n" + "=" * 50 + "\n")
        await asyncio.sleep(2)  # Brief pause between examples


if __name__ == "__main__":
    print("""
Crawl4AI SMART Cache Mode Examples
==================================

These examples demonstrate the SMART cache mode that intelligently validates
cached content using HEAD requests before deciding whether to use cache or
perform a fresh crawl.
""")

    asyncio.run(main())