feat(crawl4ai): Implement SMART cache mode

This commit introduces a new cache mode, SMART, to the crawl4ai library. The SMART mode intelligently validates cached content using HEAD requests before using it, saving significant bandwidth while ensuring fresh content. The changes include modifications to the async_webcrawler.py, cache_context.py, and utils.py files in the crawl4ai directory. The async_webcrawler.py file now includes a check for the SMART cache mode and performs a HEAD check to see if the content has changed. If the content has changed, the URL is re-crawled; otherwise, the cached result is used. The cache_context.py and utils.py files have been updated to support these changes.

The documentation has also been updated to reflect these changes. The cache-modes.md file now includes a detailed explanation of the SMART mode, its logs, limitations, and an advanced example. The examples.md file now includes a link to the SMART Cache Mode example. The quickstart.md file now mentions the SMART mode in the note about cache modes.

These changes improve the efficiency of the crawl4ai library by reducing unnecessary re-crawling and bandwidth usage.

BREAKING CHANGE: The introduction of the SMART cache mode may affect existing code that uses the crawl4ai library and does not expect this new mode. Users should review the updated documentation to understand how to use this new mode.
This commit is contained in:
UncleCode
2025-07-21 21:19:37 +08:00
parent 8a04351406
commit d1de82a332
11 changed files with 1139 additions and 29 deletions

View File

@@ -0,0 +1,211 @@
import asyncio
import httpx
import email.utils
from datetime import datetime
import json
from typing import Dict, Optional
import time
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    Sends a conditional HEAD request and evaluates validator signals in
    decreasing order of reliability: 304 status, Content-Digest, strong
    ETag, Last-Modified, and finally Content-Length (a hint only).

    Args:
        url: The URL to check
        cache: Previous cache data containing etag, last_modified, digest, content_length

    Returns:
        True if the page has changed and should be crawled, False otherwise
    """
    if cache is None:
        cache = {}
    headers = {
        # Disable compression so Content-Length reflects the raw body size.
        "Accept-Encoding": "identity",
        # Ask the server for a body digest header (RFC 9530) if supported.
        "Want-Content-Digest": "sha-256",
    }
    # Conditional-request headers let a compliant server answer 304.
    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)
            # 304 Not Modified - content hasn't changed
            if response.status_code == 304:
                print(f"✓ 304 Not Modified - No need to crawl {url}")
                return False
            h = response.headers
            # Check Content-Digest (most reliable)
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                print(f"✓ Content-Digest matches - No need to crawl {url}")
                return False
            # Check strong ETag (weak ETags start with W/ and are skipped).
            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                print(f"✓ Strong ETag matches - No need to crawl {url}")
                return False
            # Check Last-Modified
            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    if lm_new <= lm_old:
                        print(f"✓ Last-Modified not newer - No need to crawl {url}")
                        return False
                except (TypeError, ValueError):
                    # Unparseable date header - fall through to weaker checks.
                    pass
            # Check Content-Length (weakest signal - only as a hint, not definitive)
            # Note: Same content length doesn't mean same content!
            # Either way we crawl; the branches only affect logging.
            if h.get("content-length") and cache.get("content_length"):
                try:
                    if int(h["content-length"]) != cache.get("content_length"):
                        print(f"✗ Content-Length changed - Should crawl {url}")
                    else:
                        print(f"⚠️ Content-Length unchanged but content might have changed - Should crawl {url}")
                    return True  # When in doubt, crawl!
                except ValueError:
                    # Malformed Content-Length header - ignore the hint.
                    pass
            print(f"✗ Content has changed - Should crawl {url}")
            return True
    except Exception as e:
        # Network failure, DNS error, timeout, or HEAD unsupported.
        print(f"✗ Error checking {url}: {e}")
        return True  # On error, assume we should crawl
async def crawl_page(url: str) -> Dict[str, str]:
    """
    Simulate crawling a page and extracting cache headers.
    """
    print(f"\n🕷️ Crawling {url}...")
    async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
        response = await client.get(url)
        hdrs = response.headers
        cache_data: Dict[str, str] = {}
        # Remember every validator header the server exposed so a later
        # HEAD request can decide whether a re-crawl is needed.
        etag = hdrs.get("etag")
        if etag:
            cache_data["etag"] = etag
            print(f" Stored ETag: {etag}")
        last_mod = hdrs.get("last-modified")
        if last_mod:
            cache_data["last_modified"] = last_mod
            print(f" Stored Last-Modified: {last_mod}")
        digest = hdrs.get("content-digest")
        if digest:
            cache_data["digest"] = digest
            print(f" Stored Content-Digest: {digest}")
        length = hdrs.get("content-length")
        if length:
            cache_data["content_length"] = int(length)
            print(f" Stored Content-Length: {length}")
        print(f" Response size: {len(response.content)} bytes")
        return cache_data
async def test_static_site():
    """Test with a static website (example.com)"""
    bar = "=" * 60
    print(bar)
    print("Testing with static site: example.com")
    print(bar)
    url = "https://example.com"
    # First crawl - always happens
    cache = await crawl_page(url)
    # Wait a bit
    await asyncio.sleep(2)
    # Second check - should not need to crawl
    print(f"\n📊 Checking if we need to re-crawl...")
    if await should_crawl(url, cache):
        print("❌ Unexpected: Static content flagged as changed")
    else:
        print("✅ Correctly identified: No need to re-crawl static content")
async def test_dynamic_site():
    """Test with dynamic websites that change frequently"""
    bar = "=" * 60
    print("\n" + bar)
    print("Testing with dynamic sites")
    print(bar)
    # Endpoints whose responses vary between requests.
    dynamic_sites = [
        "https://api.github.com/",  # GitHub API root (changes with rate limit info)
        "https://worldtimeapi.org/api/timezone/UTC",  # Current time API
        "https://httpbin.org/uuid",  # Generates new UUID each request
    ]
    for site in dynamic_sites:
        print(f"\n🔄 Testing {site}")
        try:
            # First crawl stores the validator headers.
            snapshot = await crawl_page(site)
            # Give the endpoint a moment before re-checking.
            await asyncio.sleep(2)
            # Ask whether the content changed since the snapshot.
            print(f"\n📊 Checking if we need to re-crawl...")
            if await should_crawl(site, snapshot):
                print("✅ Correctly identified: Dynamic content has changed")
            else:
                print("⚠️ Note: Dynamic content appears unchanged (might have caching)")
        except Exception as e:
            print(f"❌ Error testing {site}: {e}")
async def test_conditional_get():
    """Test conditional GET fallback when HEAD doesn't provide enough info"""
    bar = "=" * 60
    print("\n" + bar)
    print("Testing conditional GET scenario")
    print(bar)
    target = "https://httpbin.org/etag/test-etag-123"
    # Simulate a scenario where we already hold a (strong) ETag.
    cached = {"etag": '"test-etag-123"'}
    print(f"Testing with cached ETag: {cached['etag']}")
    if await should_crawl(target, cached):
        print("✅ ETag didn't match - crawl needed")
    else:
        print("✅ ETag matched - no crawl needed")
async def main():
    """Run all tests"""
    print("🚀 Starting HEAD request change detection tests\n")
    # Execute each scenario in sequence.
    for scenario in (test_static_site, test_dynamic_site, test_conditional_get):
        await scenario()
    print("\n✨ All tests completed!")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,186 @@
import asyncio
import httpx
import email.utils
from datetime import datetime
import json
from typing import Dict, Optional
import time
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    Issues a conditional HEAD request and inspects the response status and
    validator headers (Content-Digest, ETag, Last-Modified, Content-Length)
    in decreasing order of reliability.

    Args:
        url: The URL to check.
        cache: Previously stored validators (etag, last_modified, digest,
            content_length). None is treated as an empty cache.

    Returns:
        True if the page should be (re-)crawled, False if the cached copy
        is still considered fresh.
    """
    if cache is None:
        cache = {}
    headers = {
        # Disable compression so Content-Length matches the raw body size.
        "Accept-Encoding": "identity",
        # Request a body digest header (RFC 9530) if the server supports it.
        "Want-Content-Digest": "sha-256",
        "User-Agent": "Mozilla/5.0 (compatible; crawl4ai/1.0)"
    }
    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)
            print(f"\nHEAD Response Status: {response.status_code}")
            print(f"Headers received: {dict(response.headers)}")
            # 304 Not Modified
            if response.status_code == 304:
                return False
            h = response.headers
            # Check headers in order of reliability
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                return False
            # Only strong ETags (those starting with '"') are trusted.
            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                return False
            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    if lm_new <= lm_old:
                        return False
                except (TypeError, ValueError):
                    # Unparseable date header - ignore and fall through.
                    pass
            # Check Content-Length (weakest signal - only as a hint, not definitive)
            # Note: Same content length doesn't mean same content!
            if h.get("content-length") and cache.get("content_length"):
                try:
                    if int(h["content-length"]) != cache.get("content_length"):
                        return True  # Length changed, likely content changed
                    # If length is same, we can't be sure - default to crawling
                except ValueError:
                    # Malformed Content-Length header - ignore the hint.
                    pass
            return True
    except Exception as e:
        # Network failure, timeout, or server rejecting HEAD - crawl to be safe.
        print(f"Error during HEAD request: {e}")
        return True
async def test_with_changing_content():
    """Test with a real changing website"""
    bar = "=" * 60
    print(bar)
    print("Testing with real changing content")
    print(bar)
    # Using httpbin's cache endpoint that changes after specified seconds
    url = "https://httpbin.org/cache/1"  # Cache for 1 second
    print(f"\n1⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        first = await client.get(url)
    # Store whichever validator headers the server returned.
    cache = {}
    for header, key in (("etag", "etag"), ("last-modified", "last_modified")):
        value = first.headers.get(header)
        if value:
            cache[key] = value
    print(f"Cached ETag: {cache.get('etag', 'None')}")
    print(f"Cached Last-Modified: {cache.get('last_modified', 'None')}")
    # Check immediately (should not need crawl)
    print(f"\n2⃣ Checking immediately after first request...")
    needs_crawl = await should_crawl(url, cache)
    print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")
    # Wait for cache to expire
    print(f"\n⏳ Waiting 2 seconds for cache to expire...")
    await asyncio.sleep(2)
    # Check again (should need crawl now)
    print(f"\n3⃣ Checking after cache expiry...")
    needs_crawl = await should_crawl(url, cache)
    print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")
async def test_news_website():
    """Test with a news website that updates frequently"""
    bar = "=" * 60
    print("\n" + bar)
    print("Testing with news website (BBC)")
    print(bar)
    site = "https://www.bbc.com"
    print(f"\n1⃣ First crawl of {site}")
    async with httpx.AsyncClient() as client:
        reply = await client.get(site)
    # Snapshot the validator headers from the first fetch.
    cache = {}
    hdrs = reply.headers
    etag = hdrs.get("etag")
    if etag:
        cache["etag"] = etag
        print(f"Stored ETag: {etag[:50]}...")
    last_mod = hdrs.get("last-modified")
    if last_mod:
        cache["last_modified"] = last_mod
        print(f"Stored Last-Modified: {last_mod}")
    length = hdrs.get("content-length")
    if length:
        cache["content_length"] = int(length)
        print(f"Stored Content-Length: {length}")
    # Poll a few times to see whether the page reports changes.
    for attempt in range(3):
        await asyncio.sleep(5)
        print(f"\n📊 Check #{attempt+2} - {datetime.now().strftime('%H:%M:%S')}")
        verdict = await should_crawl(site, cache)
        print(f"Result: {'NEED TO CRAWL ✓' if verdict else 'NO NEED TO CRAWL ✗'}")
async def test_api_endpoint():
    """Test with an API that provides proper caching headers"""
    bar = "=" * 60
    print("\n" + bar)
    print("Testing with GitHub API")
    print(bar)
    # GitHub user API (updates when user data changes)
    api_url = "https://api.github.com/users/github"
    req_headers = {"User-Agent": "crawl4ai-test"}
    print(f"\n1⃣ First request to {api_url}")
    async with httpx.AsyncClient() as client:
        reply = await client.get(api_url, headers=req_headers)
    # Store the validators GitHub returned with the first response.
    cache = {}
    hdrs = reply.headers
    etag = hdrs.get("etag")
    if etag:
        cache["etag"] = etag
        print(f"Stored ETag: {etag}")
    last_mod = hdrs.get("last-modified")
    if last_mod:
        cache["last_modified"] = last_mod
        print(f"Stored Last-Modified: {last_mod}")
    # Print rate limit info
    print(f"Rate Limit Remaining: {hdrs.get('x-ratelimit-remaining', 'N/A')}")
    # Check if content changed
    print(f"\n2⃣ Checking if content changed...")
    verdict = await should_crawl(api_url, cache)
    print(f"Result: {'NEED TO CRAWL' if verdict else 'NO NEED TO CRAWL (content unchanged)'}")
async def main():
    """Run all tests"""
    print("🚀 Testing HEAD request change detection with real websites\n")
    # Run every scenario in order.
    for check in (test_with_changing_content, test_news_website, test_api_endpoint):
        await check()
    print("\n✨ All tests completed!")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,196 @@
"""
Test SMART cache mode functionality in crawl4ai.
This test demonstrates:
1. Initial crawl with caching enabled
2. Re-crawl with SMART mode on static content (should use cache)
3. Re-crawl with SMART mode on dynamic content (should re-crawl)
"""
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode
import time
from datetime import datetime
async def test_smart_cache_mode():
    """Test the SMART cache mode with both static and dynamic URLs"""
    line = "=" * 60
    divider = "-" * 40
    print(line)
    print("Testing SMART Cache Mode")
    print(line)
    # URLs for testing
    static_url = "https://example.com"  # Rarely changes
    dynamic_url = "https://httpbin.org/uuid"  # Changes every request
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Test 1: Initial crawl with caching enabled
        print("\n1⃣ Initial crawl with ENABLED cache mode")
        print(divider)
        # Crawl static URL
        cfg_static = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            verbose=True
        )
        first_static = await crawler.arun(url=static_url, config=cfg_static)
        print(f"✓ Static URL crawled: {len(first_static.html)} bytes")
        print(f" Response headers: {list(first_static.response_headers.keys())[:5]}...")
        # Crawl dynamic URL
        cfg_dynamic = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            verbose=True
        )
        first_dynamic = await crawler.arun(url=dynamic_url, config=cfg_dynamic)
        print(f"✓ Dynamic URL crawled: {len(first_dynamic.html)} bytes")
        baseline_uuid_html = first_dynamic.html
        # Wait a bit
        await asyncio.sleep(2)
        # Test 2: Re-crawl static URL with SMART mode
        print("\n2⃣ Re-crawl static URL with SMART cache mode")
        print(divider)
        cfg_smart = CrawlerRunConfig(
            cache_mode=CacheMode.SMART,  # This will be our new mode
            verbose=True
        )
        started = time.time()
        second_static = await crawler.arun(url=static_url, config=cfg_smart)
        elapsed = time.time() - started
        print(f"✓ Static URL with SMART mode completed in {elapsed:.2f}s")
        print(f" Should use cache (content unchanged)")
        print(f" HTML length: {len(second_static.html)} bytes")
        # Test 3: Re-crawl dynamic URL with SMART mode
        print("\n3⃣ Re-crawl dynamic URL with SMART cache mode")
        print(divider)
        started = time.time()
        second_dynamic = await crawler.arun(url=dynamic_url, config=cfg_smart)
        elapsed = time.time() - started
        refreshed_uuid_html = second_dynamic.html
        print(f"✓ Dynamic URL with SMART mode completed in {elapsed:.2f}s")
        print(f" Should re-crawl (content changes every request)")
        print(f" HTML length: {len(second_dynamic.html)} bytes")
        print(f" Content changed: {baseline_uuid_html != refreshed_uuid_html}")
        # Test 4: Test with a news website (content changes frequently)
        print("\n4⃣ Testing with news website")
        print(divider)
        news_url = "https://news.ycombinator.com"
        # First crawl
        first_news = await crawler.arun(
            url=news_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        )
        print(f"✓ News site initial crawl: {len(first_news.html)} bytes")
        # Wait a bit
        await asyncio.sleep(5)
        # Re-crawl with SMART mode
        started = time.time()
        second_news = await crawler.arun(
            url=news_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
        )
        elapsed = time.time() - started
        print(f"✓ News site SMART mode completed in {elapsed:.2f}s")
        print(f" Content length changed: {len(first_news.html) != len(second_news.html)}")
        # Summary
        print("\n" + line)
        print("Summary")
        print(line)
        print("✅ SMART cache mode should:")
        print(" - Use cache for static content (example.com)")
        print(" - Re-crawl dynamic content (httpbin.org/uuid)")
        print(" - Make intelligent decisions based on HEAD requests")
        print(" - Save bandwidth on unchanged content")
async def test_smart_cache_edge_cases():
    """Test edge cases for SMART cache mode"""
    print("\n" + "=" * 60)
    print("Testing SMART Cache Mode Edge Cases")
    print("=" * 60)
    # (banner, url, success-message template) for each edge case.
    scenarios = [
        (
            "\n🔧 Testing URL with potential HEAD issues",
            # Some servers don't handle HEAD well
            "https://httpbin.org/status/200",
            "✓ Handled potentially problematic URL: {0}",
        ),
        (
            "\n🔧 Testing URL with no cache headers",
            "https://httpbin.org/html",
            "✓ Handled URL with no cache headers: {0}",
        ),
    ]
    async with AsyncWebCrawler(verbose=True) as crawler:
        for banner, target, template in scenarios:
            print(banner)
            print("-" * 40)
            # Initial crawl populates the cache entry for the URL.
            await crawler.arun(
                url=target,
                config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
            )
            # SMART mode should handle the URL gracefully.
            outcome = await crawler.arun(
                url=target,
                config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
            )
            print(template.format(outcome.success))
async def main():
    """Run all tests"""
    try:
        # Run the main scenario, then the edge cases.
        for suite in (test_smart_cache_mode, test_smart_cache_edge_cases):
            await suite()
        print("\n✨ All tests completed!")
    except Exception as e:
        print(f"\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Note: This test will fail until SMART mode is implemented
    print("⚠️ Note: This test expects CacheMode.SMART to be implemented")
    print("⚠️ It will fail with AttributeError until the feature is added\n")
    asyncio.run(main())

View File

@@ -0,0 +1,69 @@
"""
Simple test for SMART cache mode functionality.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode
import time
async def test_smart_cache():
    """Test SMART cache mode with a simple example"""
    print("Testing SMART Cache Mode")
    print("-" * 40)
    # Test URL
    target = "https://example.com"
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl with normal caching populates the cache.
        print("\n1. Initial crawl with ENABLED mode:")
        enabled_cfg = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        first = await crawler.arun(url=target, config=enabled_cfg)
        print(f" Crawled: {len(first.html)} bytes")
        print(f" Headers: {list(first.response_headers.keys())[:3]}...")
        # Wait a moment
        await asyncio.sleep(2)
        # Re-crawl with SMART mode; HEAD validation should hit the cache.
        print("\n2. Re-crawl with SMART mode:")
        smart_cfg = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        started = time.time()
        second = await crawler.arun(url=target, config=smart_cfg)
        elapsed = time.time() - started
        print(f" Time: {elapsed:.2f}s")
        print(f" Result: {len(second.html)} bytes")
        print(f" Should use cache (content unchanged)")
        # Dynamic content should always trigger a re-crawl.
        print("\n3. Testing with dynamic URL:")
        uuid_url = "https://httpbin.org/uuid"
        # First crawl
        baseline = await crawler.arun(
            url=uuid_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        )
        # Re-crawl with SMART
        refreshed = await crawler.arun(
            url=uuid_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
        )
        print(f" Content changed: {baseline.html != refreshed.html}")
        print(f" Should re-crawl (dynamic content)")
if __name__ == "__main__":
    # Show which path was prepended so import problems are easy to debug.
    print(f"Python path: {sys.path[0]}")
    # Raises AttributeError here if CacheMode.SMART is not yet implemented.
    print(f"CacheMode values: {[e.value for e in CacheMode]}")
    print()
    asyncio.run(test_smart_cache())