This commit introduces a new cache mode, SMART, to the crawl4ai library. SMART mode validates cached content with a HEAD request before reusing it, saving significant bandwidth while still ensuring fresh content. The change touches async_webcrawler.py, cache_context.py, and utils.py in the crawl4ai directory: async_webcrawler.py now recognizes the SMART cache mode and performs a HEAD check to see whether the content has changed; if it has, the URL is re-crawled, otherwise the cached result is used. cache_context.py and utils.py carry the supporting changes. The documentation is updated accordingly: cache-modes.md gains a detailed explanation of SMART mode, its logs, limitations, and an advanced example; examples.md links to the SMART Cache Mode example; and quickstart.md mentions SMART in its note about cache modes. Together these changes reduce unnecessary re-crawling and bandwidth usage. BREAKING CHANGE: existing code that uses the crawl4ai library and does not expect this new mode may be affected; users should review the updated documentation to understand how to use it.
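For orientation, here is a minimal sketch of how the new mode would be selected from user code. It assumes the mode is exposed as CacheMode.SMART on the existing CacheMode enum and passed through CrawlerRunConfig like the other cache modes; see the updated cache-modes.md for the exact usage.

import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig


async def demo():
    # Assumed usage: CacheMode.SMART is the mode introduced by this commit.
    config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

    async with AsyncWebCrawler() as crawler:
        # First run crawls and caches the page as usual.
        first = await crawler.arun(url="https://example.com", config=config)

        # A later run sends a HEAD request; if the validators (ETag,
        # Last-Modified, Content-Digest) indicate the page is unchanged,
        # the cached result is returned instead of re-crawling.
        second = await crawler.arun(url="https://example.com", config=config)
        print(first.success, second.success)


if __name__ == "__main__":
    asyncio.run(demo())

The standalone script below exercises the same HEAD-based change detection directly with httpx, outside the crawler, against static and dynamic test sites.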
import asyncio
import email.utils
import json
import time
from datetime import datetime
from typing import Dict, Optional

import httpx


async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    Args:
        url: The URL to check
        cache: Previous cache data containing etag, last_modified, digest, content_length

    Returns:
        True if the page has changed and should be crawled, False otherwise
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
    }

    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

            # 304 Not Modified - content hasn't changed
            if response.status_code == 304:
                print(f"✓ 304 Not Modified - No need to crawl {url}")
                return False

            h = response.headers

            # Check Content-Digest (most reliable)
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                print(f"✓ Content-Digest matches - No need to crawl {url}")
                return False

            # Check strong ETag
            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                print(f"✓ Strong ETag matches - No need to crawl {url}")
                return False

            # Check Last-Modified
            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    if lm_new <= lm_old:
                        print(f"✓ Last-Modified not newer - No need to crawl {url}")
                        return False
                except (TypeError, ValueError):
                    pass

            # Check Content-Length (weakest signal - only as a hint, not definitive)
            # Note: Same content length doesn't mean same content!
            # This should only be used when no other signals are available
            if h.get("content-length") and cache.get("content_length"):
                try:
                    if int(h["content-length"]) != cache.get("content_length"):
                        print(f"✗ Content-Length changed - Should crawl {url}")
                        return True
                    else:
                        print(f"⚠️ Content-Length unchanged but content might have changed - Should crawl {url}")
                        return True  # When in doubt, crawl!
                except ValueError:
                    pass

            print(f"✗ Content has changed - Should crawl {url}")
            return True

    except Exception as e:
        print(f"✗ Error checking {url}: {e}")
        return True  # On error, assume we should crawl


async def crawl_page(url: str) -> Dict[str, str]:
    """
    Simulate crawling a page and extracting cache headers.
    """
    print(f"\n🕷️ Crawling {url}...")

    async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
        response = await client.get(url)

        cache_data = {}
        h = response.headers

        if h.get("etag"):
            cache_data["etag"] = h["etag"]
            print(f" Stored ETag: {h['etag']}")

        if h.get("last-modified"):
            cache_data["last_modified"] = h["last-modified"]
            print(f" Stored Last-Modified: {h['last-modified']}")

        if h.get("content-digest"):
            cache_data["digest"] = h["content-digest"]
            print(f" Stored Content-Digest: {h['content-digest']}")

        if h.get("content-length"):
            cache_data["content_length"] = int(h["content-length"])
            print(f" Stored Content-Length: {h['content-length']}")

        print(f" Response size: {len(response.content)} bytes")
        return cache_data


async def test_static_site():
    """Test with a static website (example.com)"""
    print("=" * 60)
    print("Testing with static site: example.com")
    print("=" * 60)

    url = "https://example.com"

    # First crawl - always happens
    cache = await crawl_page(url)

    # Wait a bit
    await asyncio.sleep(2)

    # Second check - should not need to crawl
    print(f"\n📊 Checking if we need to re-crawl...")
    needs_crawl = await should_crawl(url, cache)

    if not needs_crawl:
        print("✅ Correctly identified: No need to re-crawl static content")
    else:
        print("❌ Unexpected: Static content flagged as changed")


async def test_dynamic_site():
    """Test with dynamic websites that change frequently"""
    print("\n" + "=" * 60)
    print("Testing with dynamic sites")
    print("=" * 60)

    # Test with a few dynamic sites
    dynamic_sites = [
        "https://api.github.com/",  # GitHub API root (changes with rate limit info)
        "https://worldtimeapi.org/api/timezone/UTC",  # Current time API
        "https://httpbin.org/uuid",  # Generates new UUID each request
    ]

    for url in dynamic_sites:
        print(f"\n🔄 Testing {url}")
        try:
            # First crawl
            cache = await crawl_page(url)

            # Wait a bit
            await asyncio.sleep(2)

            # Check if content changed
            print(f"\n📊 Checking if we need to re-crawl...")
            needs_crawl = await should_crawl(url, cache)

            if needs_crawl:
                print("✅ Correctly identified: Dynamic content has changed")
            else:
                print("⚠️ Note: Dynamic content appears unchanged (might have caching)")

        except Exception as e:
            print(f"❌ Error testing {url}: {e}")


async def test_conditional_get():
    """Test conditional GET fallback when HEAD doesn't provide enough info"""
    print("\n" + "=" * 60)
    print("Testing conditional GET scenario")
    print("=" * 60)

    url = "https://httpbin.org/etag/test-etag-123"

    # Simulate a scenario where we have an ETag
    cache = {"etag": '"test-etag-123"'}

    print(f"Testing with cached ETag: {cache['etag']}")
    needs_crawl = await should_crawl(url, cache)

    if not needs_crawl:
        print("✅ ETag matched - no crawl needed")
    else:
        print("✅ ETag didn't match - crawl needed")


async def main():
    """Run all tests"""
    print("🚀 Starting HEAD request change detection tests\n")

    await test_static_site()
    await test_dynamic_site()
    await test_conditional_get()

    print("\n✨ All tests completed!")


if __name__ == "__main__":
    asyncio.run(main())