crawl4ai/tests/validity/test_head_with_real_changes.py

import asyncio
import httpx
import email.utils
from datetime import datetime
import json
from typing import Dict, Optional
import time


async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
        "User-Agent": "Mozilla/5.0 (compatible; crawl4ai/1.0)"
    }

    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

        print(f"\nHEAD Response Status: {response.status_code}")
        print(f"Headers received: {dict(response.headers)}")

        # 304 Not Modified
        if response.status_code == 304:
            return False

        h = response.headers

        # Check headers in order of reliability
        if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
            return False

        if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
            return False

        if h.get("last-modified") and cache.get("last_modified"):
            try:
                lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                if lm_new <= lm_old:
                    return False
            except:
                pass

        # Check Content-Length (weakest signal - only as a hint, not definitive)
        # Note: Same content length doesn't mean same content!
        if h.get("content-length") and cache.get("content_length"):
            try:
                if int(h["content-length"]) != cache.get("content_length"):
                    return True  # Length changed, likely content changed
                # If length is same, we can't be sure - default to crawling
            except:
                pass

        return True

    except Exception as e:
        print(f"Error during HEAD request: {e}")
        return True


async def test_with_changing_content():
    """Test with a real changing website"""
    print("=" * 60)
    print("Testing with real changing content")
    print("=" * 60)

    # Using httpbin's cache endpoint that changes after specified seconds
    url = "https://httpbin.org/cache/1"  # Cache for 1 second

    print(f"\n1️⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url)
        cache = {}
        if response1.headers.get("etag"):
            cache["etag"] = response1.headers["etag"]
        if response1.headers.get("last-modified"):
            cache["last_modified"] = response1.headers["last-modified"]
        print(f"Cached ETag: {cache.get('etag', 'None')}")
        print(f"Cached Last-Modified: {cache.get('last_modified', 'None')}")

    # Check immediately (should not need crawl)
    print(f"\n2️⃣ Checking immediately after first request...")
    needs_crawl = await should_crawl(url, cache)
    print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")

    # Wait for cache to expire
    print(f"\n⏳ Waiting 2 seconds for cache to expire...")
    await asyncio.sleep(2)

    # Check again (should need crawl now)
    print(f"\n3️⃣ Checking after cache expiry...")
    needs_crawl = await should_crawl(url, cache)
    print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")


async def test_news_website():
    """Test with a news website that updates frequently"""
    print("\n" + "=" * 60)
    print("Testing with news website (BBC)")
    print("=" * 60)

    url = "https://www.bbc.com"

    print(f"\n1️⃣ First crawl of {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url)
        cache = {}
        h = response1.headers

        if h.get("etag"):
            cache["etag"] = h["etag"]
            print(f"Stored ETag: {h['etag'][:50]}...")
        if h.get("last-modified"):
            cache["last_modified"] = h["last-modified"]
            print(f"Stored Last-Modified: {h['last-modified']}")
        if h.get("content-length"):
            cache["content_length"] = int(h["content-length"])
            print(f"Stored Content-Length: {h['content-length']}")

    # Check multiple times
    for i in range(3):
        await asyncio.sleep(5)
        print(f"\n📊 Check #{i+2} - {datetime.now().strftime('%H:%M:%S')}")
        needs_crawl = await should_crawl(url, cache)
        print(f"Result: {'NEED TO CRAWL ✓' if needs_crawl else 'NO NEED TO CRAWL ✗'}")


async def test_api_endpoint():
    """Test with an API that provides proper caching headers"""
    print("\n" + "=" * 60)
    print("Testing with GitHub API")
    print("=" * 60)

    # GitHub user API (updates when user data changes)
    url = "https://api.github.com/users/github"

    headers = {"User-Agent": "crawl4ai-test"}

    print(f"\n1️⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url, headers=headers)
        cache = {}
        h = response1.headers

        if h.get("etag"):
            cache["etag"] = h["etag"]
            print(f"Stored ETag: {h['etag']}")
        if h.get("last-modified"):
            cache["last_modified"] = h["last-modified"]
            print(f"Stored Last-Modified: {h['last-modified']}")

        # Print rate limit info
        print(f"Rate Limit Remaining: {h.get('x-ratelimit-remaining', 'N/A')}")

    # Check if content changed
    print(f"\n2️⃣ Checking if content changed...")
    needs_crawl = await should_crawl(url, cache)
    print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL (content unchanged)'}")


async def main():
    """Run all tests"""
    print("🚀 Testing HEAD request change detection with real websites\n")

    await test_with_changing_content()
    await test_news_website()
    await test_api_endpoint()

    print("\n✨ All tests completed!")


if __name__ == "__main__":
    asyncio.run(main())