Files
crawl4ai/tests/validity/test_head_with_real_changes.py
UncleCode d1de82a332 feat(crawl4ai): Implement SMART cache mode
This commit introduces a new cache mode, SMART, to the crawl4ai library. The SMART mode intelligently validates cached content using HEAD requests before using it, saving significant bandwidth while ensuring fresh content. The changes include modifications to the async_webcrawler.py, cache_context.py, and utils.py files in the crawl4ai directory. The async_webcrawler.py file now includes a check for the SMART cache mode and performs a HEAD check to see if the content has changed. If the content has changed, the url is re-crawled; otherwise, the cached result is used. The cache_context.py and utils.py files have been updated to support these changes.

The documentation has also been updated to reflect these changes. The cache-modes.md file now includes a detailed explanation of the SMART mode, its logs, limitations, and an advanced example. The examples.md file now includes a link to the SMART Cache Mode example. The quickstart.md file now mentions the SMART mode in the note about cache modes.

These changes improve the efficiency of the crawl4ai library by reducing unnecessary re-crawling and bandwidth usage.

BREAKING CHANGE: The introduction of the SMART cache mode may affect existing code that uses the crawl4ai library and does not expect this new mode. Users should review the updated documentation to understand how to use this new mode.
2025-07-21 21:19:37 +08:00

186 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import asyncio
import httpx
import email.utils
from datetime import datetime
import json
from typing import Dict, Optional
import time
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Decide whether a URL needs to be re-crawled, using a conditional HEAD request.

    The cached validators (if any) are sent as ``If-None-Match`` /
    ``If-Modified-Since`` and the response is then inspected for evidence
    that the content is unchanged.

    Args:
        url: Address to probe with a HEAD request.
        cache: Validators remembered from an earlier fetch. The keys read
            here are ``"etag"``, ``"last_modified"`` and ``"digest"``; all
            are optional. ``None`` is treated as an empty cache.

    Returns:
        ``False`` when the server answers 304 or one of its validators
        matches the cached value; ``True`` in every other case, including
        when the request itself fails (fail-open: when in doubt, crawl).
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
        "User-Agent": "Mozilla/5.0 (compatible; crawl4ai/1.0)",
    }
    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

        print(f"\nHEAD Response Status: {response.status_code}")
        print(f"Headers received: {dict(response.headers)}")

        # 304 Not Modified
        if response.status_code == 304:
            return False

        h = response.headers

        # Check headers in order of reliability
        digest = h.get("content-digest")
        if digest and digest == cache.get("digest"):
            return False

        # Only ETags that start with a double quote are compared, so a
        # W/"..." (weak) tag is never taken as proof of unchanged content.
        etag = h.get("etag")
        if etag and etag.startswith('"') and etag == cache.get("etag"):
            return False

        if h.get("last-modified") and cache.get("last_modified"):
            try:
                lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                if lm_new <= lm_old:
                    return False
            except (TypeError, ValueError):
                # Unparseable date, or a naive/aware mix that cannot be
                # ordered: ignore this signal. Deliberately NOT a bare
                # ``except`` so task cancellation is never swallowed here.
                pass

        # Content-Length is intentionally not consulted: an equal length does
        # not prove the content is unchanged, and a different length leads to
        # the same answer as the default below.
        return True
    except Exception as e:
        # Best-effort probe: any failure means "cannot tell", so crawl.
        print(f"Error during HEAD request: {e}")
        return True
async def test_with_changing_content():
    """Probe an httpbin endpoint whose cache lifetime is one second.

    Fetches the URL once to collect validators, then asks ``should_crawl``
    right away and again after a two-second pause, printing each verdict.
    """
    rule = "=" * 60
    print(rule)
    print("Testing with real changing content")
    print(rule)

    # httpbin's cache endpoint; the trailing number is the cache lifetime in seconds.
    url = "https://httpbin.org/cache/1"

    print(f"\n1⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        first = await client.get(url)

    # Remember whichever validators the server supplied.
    cache = {}
    for cache_key, header_name in (("etag", "etag"), ("last_modified", "last-modified")):
        if first.headers.get(header_name):
            cache[cache_key] = first.headers[header_name]

    print(f"Cached ETag: {cache.get('etag', 'None')}")
    print(f"Cached Last-Modified: {cache.get('last_modified', 'None')}")

    # Immediate re-check: a crawl should not be required yet.
    print("\n2⃣ Checking immediately after first request...")
    needs_crawl = await should_crawl(url, cache)
    verdict = 'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'
    print(f"Result: {verdict}")

    # Let the one-second cache lifetime run out.
    print("\n⏳ Waiting 2 seconds for cache to expire...")
    await asyncio.sleep(2)

    # Second re-check: a crawl is expected to be required now.
    print("\n3⃣ Checking after cache expiry...")
    needs_crawl = await should_crawl(url, cache)
    verdict = 'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'
    print(f"Result: {verdict}")
async def test_news_website():
    """Poll the BBC front page three times, five seconds apart.

    The first GET records the ETag, Last-Modified and Content-Length the
    server sends; each later round asks ``should_crawl`` for a verdict and
    prints it with a timestamp.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with news website (BBC)")
    print(rule)

    url = "https://www.bbc.com"

    print(f"\n1⃣ First crawl of {url}")
    async with httpx.AsyncClient() as client:
        first = await client.get(url)

    received = first.headers
    cache = {}

    etag = received.get("etag")
    if etag:
        cache["etag"] = etag
        print(f"Stored ETag: {etag[:50]}...")

    last_modified = received.get("last-modified")
    if last_modified:
        cache["last_modified"] = last_modified
        print(f"Stored Last-Modified: {last_modified}")

    content_length = received.get("content-length")
    if content_length:
        cache["content_length"] = int(content_length)
        print(f"Stored Content-Length: {content_length}")

    # Checks are numbered from 2 because the initial crawl counts as #1.
    for check_number in range(2, 5):
        await asyncio.sleep(5)
        stamp = datetime.now().strftime('%H:%M:%S')
        print(f"\n📊 Check #{check_number} - {stamp}")
        needs_crawl = await should_crawl(url, cache)
        verdict = 'NEED TO CRAWL ✓' if needs_crawl else 'NO NEED TO CRAWL ✗'
        print(f"Result: {verdict}")
async def test_api_endpoint():
    """Run the change check against the GitHub user API.

    Performs one GET with a custom User-Agent, stores the ETag and
    Last-Modified validators that come back, prints the remaining rate
    limit, then asks ``should_crawl`` whether a re-fetch is needed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with GitHub API")
    print(rule)

    # GitHub user API endpoint for the "github" account.
    url = "https://api.github.com/users/github"
    request_headers = {"User-Agent": "crawl4ai-test"}

    print(f"\n1⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        first = await client.get(url, headers=request_headers)

    received = first.headers
    cache = {}

    etag = received.get("etag")
    if etag:
        cache["etag"] = etag
        print(f"Stored ETag: {etag}")

    last_modified = received.get("last-modified")
    if last_modified:
        cache["last_modified"] = last_modified
        print(f"Stored Last-Modified: {last_modified}")

    # Print rate limit info
    remaining = received.get('x-ratelimit-remaining', 'N/A')
    print(f"Rate Limit Remaining: {remaining}")

    print("\n2⃣ Checking if content changed...")
    needs_crawl = await should_crawl(url, cache)
    verdict = 'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL (content unchanged)'
    print(f"Result: {verdict}")
async def main():
    """Execute each demo scenario in turn, framed by start and end banners."""
    print("🚀 Testing HEAD request change detection with real websites\n")

    # Scenarios run strictly one after another, in this order.
    for scenario in (test_with_changing_content, test_news_website, test_api_endpoint):
        await scenario()

    print("\n✨ All tests completed!")
# Script entry point: drive the async demo only when run directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())