This commit introduces a new cache mode, SMART, to the crawl4ai library. SMART mode validates cached content with a HEAD request before reusing it, saving significant bandwidth while still ensuring fresh content. The change touches async_webcrawler.py, cache_context.py, and utils.py in the crawl4ai directory: async_webcrawler.py now recognizes the SMART cache mode and performs a HEAD check to see whether the content has changed; if it has, the URL is re-crawled, otherwise the cached result is used. cache_context.py and utils.py carry the supporting changes. The documentation is updated accordingly: cache-modes.md gains a detailed explanation of SMART mode, its logs, limitations, and an advanced example; examples.md links to the SMART Cache Mode example; and quickstart.md mentions SMART in its note about cache modes. Together these changes reduce unnecessary re-crawling and bandwidth usage. BREAKING CHANGE: existing code that uses the crawl4ai library and does not expect this new mode may be affected; users should review the updated documentation to understand how to use it.
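For orientation, here is a minimal sketch of how the new mode would be selected from user code. It assumes the mode is exposed as CacheMode.SMART on the existing CacheMode enum and passed through CrawlerRunConfig like the other cache modes; see the updated cache-modes.md for the exact usage.

import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig


async def demo():
    # Assumed usage: CacheMode.SMART is the mode introduced by this commit.
    config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

    async with AsyncWebCrawler() as crawler:
        # First run crawls and caches the page as usual.
        first = await crawler.arun(url="https://example.com", config=config)

        # A later run sends a HEAD request; if the validators (ETag,
        # Last-Modified, Content-Digest) indicate the page is unchanged,
        # the cached result is returned instead of re-crawling.
        second = await crawler.arun(url="https://example.com", config=config)
        print(first.success, second.success)


if __name__ == "__main__":
    asyncio.run(demo())

The standalone script below exercises the same HEAD-based change detection directly with httpx, outside the crawler, against static and dynamic test sites.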
import asyncio
import email.utils
import json
import time
from datetime import datetime
from typing import Dict, Optional

import httpx


async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    Args:
        url: The URL to check
        cache: Previous cache data containing etag, last_modified, digest, content_length

    Returns:
        True if the page has changed and should be crawled, False otherwise
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
    }

    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

            # 304 Not Modified - content hasn't changed
            if response.status_code == 304:
                print(f"✓ 304 Not Modified - No need to crawl {url}")
                return False

            h = response.headers

            # Check Content-Digest (most reliable)
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                print(f"✓ Content-Digest matches - No need to crawl {url}")
                return False

            # Check strong ETag
            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                print(f"✓ Strong ETag matches - No need to crawl {url}")
                return False

            # Check Last-Modified
            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    if lm_new <= lm_old:
                        print(f"✓ Last-Modified not newer - No need to crawl {url}")
                        return False
                except (TypeError, ValueError):
                    pass

            # Check Content-Length (weakest signal - only as a hint, not definitive)
            # Note: Same content length doesn't mean same content!
            # This should only be used when no other signals are available
            if h.get("content-length") and cache.get("content_length"):
                try:
                    if int(h["content-length"]) != cache.get("content_length"):
                        print(f"✗ Content-Length changed - Should crawl {url}")
                        return True
                    else:
                        print(f"⚠️ Content-Length unchanged but content might have changed - Should crawl {url}")
                        return True  # When in doubt, crawl!
                except ValueError:
                    pass

            print(f"✗ Content has changed - Should crawl {url}")
            return True

    except Exception as e:
        print(f"✗ Error checking {url}: {e}")
        return True  # On error, assume we should crawl


async def crawl_page(url: str) -> Dict[str, str]:
    """
    Simulate crawling a page and extracting cache headers.
    """
    print(f"\n🕷️ Crawling {url}...")

    async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
        response = await client.get(url)

        cache_data = {}
        h = response.headers

        if h.get("etag"):
            cache_data["etag"] = h["etag"]
            print(f" Stored ETag: {h['etag']}")

        if h.get("last-modified"):
            cache_data["last_modified"] = h["last-modified"]
            print(f" Stored Last-Modified: {h['last-modified']}")

        if h.get("content-digest"):
            cache_data["digest"] = h["content-digest"]
            print(f" Stored Content-Digest: {h['content-digest']}")

        if h.get("content-length"):
            cache_data["content_length"] = int(h["content-length"])
            print(f" Stored Content-Length: {h['content-length']}")

        print(f" Response size: {len(response.content)} bytes")
        return cache_data


async def test_static_site():
    """Test with a static website (example.com)"""
    print("=" * 60)
    print("Testing with static site: example.com")
    print("=" * 60)

    url = "https://example.com"

    # First crawl - always happens
    cache = await crawl_page(url)

    # Wait a bit
    await asyncio.sleep(2)

    # Second check - should not need to crawl
    print(f"\n📊 Checking if we need to re-crawl...")
    needs_crawl = await should_crawl(url, cache)

    if not needs_crawl:
        print("✅ Correctly identified: No need to re-crawl static content")
    else:
        print("❌ Unexpected: Static content flagged as changed")


async def test_dynamic_site():
    """Test with dynamic websites that change frequently"""
    print("\n" + "=" * 60)
    print("Testing with dynamic sites")
    print("=" * 60)

    # Test with a few dynamic sites
    dynamic_sites = [
        "https://api.github.com/",  # GitHub API root (changes with rate limit info)
        "https://worldtimeapi.org/api/timezone/UTC",  # Current time API
        "https://httpbin.org/uuid",  # Generates new UUID each request
    ]

    for url in dynamic_sites:
        print(f"\n🔄 Testing {url}")
        try:
            # First crawl
            cache = await crawl_page(url)

            # Wait a bit
            await asyncio.sleep(2)

            # Check if content changed
            print(f"\n📊 Checking if we need to re-crawl...")
            needs_crawl = await should_crawl(url, cache)

            if needs_crawl:
                print("✅ Correctly identified: Dynamic content has changed")
            else:
                print("⚠️ Note: Dynamic content appears unchanged (might have caching)")

        except Exception as e:
            print(f"❌ Error testing {url}: {e}")


async def test_conditional_get():
    """Test conditional GET fallback when HEAD doesn't provide enough info"""
    print("\n" + "=" * 60)
    print("Testing conditional GET scenario")
    print("=" * 60)

    url = "https://httpbin.org/etag/test-etag-123"

    # Simulate a scenario where we have an ETag
    cache = {"etag": '"test-etag-123"'}

    print(f"Testing with cached ETag: {cache['etag']}")
    needs_crawl = await should_crawl(url, cache)

    if not needs_crawl:
        print("✅ ETag matched - no crawl needed")
    else:
        print("✅ ETag didn't match - crawl needed")


async def main():
    """Run all tests"""
    print("🚀 Starting HEAD request change detection tests\n")

    await test_static_site()
    await test_dynamic_site()
    await test_conditional_get()

    print("\n✨ All tests completed!")


if __name__ == "__main__":
    asyncio.run(main())