Some debugging for caching

This commit is contained in:
unclecode
2025-12-21 04:45:52 +00:00
parent f6b29a8f9f
commit 48426f73f0
11 changed files with 1464 additions and 4 deletions

View File

@@ -1061,6 +1061,15 @@ class CrawlerRunConfig():
shared_data (dict or None): Shared data to be passed between hooks.
Default: None.
# Cache Validation Parameters (Smart Cache)
check_cache_freshness (bool): If True, validates cached content freshness using HTTP
conditional requests (ETag/Last-Modified) and head fingerprinting
before returning cached results. Avoids full browser crawls when
content hasn't changed. Only applies when cache_mode allows reads.
Default: False.
cache_validation_timeout (float): Timeout in seconds for cache validation HTTP requests.
Default: 10.0.
# Page Navigation and Timing Parameters
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
Default: "domcontentloaded".
@@ -1226,6 +1235,9 @@ class CrawlerRunConfig():
no_cache_read: bool = False,
no_cache_write: bool = False,
shared_data: dict = None,
# Cache Validation Parameters (Smart Cache)
check_cache_freshness: bool = False,
cache_validation_timeout: float = 10.0,
# Page Navigation and Timing Parameters
wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT,
@@ -1339,6 +1351,9 @@ class CrawlerRunConfig():
self.no_cache_read = no_cache_read
self.no_cache_write = no_cache_write
self.shared_data = shared_data
# Cache Validation (Smart Cache)
self.check_cache_freshness = check_cache_freshness
self.cache_validation_timeout = cache_validation_timeout
# Page Navigation and Timing Parameters
self.wait_until = wait_until

View File

@@ -1,10 +1,11 @@
import os
import time
from pathlib import Path
import aiosqlite
import asyncio
from typing import Optional, Dict
from contextlib import asynccontextmanager
import json
import json
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
import aiofiles
from .async_logger import AsyncLogger
@@ -262,6 +263,11 @@ class AsyncDatabaseManager:
"screenshot",
"response_headers",
"downloaded_files",
# Smart cache validation columns (added in 0.8.x)
"etag",
"last_modified",
"head_fingerprint",
"cached_at",
]
for column in new_columns:
@@ -275,6 +281,11 @@ class AsyncDatabaseManager:
await db.execute(
f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"'
)
elif new_column == "cached_at":
# Timestamp column for cache validation
await db.execute(
f"ALTER TABLE crawled_data ADD COLUMN {new_column} REAL DEFAULT 0"
)
else:
await db.execute(
f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
@@ -378,6 +389,92 @@ class AsyncDatabaseManager:
)
return None
async def aget_cache_metadata(self, url: str) -> Optional[Dict]:
"""
Retrieve only cache validation metadata for a URL (lightweight query).
Returns dict with: url, etag, last_modified, head_fingerprint, cached_at, response_headers
This is used for cache validation without loading full content.
"""
async def _get_metadata(db):
async with db.execute(
"""SELECT url, etag, last_modified, head_fingerprint, cached_at, response_headers
FROM crawled_data WHERE url = ?""",
(url,)
) as cursor:
row = await cursor.fetchone()
if not row:
return None
columns = [description[0] for description in cursor.description]
row_dict = dict(zip(columns, row))
# Parse response_headers JSON
try:
row_dict["response_headers"] = (
json.loads(row_dict["response_headers"])
if row_dict["response_headers"] else {}
)
except json.JSONDecodeError:
row_dict["response_headers"] = {}
return row_dict
try:
return await self.execute_with_retry(_get_metadata)
except Exception as e:
self.logger.error(
message="Error retrieving cache metadata: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)},
)
return None
async def aupdate_cache_metadata(
self,
url: str,
etag: Optional[str] = None,
last_modified: Optional[str] = None,
head_fingerprint: Optional[str] = None,
):
"""
Update only the cache validation metadata for a URL.
Used to update etag/last_modified after a successful validation.
"""
async def _update(db):
updates = []
values = []
if etag is not None:
updates.append("etag = ?")
values.append(etag)
if last_modified is not None:
updates.append("last_modified = ?")
values.append(last_modified)
if head_fingerprint is not None:
updates.append("head_fingerprint = ?")
values.append(head_fingerprint)
if not updates:
return
values.append(url)
await db.execute(
f"UPDATE crawled_data SET {', '.join(updates)} WHERE url = ?",
tuple(values)
)
try:
await self.execute_with_retry(_update)
except Exception as e:
self.logger.error(
message="Error updating cache metadata: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)},
)
async def acache_url(self, result: CrawlResult):
"""Cache CrawlResult data"""
# Store content files and get hashes
@@ -425,15 +522,24 @@ class AsyncDatabaseManager:
for field, (content, content_type) in content_map.items():
content_hashes[field] = await self._store_content(content, content_type)
# Extract cache validation headers from response
response_headers = result.response_headers or {}
etag = response_headers.get("etag") or response_headers.get("ETag") or ""
last_modified = response_headers.get("last-modified") or response_headers.get("Last-Modified") or ""
# head_fingerprint is set by caller via result attribute (if available)
head_fingerprint = getattr(result, "head_fingerprint", None) or ""
cached_at = time.time()
async def _cache(db):
await db.execute(
"""
INSERT INTO crawled_data (
url, html, cleaned_html, markdown,
extracted_content, success, media, links, metadata,
screenshot, response_headers, downloaded_files
screenshot, response_headers, downloaded_files,
etag, last_modified, head_fingerprint, cached_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
@@ -445,7 +551,11 @@ class AsyncDatabaseManager:
metadata = excluded.metadata,
screenshot = excluded.screenshot,
response_headers = excluded.response_headers,
downloaded_files = excluded.downloaded_files
downloaded_files = excluded.downloaded_files,
etag = excluded.etag,
last_modified = excluded.last_modified,
head_fingerprint = excluded.head_fingerprint,
cached_at = excluded.cached_at
""",
(
result.url,
@@ -460,6 +570,10 @@ class AsyncDatabaseManager:
content_hashes["screenshot"],
json.dumps(result.response_headers or {}),
json.dumps(result.downloaded_files or []),
etag,
last_modified,
head_fingerprint,
cached_at,
),
)

View File

@@ -47,7 +47,9 @@ from .utils import (
get_error_context,
RobotsParser,
preprocess_html_for_schema,
compute_head_fingerprint,
)
from .cache_validator import CacheValidator, CacheValidationResult
class AsyncWebCrawler:
@@ -267,6 +269,51 @@ class AsyncWebCrawler:
if cache_context.should_read():
cached_result = await async_db_manager.aget_cached_url(url)
# Smart Cache: Validate cache freshness if enabled
if cached_result and config.check_cache_freshness:
cache_metadata = await async_db_manager.aget_cache_metadata(url)
if cache_metadata:
async with CacheValidator(timeout=config.cache_validation_timeout) as validator:
validation = await validator.validate(
url=url,
stored_etag=cache_metadata.get("etag"),
stored_last_modified=cache_metadata.get("last_modified"),
stored_head_fingerprint=cache_metadata.get("head_fingerprint"),
)
if validation.status == CacheValidationResult.FRESH:
cached_result.cache_status = "hit_validated"
self.logger.info(
message="Cache validated: {reason}",
tag="CACHE",
params={"reason": validation.reason}
)
# Update metadata if we got new values
if validation.new_etag or validation.new_last_modified:
await async_db_manager.aupdate_cache_metadata(
url=url,
etag=validation.new_etag,
last_modified=validation.new_last_modified,
head_fingerprint=validation.new_head_fingerprint,
)
elif validation.status == CacheValidationResult.ERROR:
cached_result.cache_status = "hit_fallback"
self.logger.warning(
message="Cache validation failed, using cached: {reason}",
tag="CACHE",
params={"reason": validation.reason}
)
else:
# STALE or UNKNOWN - force recrawl
self.logger.info(
message="Cache stale: {reason}",
tag="CACHE",
params={"reason": validation.reason}
)
cached_result = None
elif cached_result:
cached_result.cache_status = "hit"
if cached_result:
html = sanitize_input_encode(cached_result.html)
extracted_content = sanitize_input_encode(
@@ -383,6 +430,14 @@ class AsyncWebCrawler:
crawl_result.success = bool(html)
crawl_result.session_id = getattr(
config, "session_id", None)
crawl_result.cache_status = "miss"
# Compute head fingerprint for cache validation
if html:
head_end = html.lower().find('</head>')
if head_end != -1:
head_html = html[:head_end + 7]
crawl_result.head_fingerprint = compute_head_fingerprint(head_html)
self.logger.url_status(
url=cache_context.display_url,

270
crawl4ai/cache_validator.py Normal file
View File

@@ -0,0 +1,270 @@
"""
Cache validation using HTTP conditional requests and head fingerprinting.
Uses httpx for fast, lightweight HTTP requests (no browser needed).
This module enables smart cache validation to avoid unnecessary full browser crawls
when content hasn't changed.
Validation Strategy:
1. Send HEAD request with If-None-Match / If-Modified-Since headers
2. If server returns 304 Not Modified → cache is FRESH
3. If server returns 200 → fetch <head> and compare fingerprint
4. If fingerprint matches → cache is FRESH (minor changes only)
5. Otherwise → cache is STALE, need full recrawl
"""
import httpx
from dataclasses import dataclass
from typing import Optional, Tuple
from enum import Enum
from .utils import compute_head_fingerprint
class CacheValidationResult(Enum):
"""Result of cache validation check."""
FRESH = "fresh" # Content unchanged, use cache
STALE = "stale" # Content changed, need recrawl
UNKNOWN = "unknown" # Couldn't determine, need recrawl
ERROR = "error" # Request failed, use cache as fallback
@dataclass
class ValidationResult:
"""Detailed result of a cache validation attempt."""
status: CacheValidationResult
new_etag: Optional[str] = None
new_last_modified: Optional[str] = None
new_head_fingerprint: Optional[str] = None
reason: str = ""
class CacheValidator:
"""
Validates cache freshness using lightweight HTTP requests.
This validator uses httpx to make fast HTTP requests without needing
a full browser. It supports two validation methods:
1. HTTP Conditional Requests (Layer 3):
- Uses If-None-Match with stored ETag
- Uses If-Modified-Since with stored Last-Modified
- Server returns 304 if content unchanged
2. Head Fingerprinting (Layer 4):
- Fetches only the <head> section (~5KB)
- Compares fingerprint of key meta tags
- Catches changes even without server support for conditional requests
"""
def __init__(self, timeout: float = 10.0, user_agent: Optional[str] = None):
"""
Initialize the cache validator.
Args:
timeout: Request timeout in seconds
user_agent: Custom User-Agent string (optional)
"""
self.timeout = timeout
self.user_agent = user_agent or "Mozilla/5.0 (compatible; Crawl4AI/1.0)"
self._client: Optional[httpx.AsyncClient] = None
async def _get_client(self) -> httpx.AsyncClient:
"""Get or create the httpx client."""
if self._client is None:
self._client = httpx.AsyncClient(
http2=True,
timeout=self.timeout,
follow_redirects=True,
headers={"User-Agent": self.user_agent}
)
return self._client
async def validate(
self,
url: str,
stored_etag: Optional[str] = None,
stored_last_modified: Optional[str] = None,
stored_head_fingerprint: Optional[str] = None,
) -> ValidationResult:
"""
Validate if cached content is still fresh.
Args:
url: The URL to validate
stored_etag: Previously stored ETag header value
stored_last_modified: Previously stored Last-Modified header value
stored_head_fingerprint: Previously computed head fingerprint
Returns:
ValidationResult with status and any updated metadata
"""
client = await self._get_client()
# Build conditional request headers
headers = {}
if stored_etag:
headers["If-None-Match"] = stored_etag
if stored_last_modified:
headers["If-Modified-Since"] = stored_last_modified
try:
# Step 1: Try HEAD request with conditional headers
if headers:
response = await client.head(url, headers=headers)
if response.status_code == 304:
return ValidationResult(
status=CacheValidationResult.FRESH,
reason="Server returned 304 Not Modified"
)
# Got 200, extract new headers for potential update
new_etag = response.headers.get("etag")
new_last_modified = response.headers.get("last-modified")
# If we have fingerprint, compare it
if stored_head_fingerprint:
head_html, _, _ = await self._fetch_head(url)
if head_html:
new_fingerprint = compute_head_fingerprint(head_html)
if new_fingerprint and new_fingerprint == stored_head_fingerprint:
return ValidationResult(
status=CacheValidationResult.FRESH,
new_etag=new_etag,
new_last_modified=new_last_modified,
new_head_fingerprint=new_fingerprint,
reason="Head fingerprint matches"
)
elif new_fingerprint:
return ValidationResult(
status=CacheValidationResult.STALE,
new_etag=new_etag,
new_last_modified=new_last_modified,
new_head_fingerprint=new_fingerprint,
reason="Head fingerprint changed"
)
# Headers changed and no fingerprint match
return ValidationResult(
status=CacheValidationResult.STALE,
new_etag=new_etag,
new_last_modified=new_last_modified,
reason="Server returned 200, content may have changed"
)
# Step 2: No conditional headers available, try fingerprint only
if stored_head_fingerprint:
head_html, new_etag, new_last_modified = await self._fetch_head(url)
if head_html:
new_fingerprint = compute_head_fingerprint(head_html)
if new_fingerprint and new_fingerprint == stored_head_fingerprint:
return ValidationResult(
status=CacheValidationResult.FRESH,
new_etag=new_etag,
new_last_modified=new_last_modified,
new_head_fingerprint=new_fingerprint,
reason="Head fingerprint matches"
)
elif new_fingerprint:
return ValidationResult(
status=CacheValidationResult.STALE,
new_etag=new_etag,
new_last_modified=new_last_modified,
new_head_fingerprint=new_fingerprint,
reason="Head fingerprint changed"
)
# Step 3: No validation data available
return ValidationResult(
status=CacheValidationResult.UNKNOWN,
reason="No validation data available (no etag, last-modified, or fingerprint)"
)
except httpx.TimeoutException:
return ValidationResult(
status=CacheValidationResult.ERROR,
reason="Validation request timed out"
)
except httpx.RequestError as e:
return ValidationResult(
status=CacheValidationResult.ERROR,
reason=f"Validation request failed: {type(e).__name__}"
)
except Exception as e:
# On unexpected error, prefer using cache over failing
return ValidationResult(
status=CacheValidationResult.ERROR,
reason=f"Validation error: {str(e)}"
)
async def _fetch_head(self, url: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""
Fetch only the <head> section of a page.
Uses streaming to stop reading after </head> is found,
minimizing bandwidth usage.
Args:
url: The URL to fetch
Returns:
Tuple of (head_html, etag, last_modified)
"""
client = await self._get_client()
try:
async with client.stream(
"GET",
url,
headers={"Accept-Encoding": "identity"} # Disable compression for easier parsing
) as response:
etag = response.headers.get("etag")
last_modified = response.headers.get("last-modified")
if response.status_code != 200:
return None, etag, last_modified
# Read until </head> or max 64KB
chunks = []
total_bytes = 0
max_bytes = 65536
async for chunk in response.aiter_bytes(4096):
chunks.append(chunk)
total_bytes += len(chunk)
content = b''.join(chunks)
# Check for </head> (case insensitive)
if b'</head>' in content.lower() or b'</HEAD>' in content:
break
if total_bytes >= max_bytes:
break
html = content.decode('utf-8', errors='replace')
# Extract just the head section
head_end = html.lower().find('</head>')
if head_end != -1:
html = html[:head_end + 7]
return html, etag, last_modified
except Exception:
return None, None, None
async def close(self):
"""Close the HTTP client and release resources."""
if self._client:
await self._client.aclose()
self._client = None
async def __aenter__(self):
"""Async context manager entry."""
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
await self.close()

View File

@@ -152,6 +152,10 @@ class CrawlResult(BaseModel):
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
# Cache validation metadata (Smart Cache)
head_fingerprint: Optional[str] = None
cached_at: Optional[float] = None
cache_status: Optional[str] = None # "hit", "hit_validated", "hit_fallback", "miss"
model_config = ConfigDict(arbitrary_types_allowed=True)

View File

@@ -2828,6 +2828,67 @@ def generate_content_hash(content: str) -> str:
# return hashlib.sha256(content.encode()).hexdigest()
def compute_head_fingerprint(head_html: str) -> str:
"""
Compute a fingerprint of <head> content for cache validation.
Focuses on content that typically changes when page updates:
- <title>
- <meta name="description">
- <meta property="og:title|og:description|og:image|og:updated_time">
- <meta property="article:modified_time">
- <meta name="last-modified">
Uses xxhash for speed, combines multiple signals into a single hash.
Args:
head_html: The HTML content of the <head> section
Returns:
A hex string fingerprint, or empty string if no signals found
"""
if not head_html:
return ""
head_lower = head_html.lower()
signals = []
# Extract title
title_match = re.search(r'<title[^>]*>(.*?)</title>', head_lower, re.DOTALL)
if title_match:
signals.append(title_match.group(1).strip())
# Meta tags to extract (name or property attribute, and the value to match)
meta_tags = [
("name", "description"),
("name", "last-modified"),
("property", "og:title"),
("property", "og:description"),
("property", "og:image"),
("property", "og:updated_time"),
("property", "article:modified_time"),
]
for attr_type, attr_value in meta_tags:
# Handle both attribute orders: attr="value" content="..." and content="..." attr="value"
patterns = [
rf'<meta[^>]*{attr_type}=["\']{ re.escape(attr_value)}["\'][^>]*content=["\']([^"\']*)["\']',
rf'<meta[^>]*content=["\']([^"\']*)["\'][^>]*{attr_type}=["\']{re.escape(attr_value)}["\']',
]
for pattern in patterns:
match = re.search(pattern, head_lower)
if match:
signals.append(match.group(1).strip())
break # Found this tag, move to next
if not signals:
return ""
# Combine signals and hash
combined = '|'.join(signals)
return xxhash.xxh64(combined.encode()).hexdigest()
def ensure_content_dirs(base_path: str) -> Dict[str, str]:
"""Create content directories if they don't exist"""
dirs = {