Some debugging for caching

2025-12-21 04:45:52 +00:00
parent f6b29a8f9f
commit 48426f73f0
11 changed files with 1464 additions and 4 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1061,6 +1061,15 @@ class CrawlerRunConfig():
        shared_data (dict or None): Shared data to be passed between hooks.
                                     Default: None.

+        # Cache Validation Parameters (Smart Cache)
+        check_cache_freshness (bool): If True, validates cached content freshness using HTTP
+                                      conditional requests (ETag/Last-Modified) and head fingerprinting
+                                      before returning cached results. Avoids full browser crawls when
+                                      content hasn't changed. Only applies when cache_mode allows reads.
+                                      Default: False.
+        cache_validation_timeout (float): Timeout in seconds for cache validation HTTP requests.
+                                          Default: 10.0.
+
        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
                          Default: "domcontentloaded".
@@ -1226,6 +1235,9 @@ class CrawlerRunConfig():
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        shared_data: dict = None,
+        # Cache Validation Parameters (Smart Cache)
+        check_cache_freshness: bool = False,
+        cache_validation_timeout: float = 10.0,
        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
@@ -1339,6 +1351,9 @@ class CrawlerRunConfig():
        self.no_cache_read = no_cache_read
        self.no_cache_write = no_cache_write
        self.shared_data = shared_data
+        # Cache Validation (Smart Cache)
+        self.check_cache_freshness = check_cache_freshness
+        self.cache_validation_timeout = cache_validation_timeout

        # Page Navigation and Timing Parameters
        self.wait_until = wait_until
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -1,10 +1,11 @@
 import os
+import time
 from pathlib import Path
 import aiosqlite
 import asyncio
 from typing import Optional, Dict
 from contextlib import asynccontextmanager
-import json  
+import json
 from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
 import aiofiles
 from .async_logger import AsyncLogger
@@ -262,6 +263,11 @@ class AsyncDatabaseManager:
                "screenshot",
                "response_headers",
                "downloaded_files",
+                # Smart cache validation columns (added in 0.8.x)
+                "etag",
+                "last_modified",
+                "head_fingerprint",
+                "cached_at",
            ]

            for column in new_columns:
@@ -275,6 +281,11 @@ class AsyncDatabaseManager:
            await db.execute(
                f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"'
            )
+        elif new_column == "cached_at":
+            # Timestamp column for cache validation
+            await db.execute(
+                f"ALTER TABLE crawled_data ADD COLUMN {new_column} REAL DEFAULT 0"
+            )
        else:
            await db.execute(
                f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
@@ -378,6 +389,92 @@ class AsyncDatabaseManager:
            )
            return None

+    async def aget_cache_metadata(self, url: str) -> Optional[Dict]:
+        """
+        Retrieve only cache validation metadata for a URL (lightweight query).
+
+        Returns dict with: url, etag, last_modified, head_fingerprint, cached_at, response_headers
+        This is used for cache validation without loading full content.
+        """
+        async def _get_metadata(db):
+            async with db.execute(
+                """SELECT url, etag, last_modified, head_fingerprint, cached_at, response_headers
+                   FROM crawled_data WHERE url = ?""",
+                (url,)
+            ) as cursor:
+                row = await cursor.fetchone()
+                if not row:
+                    return None
+
+                columns = [description[0] for description in cursor.description]
+                row_dict = dict(zip(columns, row))
+
+                # Parse response_headers JSON
+                try:
+                    row_dict["response_headers"] = (
+                        json.loads(row_dict["response_headers"])
+                        if row_dict["response_headers"] else {}
+                    )
+                except json.JSONDecodeError:
+                    row_dict["response_headers"] = {}
+
+                return row_dict
+
+        try:
+            return await self.execute_with_retry(_get_metadata)
+        except Exception as e:
+            self.logger.error(
+                message="Error retrieving cache metadata: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)},
+            )
+            return None
+
+    async def aupdate_cache_metadata(
+        self,
+        url: str,
+        etag: Optional[str] = None,
+        last_modified: Optional[str] = None,
+        head_fingerprint: Optional[str] = None,
+    ):
+        """
+        Update only the cache validation metadata for a URL.
+        Used to update etag/last_modified after a successful validation.
+        """
+        async def _update(db):
+            updates = []
+            values = []
+
+            if etag is not None:
+                updates.append("etag = ?")
+                values.append(etag)
+            if last_modified is not None:
+                updates.append("last_modified = ?")
+                values.append(last_modified)
+            if head_fingerprint is not None:
+                updates.append("head_fingerprint = ?")
+                values.append(head_fingerprint)
+
+            if not updates:
+                return
+
+            values.append(url)
+            await db.execute(
+                f"UPDATE crawled_data SET {', '.join(updates)} WHERE url = ?",
+                tuple(values)
+            )
+
+        try:
+            await self.execute_with_retry(_update)
+        except Exception as e:
+            self.logger.error(
+                message="Error updating cache metadata: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)},
+            )
+
    async def acache_url(self, result: CrawlResult):
        """Cache CrawlResult data"""
        # Store content files and get hashes
@@ -425,15 +522,24 @@ class AsyncDatabaseManager:
        for field, (content, content_type) in content_map.items():
            content_hashes[field] = await self._store_content(content, content_type)

+        # Extract cache validation headers from response
+        response_headers = result.response_headers or {}
+        etag = response_headers.get("etag") or response_headers.get("ETag") or ""
+        last_modified = response_headers.get("last-modified") or response_headers.get("Last-Modified") or ""
+        # head_fingerprint is set by caller via result attribute (if available)
+        head_fingerprint = getattr(result, "head_fingerprint", None) or ""
+        cached_at = time.time()
+
        async def _cache(db):
            await db.execute(
                """
                INSERT INTO crawled_data (
                    url, html, cleaned_html, markdown,
                    extracted_content, success, media, links, metadata,
-                    screenshot, response_headers, downloaded_files
+                    screenshot, response_headers, downloaded_files,
+                    etag, last_modified, head_fingerprint, cached_at
                )
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    html = excluded.html,
                    cleaned_html = excluded.cleaned_html,
@@ -445,7 +551,11 @@ class AsyncDatabaseManager:
                    metadata = excluded.metadata,
                    screenshot = excluded.screenshot,
                    response_headers = excluded.response_headers,
-                    downloaded_files = excluded.downloaded_files
+                    downloaded_files = excluded.downloaded_files,
+                    etag = excluded.etag,
+                    last_modified = excluded.last_modified,
+                    head_fingerprint = excluded.head_fingerprint,
+                    cached_at = excluded.cached_at
            """,
                (
                    result.url,
@@ -460,6 +570,10 @@ class AsyncDatabaseManager:
                    content_hashes["screenshot"],
                    json.dumps(result.response_headers or {}),
                    json.dumps(result.downloaded_files or []),
+                    etag,
+                    last_modified,
+                    head_fingerprint,
+                    cached_at,
                ),
            )

--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -47,7 +47,9 @@ from .utils import (
    get_error_context,
    RobotsParser,
    preprocess_html_for_schema,
+    compute_head_fingerprint,
 )
+from .cache_validator import CacheValidator, CacheValidationResult


 class AsyncWebCrawler:
@@ -267,6 +269,51 @@ class AsyncWebCrawler:
                if cache_context.should_read():
                    cached_result = await async_db_manager.aget_cached_url(url)

+                # Smart Cache: Validate cache freshness if enabled
+                if cached_result and config.check_cache_freshness:
+                    cache_metadata = await async_db_manager.aget_cache_metadata(url)
+                    if cache_metadata:
+                        async with CacheValidator(timeout=config.cache_validation_timeout) as validator:
+                            validation = await validator.validate(
+                                url=url,
+                                stored_etag=cache_metadata.get("etag"),
+                                stored_last_modified=cache_metadata.get("last_modified"),
+                                stored_head_fingerprint=cache_metadata.get("head_fingerprint"),
+                            )
+
+                        if validation.status == CacheValidationResult.FRESH:
+                            cached_result.cache_status = "hit_validated"
+                            self.logger.info(
+                                message="Cache validated: {reason}",
+                                tag="CACHE",
+                                params={"reason": validation.reason}
+                            )
+                            # Update metadata if we got new values
+                            if validation.new_etag or validation.new_last_modified:
+                                await async_db_manager.aupdate_cache_metadata(
+                                    url=url,
+                                    etag=validation.new_etag,
+                                    last_modified=validation.new_last_modified,
+                                    head_fingerprint=validation.new_head_fingerprint,
+                                )
+                        elif validation.status == CacheValidationResult.ERROR:
+                            cached_result.cache_status = "hit_fallback"
+                            self.logger.warning(
+                                message="Cache validation failed, using cached: {reason}",
+                                tag="CACHE",
+                                params={"reason": validation.reason}
+                            )
+                        else:
+                            # STALE or UNKNOWN - force recrawl
+                            self.logger.info(
+                                message="Cache stale: {reason}",
+                                tag="CACHE",
+                                params={"reason": validation.reason}
+                            )
+                            cached_result = None
+                elif cached_result:
+                    cached_result.cache_status = "hit"
+
                if cached_result:
                    html = sanitize_input_encode(cached_result.html)
                    extracted_content = sanitize_input_encode(
@@ -383,6 +430,14 @@ class AsyncWebCrawler:
                    crawl_result.success = bool(html)
                    crawl_result.session_id = getattr(
                        config, "session_id", None)
+                    crawl_result.cache_status = "miss"
+
+                    # Compute head fingerprint for cache validation
+                    if html:
+                        head_end = html.lower().find('</head>')
+                        if head_end != -1:
+                            head_html = html[:head_end + 7]
+                            crawl_result.head_fingerprint = compute_head_fingerprint(head_html)

                    self.logger.url_status(
                        url=cache_context.display_url,
--- a/crawl4ai/cache_validator.py
+++ b/crawl4ai/cache_validator.py
@@ -0,0 +1,270 @@
+"""
+Cache validation using HTTP conditional requests and head fingerprinting.
+
+Uses httpx for fast, lightweight HTTP requests (no browser needed).
+This module enables smart cache validation to avoid unnecessary full browser crawls
+when content hasn't changed.
+
+Validation Strategy:
+1. Send HEAD request with If-None-Match / If-Modified-Since headers
+2. If server returns 304 Not Modified → cache is FRESH
+3. If server returns 200 → fetch <head> and compare fingerprint
+4. If fingerprint matches → cache is FRESH (minor changes only)
+5. Otherwise → cache is STALE, need full recrawl
+"""
+
+import httpx
+from dataclasses import dataclass
+from typing import Optional, Tuple
+from enum import Enum
+
+from .utils import compute_head_fingerprint
+
+
+class CacheValidationResult(Enum):
+    """Result of cache validation check."""
+    FRESH = "fresh"       # Content unchanged, use cache
+    STALE = "stale"       # Content changed, need recrawl
+    UNKNOWN = "unknown"   # Couldn't determine, need recrawl
+    ERROR = "error"       # Request failed, use cache as fallback
+
+
+@dataclass
+class ValidationResult:
+    """Detailed result of a cache validation attempt."""
+    status: CacheValidationResult
+    new_etag: Optional[str] = None
+    new_last_modified: Optional[str] = None
+    new_head_fingerprint: Optional[str] = None
+    reason: str = ""
+
+
+class CacheValidator:
+    """
+    Validates cache freshness using lightweight HTTP requests.
+
+    This validator uses httpx to make fast HTTP requests without needing
+    a full browser. It supports two validation methods:
+
+    1. HTTP Conditional Requests (Layer 3):
+       - Uses If-None-Match with stored ETag
+       - Uses If-Modified-Since with stored Last-Modified
+       - Server returns 304 if content unchanged
+
+    2. Head Fingerprinting (Layer 4):
+       - Fetches only the <head> section (~5KB)
+       - Compares fingerprint of key meta tags
+       - Catches changes even without server support for conditional requests
+    """
+
+    def __init__(self, timeout: float = 10.0, user_agent: Optional[str] = None):
+        """
+        Initialize the cache validator.
+
+        Args:
+            timeout: Request timeout in seconds
+            user_agent: Custom User-Agent string (optional)
+        """
+        self.timeout = timeout
+        self.user_agent = user_agent or "Mozilla/5.0 (compatible; Crawl4AI/1.0)"
+        self._client: Optional[httpx.AsyncClient] = None
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create the httpx client."""
+        if self._client is None:
+            self._client = httpx.AsyncClient(
+                http2=True,
+                timeout=self.timeout,
+                follow_redirects=True,
+                headers={"User-Agent": self.user_agent}
+            )
+        return self._client
+
+    async def validate(
+        self,
+        url: str,
+        stored_etag: Optional[str] = None,
+        stored_last_modified: Optional[str] = None,
+        stored_head_fingerprint: Optional[str] = None,
+    ) -> ValidationResult:
+        """
+        Validate if cached content is still fresh.
+
+        Args:
+            url: The URL to validate
+            stored_etag: Previously stored ETag header value
+            stored_last_modified: Previously stored Last-Modified header value
+            stored_head_fingerprint: Previously computed head fingerprint
+
+        Returns:
+            ValidationResult with status and any updated metadata
+        """
+        client = await self._get_client()
+
+        # Build conditional request headers
+        headers = {}
+        if stored_etag:
+            headers["If-None-Match"] = stored_etag
+        if stored_last_modified:
+            headers["If-Modified-Since"] = stored_last_modified
+
+        try:
+            # Step 1: Try HEAD request with conditional headers
+            if headers:
+                response = await client.head(url, headers=headers)
+
+                if response.status_code == 304:
+                    return ValidationResult(
+                        status=CacheValidationResult.FRESH,
+                        reason="Server returned 304 Not Modified"
+                    )
+
+                # Got 200, extract new headers for potential update
+                new_etag = response.headers.get("etag")
+                new_last_modified = response.headers.get("last-modified")
+
+                # If we have fingerprint, compare it
+                if stored_head_fingerprint:
+                    head_html, _, _ = await self._fetch_head(url)
+                    if head_html:
+                        new_fingerprint = compute_head_fingerprint(head_html)
+                        if new_fingerprint and new_fingerprint == stored_head_fingerprint:
+                            return ValidationResult(
+                                status=CacheValidationResult.FRESH,
+                                new_etag=new_etag,
+                                new_last_modified=new_last_modified,
+                                new_head_fingerprint=new_fingerprint,
+                                reason="Head fingerprint matches"
+                            )
+                        elif new_fingerprint:
+                            return ValidationResult(
+                                status=CacheValidationResult.STALE,
+                                new_etag=new_etag,
+                                new_last_modified=new_last_modified,
+                                new_head_fingerprint=new_fingerprint,
+                                reason="Head fingerprint changed"
+                            )
+
+                # Headers changed and no fingerprint match
+                return ValidationResult(
+                    status=CacheValidationResult.STALE,
+                    new_etag=new_etag,
+                    new_last_modified=new_last_modified,
+                    reason="Server returned 200, content may have changed"
+                )
+
+            # Step 2: No conditional headers available, try fingerprint only
+            if stored_head_fingerprint:
+                head_html, new_etag, new_last_modified = await self._fetch_head(url)
+
+                if head_html:
+                    new_fingerprint = compute_head_fingerprint(head_html)
+
+                    if new_fingerprint and new_fingerprint == stored_head_fingerprint:
+                        return ValidationResult(
+                            status=CacheValidationResult.FRESH,
+                            new_etag=new_etag,
+                            new_last_modified=new_last_modified,
+                            new_head_fingerprint=new_fingerprint,
+                            reason="Head fingerprint matches"
+                        )
+                    elif new_fingerprint:
+                        return ValidationResult(
+                            status=CacheValidationResult.STALE,
+                            new_etag=new_etag,
+                            new_last_modified=new_last_modified,
+                            new_head_fingerprint=new_fingerprint,
+                            reason="Head fingerprint changed"
+                        )
+
+            # Step 3: No validation data available
+            return ValidationResult(
+                status=CacheValidationResult.UNKNOWN,
+                reason="No validation data available (no etag, last-modified, or fingerprint)"
+            )
+
+        except httpx.TimeoutException:
+            return ValidationResult(
+                status=CacheValidationResult.ERROR,
+                reason="Validation request timed out"
+            )
+        except httpx.RequestError as e:
+            return ValidationResult(
+                status=CacheValidationResult.ERROR,
+                reason=f"Validation request failed: {type(e).__name__}"
+            )
+        except Exception as e:
+            # On unexpected error, prefer using cache over failing
+            return ValidationResult(
+                status=CacheValidationResult.ERROR,
+                reason=f"Validation error: {str(e)}"
+            )
+
+    async def _fetch_head(self, url: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Fetch only the <head> section of a page.
+
+        Uses streaming to stop reading after </head> is found,
+        minimizing bandwidth usage.
+
+        Args:
+            url: The URL to fetch
+
+        Returns:
+            Tuple of (head_html, etag, last_modified)
+        """
+        client = await self._get_client()
+
+        try:
+            async with client.stream(
+                "GET",
+                url,
+                headers={"Accept-Encoding": "identity"}  # Disable compression for easier parsing
+            ) as response:
+                etag = response.headers.get("etag")
+                last_modified = response.headers.get("last-modified")
+
+                if response.status_code != 200:
+                    return None, etag, last_modified
+
+                # Read until </head> or max 64KB
+                chunks = []
+                total_bytes = 0
+                max_bytes = 65536
+
+                async for chunk in response.aiter_bytes(4096):
+                    chunks.append(chunk)
+                    total_bytes += len(chunk)
+
+                    content = b''.join(chunks)
+                    # Check for </head> (case insensitive)
+                    if b'</head>' in content.lower() or b'</HEAD>' in content:
+                        break
+                    if total_bytes >= max_bytes:
+                        break
+
+                html = content.decode('utf-8', errors='replace')
+
+                # Extract just the head section
+                head_end = html.lower().find('</head>')
+                if head_end != -1:
+                    html = html[:head_end + 7]
+
+                return html, etag, last_modified
+
+        except Exception:
+            return None, None, None
+
+    async def close(self):
+        """Close the HTTP client and release resources."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self):
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.close()
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -152,6 +152,10 @@ class CrawlResult(BaseModel):
    network_requests: Optional[List[Dict[str, Any]]] = None
    console_messages: Optional[List[Dict[str, Any]]] = None
    tables: List[Dict] = Field(default_factory=list)  # NEW – [{headers,rows,caption,summary}]
+    # Cache validation metadata (Smart Cache)
+    head_fingerprint: Optional[str] = None
+    cached_at: Optional[float] = None
+    cache_status: Optional[str] = None  # "hit", "hit_validated", "hit_fallback", "miss"

    model_config = ConfigDict(arbitrary_types_allowed=True)

--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2828,6 +2828,67 @@ def generate_content_hash(content: str) -> str:
    # return hashlib.sha256(content.encode()).hexdigest()


+def compute_head_fingerprint(head_html: str) -> str:
+    """
+    Compute a fingerprint of <head> content for cache validation.
+
+    Focuses on content that typically changes when page updates:
+    - <title>
+    - <meta name="description">
+    - <meta property="og:title|og:description|og:image|og:updated_time">
+    - <meta property="article:modified_time">
+    - <meta name="last-modified">
+
+    Uses xxhash for speed, combines multiple signals into a single hash.
+
+    Args:
+        head_html: The HTML content of the <head> section
+
+    Returns:
+        A hex string fingerprint, or empty string if no signals found
+    """
+    if not head_html:
+        return ""
+
+    head_lower = head_html.lower()
+    signals = []
+
+    # Extract title
+    title_match = re.search(r'<title[^>]*>(.*?)</title>', head_lower, re.DOTALL)
+    if title_match:
+        signals.append(title_match.group(1).strip())
+
+    # Meta tags to extract (name or property attribute, and the value to match)
+    meta_tags = [
+        ("name", "description"),
+        ("name", "last-modified"),
+        ("property", "og:title"),
+        ("property", "og:description"),
+        ("property", "og:image"),
+        ("property", "og:updated_time"),
+        ("property", "article:modified_time"),
+    ]
+
+    for attr_type, attr_value in meta_tags:
+        # Handle both attribute orders: attr="value" content="..." and content="..." attr="value"
+        patterns = [
+            rf'<meta[^>]*{attr_type}=["\']{ re.escape(attr_value)}["\'][^>]*content=["\']([^"\']*)["\']',
+            rf'<meta[^>]*content=["\']([^"\']*)["\'][^>]*{attr_type}=["\']{re.escape(attr_value)}["\']',
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, head_lower)
+            if match:
+                signals.append(match.group(1).strip())
+                break  # Found this tag, move to next
+
+    if not signals:
+        return ""
+
+    # Combine signals and hash
+    combined = '|'.join(signals)
+    return xxhash.xxh64(combined.encode()).hexdigest()
+
+
 def ensure_content_dirs(base_path: str) -> Dict[str, str]:
    """Create content directories if they don't exist"""
    dirs = {