Add smart TTL cache for sitemap URL seeder
- Add `cache_ttl_hours` and `validate_sitemap_lastmod` params to SeedingConfig
- Add a new JSON cache format with metadata (version, created_at, lastmod, url_count)
- Validate the cache by TTL expiry and by comparing the sitemap's lastmod
- Auto-migrate from the old .jsonl format to the new .json format
- Fix a bug where an incomplete cache was used indefinitely
This commit is contained in:
@@ -1996,6 +1996,8 @@ class SeedingConfig:
|
|||||||
score_threshold: Optional[float] = None,
|
score_threshold: Optional[float] = None,
|
||||||
scoring_method: str = "bm25",
|
scoring_method: str = "bm25",
|
||||||
filter_nonsense_urls: bool = True,
|
filter_nonsense_urls: bool = True,
|
||||||
|
cache_ttl_hours: int = 24,
|
||||||
|
validate_sitemap_lastmod: bool = True,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize URL seeding configuration.
|
Initialize URL seeding configuration.
|
||||||
@@ -2027,10 +2029,14 @@ class SeedingConfig:
|
|||||||
Requires extract_head=True. Default: None
|
Requires extract_head=True. Default: None
|
||||||
score_threshold: Minimum relevance score (0.0-1.0) to include URL.
|
score_threshold: Minimum relevance score (0.0-1.0) to include URL.
|
||||||
Only applies when query is provided. Default: None
|
Only applies when query is provided. Default: None
|
||||||
scoring_method: Scoring algorithm to use. Currently only "bm25" is supported.
|
scoring_method: Scoring algorithm to use. Currently only "bm25" is supported.
|
||||||
Future: "semantic". Default: "bm25"
|
Future: "semantic". Default: "bm25"
|
||||||
filter_nonsense_urls: Filter out utility URLs like robots.txt, sitemap.xml,
|
filter_nonsense_urls: Filter out utility URLs like robots.txt, sitemap.xml,
|
||||||
ads.txt, favicon.ico, etc. Default: True
|
ads.txt, favicon.ico, etc. Default: True
|
||||||
|
cache_ttl_hours: Hours before sitemap cache expires. Set to 0 to disable TTL
|
||||||
|
(only lastmod validation). Default: 24
|
||||||
|
validate_sitemap_lastmod: If True, compares sitemap's <lastmod> with cache
|
||||||
|
timestamp and refetches if sitemap is newer. Default: True
|
||||||
"""
|
"""
|
||||||
self.source = source
|
self.source = source
|
||||||
self.pattern = pattern
|
self.pattern = pattern
|
||||||
@@ -2047,6 +2053,8 @@ class SeedingConfig:
|
|||||||
self.score_threshold = score_threshold
|
self.score_threshold = score_threshold
|
||||||
self.scoring_method = scoring_method
|
self.scoring_method = scoring_method
|
||||||
self.filter_nonsense_urls = filter_nonsense_urls
|
self.filter_nonsense_urls = filter_nonsense_urls
|
||||||
|
self.cache_ttl_hours = cache_ttl_hours
|
||||||
|
self.validate_sitemap_lastmod = validate_sitemap_lastmod
|
||||||
|
|
||||||
# Add to_dict, from_kwargs, and clone methods for consistency
|
# Add to_dict, from_kwargs, and clone methods for consistency
|
||||||
def to_dict(self) -> Dict[str, Any]:
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import os
|
|||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from datetime import timedelta
|
from datetime import datetime, timedelta, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
|
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||||
from urllib.parse import quote, urljoin
|
from urllib.parse import quote, urljoin
|
||||||
@@ -78,6 +78,103 @@ _link_rx = re.compile(
|
|||||||
# ────────────────────────────────────────────────────────────────────────── helpers
|
# ────────────────────────────────────────────────────────────────────────── helpers
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_sitemap_lastmod(xml_content: bytes) -> Optional[str]:
    """Extract the most recent <lastmod> value from sitemap XML.

    Returns the lexicographically greatest lastmod string — for ISO-8601
    dates that is also the chronologically most recent one. Returns None
    when the XML cannot be parsed or contains no <lastmod> elements.
    """
    try:
        if LXML:
            root = etree.fromstring(xml_content)
            # Namespace-agnostic match: sitemaps may or may not declare
            # the sitemaps.org namespace.
            lastmods = root.xpath("//*[local-name()='lastmod']/text()")
        else:
            # Fallback mirrors _iter_sitemap_content: use the stdlib
            # parser when lxml is unavailable instead of giving up
            # (previously this path returned None unconditionally,
            # disabling lastmod-based cache validation).
            import xml.etree.ElementTree as ET
            root = ET.fromstring(xml_content)
            lastmods = [
                elem.text
                for elem in root.iter()
                # rsplit strips a leading "{namespace}" prefix, if any.
                if elem.tag.rsplit('}', 1)[-1] == 'lastmod' and elem.text
            ]
        if lastmods:
            # Return the most recent one.
            return max(lastmods)
    except Exception:
        # Malformed XML is non-fatal: lastmod is only an optimization
        # for cache validation, so fall through to None.
        pass
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_cache_valid(
|
||||||
|
cache_path: pathlib.Path,
|
||||||
|
ttl_hours: int,
|
||||||
|
validate_lastmod: bool,
|
||||||
|
current_lastmod: Optional[str] = None
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Check if sitemap cache is still valid.
|
||||||
|
|
||||||
|
Returns False (invalid) if:
|
||||||
|
- File doesn't exist
|
||||||
|
- File is corrupted/unreadable
|
||||||
|
- TTL expired (if ttl_hours > 0)
|
||||||
|
- Sitemap lastmod is newer than cache (if validate_lastmod=True)
|
||||||
|
"""
|
||||||
|
if not cache_path.exists():
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(cache_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Check version
|
||||||
|
if data.get("version") != 1:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check TTL
|
||||||
|
if ttl_hours > 0:
|
||||||
|
created_at = datetime.fromisoformat(data["created_at"].replace("Z", "+00:00"))
|
||||||
|
age_hours = (datetime.now(timezone.utc) - created_at).total_seconds() / 3600
|
||||||
|
if age_hours > ttl_hours:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check lastmod
|
||||||
|
if validate_lastmod and current_lastmod:
|
||||||
|
cached_lastmod = data.get("sitemap_lastmod")
|
||||||
|
if cached_lastmod and current_lastmod > cached_lastmod:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check URL count (sanity check - if 0, likely corrupted)
|
||||||
|
if data.get("url_count", 0) == 0:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
except (json.JSONDecodeError, KeyError, ValueError, IOError):
|
||||||
|
# Corrupted cache - return False to trigger refetch
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _read_cache(cache_path: pathlib.Path) -> List[str]:
|
||||||
|
"""Read URLs from cache file. Returns empty list on error."""
|
||||||
|
try:
|
||||||
|
with open(cache_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
return data.get("urls", [])
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _write_cache(
|
||||||
|
cache_path: pathlib.Path,
|
||||||
|
urls: List[str],
|
||||||
|
sitemap_url: str,
|
||||||
|
sitemap_lastmod: Optional[str]
|
||||||
|
) -> None:
|
||||||
|
"""Write URLs to cache with metadata."""
|
||||||
|
data = {
|
||||||
|
"version": 1,
|
||||||
|
"created_at": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"sitemap_lastmod": sitemap_lastmod,
|
||||||
|
"sitemap_url": sitemap_url,
|
||||||
|
"url_count": len(urls),
|
||||||
|
"urls": urls
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
with open(cache_path, "w") as f:
|
||||||
|
json.dump(data, f)
|
||||||
|
except Exception:
|
||||||
|
pass # Fail silently - cache is optional
|
||||||
|
|
||||||
|
|
||||||
def _match(url: str, pattern: str) -> bool:
|
def _match(url: str, pattern: str) -> bool:
|
||||||
if fnmatch.fnmatch(url, pattern):
|
if fnmatch.fnmatch(url, pattern):
|
||||||
return True
|
return True
|
||||||
@@ -295,6 +392,10 @@ class AsyncUrlSeeder:
|
|||||||
score_threshold = config.score_threshold
|
score_threshold = config.score_threshold
|
||||||
scoring_method = config.scoring_method
|
scoring_method = config.scoring_method
|
||||||
|
|
||||||
|
# Store cache config for use in _from_sitemaps
|
||||||
|
self._cache_ttl_hours = getattr(config, 'cache_ttl_hours', 24)
|
||||||
|
self._validate_sitemap_lastmod = getattr(config, 'validate_sitemap_lastmod', True)
|
||||||
|
|
||||||
# Ensure seeder's logger verbose matches the config's verbose if it's set
|
# Ensure seeder's logger verbose matches the config's verbose if it's set
|
||||||
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
||||||
self.logger.verbose = config.verbose
|
self.logger.verbose = config.verbose
|
||||||
@@ -764,68 +865,222 @@ class AsyncUrlSeeder:
|
|||||||
# ─────────────────────────────── Sitemaps
|
# ─────────────────────────────── Sitemaps
|
||||||
async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False):
|
async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False):
|
||||||
"""
|
"""
|
||||||
1. Probe default sitemap locations.
|
Discover URLs from sitemaps with smart TTL-based caching.
|
||||||
2. If none exist, parse robots.txt for alternative sitemap URLs.
|
|
||||||
3. Yield only URLs that match `pattern`.
|
1. Check cache validity (TTL + lastmod)
|
||||||
|
2. If valid, yield from cache
|
||||||
|
3. If invalid or force=True, fetch fresh and update cache
|
||||||
|
4. FALLBACK: If anything fails, bypass cache and fetch directly
|
||||||
"""
|
"""
|
||||||
|
# Get config values (passed via self during urls() call)
|
||||||
|
cache_ttl_hours = getattr(self, '_cache_ttl_hours', 24)
|
||||||
|
validate_lastmod = getattr(self, '_validate_sitemap_lastmod', True)
|
||||||
|
|
||||||
# ── cache file (same logic as _from_cc)
|
# Cache file path (new format: .json instead of .jsonl)
|
||||||
host = re.sub(r'^https?://', '', domain).rstrip('/')
|
host = re.sub(r'^https?://', '', domain).rstrip('/')
|
||||||
host = re.sub('[/?#]+', '_', domain)
|
host_safe = re.sub('[/?#]+', '_', host)
|
||||||
digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||||
path = self.cache_dir / f"sitemap_{host}_{digest}.jsonl"
|
cache_path = self.cache_dir / f"sitemap_{host_safe}_{digest}.json"
|
||||||
|
|
||||||
if path.exists() and not force:
|
# Check for old .jsonl format and delete it
|
||||||
self._log("info", "Loading sitemap URLs for {d} from cache: {p}",
|
old_cache_path = self.cache_dir / f"sitemap_{host_safe}_{digest}.jsonl"
|
||||||
params={"d": host, "p": str(path)}, tag="URL_SEED")
|
if old_cache_path.exists():
|
||||||
async with aiofiles.open(path, "r") as fp:
|
try:
|
||||||
async for line in fp:
|
old_cache_path.unlink()
|
||||||
url = line.strip()
|
self._log("info", "Deleted old cache format: {p}",
|
||||||
if _match(url, pattern):
|
params={"p": str(old_cache_path)}, tag="URL_SEED")
|
||||||
yield url
|
except Exception:
|
||||||
return
|
pass
|
||||||
|
|
||||||
# 1️⃣ direct sitemap probe
|
# Step 1: Find sitemap URL and get lastmod (needed for validation)
|
||||||
# strip any scheme so we can handle https → http fallback
|
sitemap_url = None
|
||||||
host = re.sub(r'^https?://', '', domain).rstrip('/')
|
sitemap_lastmod = None
|
||||||
|
sitemap_content = None
|
||||||
|
|
||||||
schemes = ('https', 'http') # prefer TLS, downgrade if needed
|
schemes = ('https', 'http')
|
||||||
for scheme in schemes:
|
for scheme in schemes:
|
||||||
for suffix in ("/sitemap.xml", "/sitemap_index.xml"):
|
for suffix in ("/sitemap.xml", "/sitemap_index.xml"):
|
||||||
sm = f"{scheme}://{host}{suffix}"
|
sm = f"{scheme}://{host}{suffix}"
|
||||||
sm = await self._resolve_head(sm)
|
resolved = await self._resolve_head(sm)
|
||||||
if sm:
|
if resolved:
|
||||||
self._log("info", "Found sitemap at {url}", params={
|
sitemap_url = resolved
|
||||||
"url": sm}, tag="URL_SEED")
|
# Fetch sitemap content to get lastmod
|
||||||
async with aiofiles.open(path, "w") as fp:
|
try:
|
||||||
|
r = await self.client.get(sitemap_url, timeout=15, follow_redirects=True)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
sitemap_content = r.content
|
||||||
|
sitemap_lastmod = _parse_sitemap_lastmod(sitemap_content)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
if sitemap_url:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Step 2: Check cache validity (skip if force=True)
|
||||||
|
if not force and cache_path.exists():
|
||||||
|
if _is_cache_valid(cache_path, cache_ttl_hours, validate_lastmod, sitemap_lastmod):
|
||||||
|
self._log("info", "Loading sitemap URLs from valid cache: {p}",
|
||||||
|
params={"p": str(cache_path)}, tag="URL_SEED")
|
||||||
|
cached_urls = _read_cache(cache_path)
|
||||||
|
for url in cached_urls:
|
||||||
|
if _match(url, pattern):
|
||||||
|
yield url
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self._log("info", "Cache invalid/expired, refetching sitemap for {d}",
|
||||||
|
params={"d": domain}, tag="URL_SEED")
|
||||||
|
|
||||||
|
# Step 3: Fetch fresh URLs
|
||||||
|
discovered_urls = []
|
||||||
|
|
||||||
|
if sitemap_url and sitemap_content:
|
||||||
|
self._log("info", "Found sitemap at {url}", params={"url": sitemap_url}, tag="URL_SEED")
|
||||||
|
|
||||||
|
# Parse sitemap (reuse content we already fetched)
|
||||||
|
async for u in self._iter_sitemap_content(sitemap_url, sitemap_content):
|
||||||
|
discovered_urls.append(u)
|
||||||
|
if _match(u, pattern):
|
||||||
|
yield u
|
||||||
|
elif sitemap_url:
|
||||||
|
# We have a sitemap URL but no content (fetch failed earlier), try again
|
||||||
|
self._log("info", "Found sitemap at {url}", params={"url": sitemap_url}, tag="URL_SEED")
|
||||||
|
async for u in self._iter_sitemap(sitemap_url):
|
||||||
|
discovered_urls.append(u)
|
||||||
|
if _match(u, pattern):
|
||||||
|
yield u
|
||||||
|
else:
|
||||||
|
# Fallback: robots.txt
|
||||||
|
robots = f"https://{host}/robots.txt"
|
||||||
|
try:
|
||||||
|
r = await self.client.get(robots, timeout=10, follow_redirects=True)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
sitemap_lines = [l.split(":", 1)[1].strip()
|
||||||
|
for l in r.text.splitlines()
|
||||||
|
if l.lower().startswith("sitemap:")]
|
||||||
|
for sm in sitemap_lines:
|
||||||
async for u in self._iter_sitemap(sm):
|
async for u in self._iter_sitemap(sm):
|
||||||
await fp.write(u + "\n")
|
discovered_urls.append(u)
|
||||||
if _match(u, pattern):
|
if _match(u, pattern):
|
||||||
yield u
|
yield u
|
||||||
|
else:
|
||||||
|
self._log("warning", "robots.txt unavailable for {d} HTTP{c}",
|
||||||
|
params={"d": domain, "c": r.status_code}, tag="URL_SEED")
|
||||||
return
|
return
|
||||||
|
except Exception as e:
|
||||||
# 2️⃣ robots.txt fallback
|
self._log("warning", "Failed to fetch robots.txt for {d}: {e}",
|
||||||
robots = f"https://{domain.rstrip('/')}/robots.txt"
|
params={"d": domain, "e": str(e)}, tag="URL_SEED")
|
||||||
try:
|
|
||||||
r = await self.client.get(robots, timeout=10, follow_redirects=True)
|
|
||||||
if not 200 <= r.status_code < 300:
|
|
||||||
self._log("warning", "robots.txt unavailable for {d} HTTP{c}", params={
|
|
||||||
"d": domain, "c": r.status_code}, tag="URL_SEED")
|
|
||||||
return
|
return
|
||||||
sitemap_lines = [l.split(":", 1)[1].strip(
|
|
||||||
) for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
|
|
||||||
except Exception as e:
|
|
||||||
self._log("warning", "Failed to fetch robots.txt for {d}: {e}", params={
|
|
||||||
"d": domain, "e": str(e)}, tag="URL_SEED")
|
|
||||||
return
|
|
||||||
|
|
||||||
if sitemap_lines:
|
# Step 4: Write to cache (FALLBACK: if write fails, URLs still yielded above)
|
||||||
async with aiofiles.open(path, "w") as fp:
|
if discovered_urls:
|
||||||
for sm in sitemap_lines:
|
_write_cache(cache_path, discovered_urls, sitemap_url or "", sitemap_lastmod)
|
||||||
async for u in self._iter_sitemap(sm):
|
self._log("info", "Cached {count} URLs for {d}",
|
||||||
await fp.write(u + "\n")
|
params={"count": len(discovered_urls), "d": domain}, tag="URL_SEED")
|
||||||
if _match(u, pattern):
|
|
||||||
yield u
|
async def _iter_sitemap_content(self, url: str, content: bytes):
    """Parse sitemap from already-fetched content.

    Async generator yielding absolute page URLs from *content*, which is
    the raw body of the sitemap at *url* (gzip-decompressed first when
    the URL ends in ".gz"). If the document is a sitemap index, each
    sub-sitemap is fetched concurrently via self._iter_sitemap and their
    URLs are yielded as they arrive; otherwise the <url>/<loc> entries
    are yielded directly. Parse failures are logged and end iteration.
    """
    # ".gz" detection is by URL suffix only — assumes the server does not
    # transparently decompress; NOTE(review): confirm against fetch path.
    data = gzip.decompress(content) if url.endswith(".gz") else content
    base_url = url

    def _normalize_loc(raw: Optional[str]) -> Optional[str]:
        # Resolve a possibly-relative <loc> against the sitemap's own URL;
        # return None for empty/whitespace-only values.
        if not raw:
            return None
        normalized = urljoin(base_url, raw.strip())
        if not normalized:
            return None
        return normalized

    # Detect if this is a sitemap index (contains <sitemap> entries)
    # versus a regular sitemap (contains <url> entries).
    is_sitemap_index = False
    sub_sitemaps = []
    regular_urls = []

    if LXML:
        try:
            # recover=True tolerates mildly malformed XML, which is common
            # in real-world sitemaps.
            parser = etree.XMLParser(recover=True)
            root = etree.fromstring(data, parser=parser)
            # local-name() makes the queries namespace-agnostic.
            sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
            url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")

            if sitemap_loc_nodes:
                is_sitemap_index = True
                for sitemap_elem in sitemap_loc_nodes:
                    loc = _normalize_loc(sitemap_elem.text)
                    if loc:
                        sub_sitemaps.append(loc)

            if not is_sitemap_index:
                for loc_elem in url_loc_nodes:
                    loc = _normalize_loc(loc_elem.text)
                    if loc:
                        regular_urls.append(loc)
        except Exception as e:
            self._log("error", "LXML parsing error for sitemap {url}: {error}",
                      params={"url": url, "error": str(e)}, tag="URL_SEED")
            return
    else:
        # Stdlib fallback when lxml is not installed.
        import xml.etree.ElementTree as ET
        try:
            root = ET.fromstring(data)
            # Strip "{namespace}" prefixes in place so find/findall below
            # can use bare tag names.
            for elem in root.iter():
                if '}' in elem.tag:
                    elem.tag = elem.tag.split('}')[1]

            sitemaps = root.findall('.//sitemap')
            url_entries = root.findall('.//url')

            if sitemaps:
                is_sitemap_index = True
                for sitemap in sitemaps:
                    loc_elem = sitemap.find('loc')
                    loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
                    if loc:
                        sub_sitemaps.append(loc)

            if not is_sitemap_index:
                for url_elem in url_entries:
                    loc_elem = url_elem.find('loc')
                    loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
                    if loc:
                        regular_urls.append(loc)
        except Exception as e:
            self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                      params={"url": url, "error": str(e)}, tag="URL_SEED")
            return

    # Process based on type.
    if is_sitemap_index and sub_sitemaps:
        self._log("info", "Processing sitemap index with {count} sub-sitemaps",
                  params={"count": len(sub_sitemaps)}, tag="URL_SEED")

        # Bounded queue provides backpressure; capacity scales with the
        # number of sub-sitemaps but is capped at 50k entries.
        queue_size = min(50000, len(sub_sitemaps) * 1000)
        result_queue = asyncio.Queue(maxsize=queue_size)
        completed_count = 0
        total_sitemaps = len(sub_sitemaps)

        async def process_subsitemap(sitemap_url: str):
            # Stream one sub-sitemap's URLs into the shared queue; a None
            # sentinel (in finally, so it is sent even on error) signals
            # this producer is done.
            try:
                async for u in self._iter_sitemap(sitemap_url):
                    await result_queue.put(u)
            except Exception as e:
                self._log("error", "Error processing sub-sitemap {url}: {error}",
                          params={"url": sitemap_url, "error": str(e)}, tag="URL_SEED")
            finally:
                await result_queue.put(None)

        tasks = [asyncio.create_task(process_subsitemap(sm)) for sm in sub_sitemaps]

        # Drain until every producer has sent its sentinel, yielding URLs
        # in arrival order (interleaved across sub-sitemaps).
        while completed_count < total_sitemaps:
            item = await result_queue.get()
            if item is None:
                completed_count += 1
            else:
                yield item

        # Producers are already finished; gather reaps the tasks and
        # swallows any stored exceptions (they were logged above).
        await asyncio.gather(*tasks, return_exceptions=True)
    else:
        for u in regular_urls:
            yield u
|
||||||
|
|
||||||
async def _iter_sitemap(self, url: str):
|
async def _iter_sitemap(self, url: str):
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user