feat: integrate last30days and daily-news-report skills

2026-01-26 19:05:37 +01:00
parent d2569f2107
commit c7f7f23bd7
45 changed files with 7632 additions and 0 deletions
--- a/skills/last30days/scripts/lib/websearch.py
+++ b/skills/last30days/scripts/lib/websearch.py
@@ -0,0 +1,401 @@
+"""WebSearch module for last30days skill.
+
+NOTE: WebSearch uses Claude's built-in WebSearch tool, which runs INSIDE Claude Code.
+Unlike Reddit/X which use external APIs, WebSearch results are obtained by Claude
+directly and passed to this module for normalization and scoring.
+
+The typical flow is:
+1. Claude invokes WebSearch tool with the topic
+2. Claude passes results to parse_websearch_results()
+3. Results are normalized into WebSearchItem objects
+"""
+
+import re
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+from . import schema
+
+
+# Lookup from lowercase English month names (full and abbreviated) to the
+# calendar month number, 1 = January through 12 = December. The table is
+# listed in calendar order; "may" has a single spelling and September also
+# accepts "sept".
+MONTH_MAP = {
+    alias: number
+    for number, aliases in enumerate(
+        (
+            ("jan", "january"),
+            ("feb", "february"),
+            ("mar", "march"),
+            ("apr", "april"),
+            ("may",),
+            ("jun", "june"),
+            ("jul", "july"),
+            ("aug", "august"),
+            ("sep", "sept", "september"),
+            ("oct", "october"),
+            ("nov", "november"),
+            ("dec", "december"),
+        ),
+        start=1,
+    )
+    for alias in aliases
+}
+
+
+def _checked_url_date(year: str, month: str, day: str) -> Optional[str]:
+    """Return ``YYYY-MM-DD`` if the captured digit groups form a real date in 2020-2030."""
+    if not 2020 <= int(year) <= 2030:
+        return None
+    try:
+        # The constructor rejects impossible dates (month 13, Feb 31, day 00).
+        datetime(int(year), int(month), int(day))
+    except ValueError:
+        return None
+    return f"{year}-{month}-{day}"
+
+
+def extract_date_from_url(url: str) -> Optional[str]:
+    """Try to extract a date from a URL.
+
+    Many sites embed dates in URLs like:
+    - /2026/01/24/article-title
+    - /2026-01-24/article
+    - /blog/20260124/title
+
+    The patterns are tried in that order. Every occurrence of a pattern is
+    inspected, so an out-of-range or impossible candidate (for example an
+    archive year such as /2019/12/31/) does not hide a valid date later in
+    the same URL. A candidate is accepted only if its year is within
+    2020-2030 and the year/month/day combination is a real calendar date.
+
+    Args:
+        url: URL to parse
+
+    Returns:
+        Date string in YYYY-MM-DD format, or None
+    """
+    if not url:
+        return None
+
+    # The trailing delimiter is a lookahead so that two adjacent date
+    # segments can both be considered by finditer().
+    patterns = (
+        r'/(\d{4})/(\d{2})/(\d{2})(?=/)',      # /YYYY/MM/DD/ (most common)
+        r'/(\d{4})-(\d{2})-(\d{2})(?=[-/])',   # /YYYY-MM-DD/ or /YYYY-MM-DD-
+        r'/(\d{4})(\d{2})(\d{2})(?=/)',        # /YYYYMMDD/ (compact)
+    )
+
+    for pattern in patterns:
+        for match in re.finditer(pattern, url):
+            found = _checked_url_date(*match.groups())
+            if found:
+                return found
+
+    return None
+
+
+def _checked_snippet_date(year: int, month: Optional[int], day: int) -> Optional[str]:
+    """Return ``YYYY-MM-DD`` when the parts form a real date in 2020-2030, else None."""
+    if month is None or not 2020 <= year <= 2030:
+        return None
+    try:
+        # The constructor rejects impossible dates such as February 31.
+        datetime(year, month, day)
+    except ValueError:
+        return None
+    return f"{year:04d}-{month:02d}-{day:02d}"
+
+
+def extract_date_from_snippet(text: str) -> Optional[str]:
+    """Try to extract a date from text snippet or title.
+
+    Looks for patterns like:
+    - January 24, 2026 or Jan 24, 2026
+    - 24 January 2026
+    - 2026-01-24
+    - "3 days ago", "yesterday", "last week"
+
+    Explicit dates are tried before relative phrases. An explicit date is
+    accepted only if its year is within 2020-2030 and it is a real calendar
+    date; every occurrence of a pattern is inspected before moving on.
+    Relative phrases are resolved against ``datetime.now()`` (naive local
+    time), and "yesterday"/"today" must appear as whole words.
+
+    Args:
+        text: Text to parse
+
+    Returns:
+        Date string in YYYY-MM-DD format, or None
+    """
+    if not text:
+        return None
+
+    text_lower = text.lower()
+
+    month_names = (
+        r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
+        r'jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
+    )
+
+    # Pattern 1: Month DD, YYYY (e.g., "January 24, 2026").
+    # A comma or whitespace is required between day and year so that a run
+    # of digits such as "jan 12026" is not split into day 1 / year 2026.
+    month_first = (
+        r'\b' + month_names + r'\s+(\d{1,2})(?:st|nd|rd|th)?(?:,\s*|\s+)(\d{4})\b'
+    )
+    for match in re.finditer(month_first, text_lower):
+        month_str, day, year = match.groups()
+        found = _checked_snippet_date(
+            int(year), MONTH_MAP.get(month_str[:3]), int(day)
+        )
+        if found:
+            return found
+
+    # Pattern 2: DD Month YYYY (e.g., "24 January 2026")
+    day_first = r'\b(\d{1,2})(?:st|nd|rd|th)?\s+' + month_names + r'\s+(\d{4})\b'
+    for match in re.finditer(day_first, text_lower):
+        day, month_str, year = match.groups()
+        found = _checked_snippet_date(
+            int(year), MONTH_MAP.get(month_str[:3]), int(day)
+        )
+        if found:
+            return found
+
+    # Pattern 3: YYYY-MM-DD (ISO format)
+    for match in re.finditer(r'\b(\d{4})-(\d{2})-(\d{2})\b', text):
+        year, month, day = match.groups()
+        found = _checked_snippet_date(int(year), int(month), int(day))
+        if found:
+            return found
+
+    # Pattern 4: Relative dates ("3 days ago", "yesterday", etc.)
+    today = datetime.now()
+
+    # Whole-word matches only: a plain substring test would also fire on
+    # strings such as "r/todayilearned".
+    if re.search(r'\byesterday\b', text_lower):
+        return (today - timedelta(days=1)).strftime("%Y-%m-%d")
+
+    if re.search(r'\btoday\b', text_lower):
+        return today.strftime("%Y-%m-%d")
+
+    # "N days ago"
+    match = re.search(r'\b(\d+)\s*days?\s*ago\b', text_lower)
+    if match:
+        days = int(match.group(1))
+        if days <= 60:  # Reasonable range
+            return (today - timedelta(days=days)).strftime("%Y-%m-%d")
+
+    # "N hours ago" -> today
+    if re.search(r'\b(\d+)\s*hours?\s*ago\b', text_lower):
+        return today.strftime("%Y-%m-%d")
+
+    # "last week" -> ~7 days ago
+    if "last week" in text_lower:
+        return (today - timedelta(days=7)).strftime("%Y-%m-%d")
+
+    # "this week" -> ~3 days ago (middle of week)
+    if "this week" in text_lower:
+        return (today - timedelta(days=3)).strftime("%Y-%m-%d")
+
+    return None
+
+
+def extract_date_signals(
+    url: str,
+    snippet: str,
+    title: str,
+) -> Tuple[Optional[str], str]:
+    """Find a date for a result using the URL, then the snippet, then the title.
+
+    The first source that yields a date wins; later sources are not consulted.
+
+    Args:
+        url: Page URL
+        snippet: Page snippet/description
+        title: Page title
+
+    Returns:
+        Tuple of (date_string, confidence):
+        - (date, 'high') when the date came from the URL
+        - (date, 'med') when the date came from the snippet or the title
+        - (None, 'low') when no source produced a date
+    """
+    from_url = extract_date_from_url(url)
+    if from_url:
+        return from_url, "high"
+
+    # Snippet and title go through the same free-text parser and share one
+    # confidence level; the snippet is checked before the title.
+    for candidate_text in (snippet, title):
+        from_text = extract_date_from_snippet(candidate_text)
+        if from_text:
+            return from_text, "med"
+
+    return None, "low"
+
+
+# Hosts whose results are skipped by this module because Reddit and X are
+# handled separately. Entries are lowercase host names.
+EXCLUDED_DOMAINS = {
+    # Reddit
+    "reddit.com", "www.reddit.com", "old.reddit.com",
+    # X
+    "x.com", "www.x.com",
+    # Twitter (legacy X host names)
+    "twitter.com", "www.twitter.com", "mobile.twitter.com",
+}
+
+
+def extract_domain(url: str) -> str:
+    """Extract the domain from a URL.
+
+    Only the host name is returned: any ``user:password@`` prefix and any
+    ``:port`` suffix in the URL's network location are dropped, the result
+    is lowercased, and a leading ``www.`` is removed for cleaner display.
+
+    Args:
+        url: Full URL
+
+    Returns:
+        Domain string (e.g., "medium.com"), or "" if the URL has no host
+        or cannot be parsed
+    """
+    if not isinstance(url, str):
+        return ""
+    try:
+        # .hostname is already lowercased and excludes userinfo and port,
+        # unlike .netloc which would leak credentials into the display name.
+        host = urlparse(url).hostname
+    except ValueError:
+        # urlparse raises ValueError for malformed input such as an
+        # unterminated IPv6 literal.
+        return ""
+    if not host:
+        return ""
+    # Remove www. prefix for cleaner display
+    if host.startswith("www."):
+        host = host[4:]
+    return host
+
+
+def is_excluded_domain(url: str) -> bool:
+    """Check if URL is from an excluded domain (Reddit/X).
+
+    The URL's host name is compared against EXCLUDED_DOMAINS, ignoring case,
+    any port, any userinfo and a trailing dot. A host also counts as excluded
+    when it is a subdomain of a listed entry (e.g. "np.reddit.com"), so
+    mirrors and mobile hosts do not slip through.
+
+    Args:
+        url: URL to check
+
+    Returns:
+        True if URL should be excluded; False otherwise, including when the
+        URL has no host or cannot be parsed
+    """
+    if not isinstance(url, str):
+        return False
+    try:
+        host = urlparse(url).hostname
+    except ValueError:
+        return False
+    if not host:
+        return False
+
+    host = host.rstrip(".")
+    # The "." in the suffix test keeps look-alike hosts such as
+    # "netflix.com" from matching the "x.com" entry.
+    return any(
+        host == domain or host.endswith("." + domain)
+        for domain in EXCLUDED_DOMAINS
+    )
+
+
+def _coerce_provided_date(value: Any) -> Optional[str]:
+    """Reduce a caller-supplied ``date`` field to ``YYYY-MM-DD``, or None if unusable."""
+    if not value:
+        return None
+    text = str(value).strip()
+    # Accept a bare ISO date or an ISO timestamp and keep only the date part.
+    match = re.match(r'^(\d{4}-\d{2}-\d{2})(?:$|[T\s])', text)
+    if match:
+        return match.group(1)
+    # Anything else goes through the free-text parser
+    # ("Jan 24, 2026", "3 days ago", ...).
+    return extract_date_from_snippet(text)
+
+
+def parse_websearch_results(
+    results: List[Dict[str, Any]],
+    topic: str,
+    from_date: str = "",
+    to_date: str = "",
+) -> List[Dict[str, Any]]:
+    """Parse WebSearch results into normalized format.
+
+    This function expects results from Claude's WebSearch tool.
+    Each result should have: title, url, snippet, and optionally date/relevance.
+    ``description`` is used when ``snippet`` is missing or empty.
+
+    Uses "Date Detective" approach:
+    1. Use the provided ``date`` when it can be reduced to YYYY-MM-DD
+       (med confidence)
+    2. Otherwise extract dates from URLs (high confidence)
+    3. Otherwise extract dates from snippets/titles (med confidence)
+    4. Hard filter: exclude items dated before from_date or after to_date
+    5. Keep items with no date signals (date None, low confidence)
+
+    Entries that are not dicts, have no URL, point at an excluded domain, or
+    have neither title nor snippet are skipped. Item ids are "W<n>" where n
+    is the 1-based position in ``results``, so skipped entries leave gaps.
+
+    Args:
+        results: List of WebSearch result dicts (None is treated as empty)
+        topic: Original search topic (for context; not used by the parser)
+        from_date: Start date for filtering (YYYY-MM-DD); "" disables it
+        to_date: End date for filtering (YYYY-MM-DD); "" disables it
+
+    Returns:
+        List of normalized item dicts ready for WebSearchItem creation
+    """
+    items = []
+
+    for i, result in enumerate(results or []):
+        if not isinstance(result, dict):
+            continue
+
+        # "or ''" keeps an explicit None from becoming the string "None".
+        url = str(result.get("url") or "").strip()
+        if not url:
+            continue
+
+        # Skip Reddit/X URLs (handled separately)
+        if is_excluded_domain(url):
+            continue
+
+        title = str(result.get("title") or "").strip()
+        snippet = str(
+            result.get("snippet") or result.get("description") or ""
+        ).strip()
+
+        if not title and not snippet:
+            continue
+
+        # Prefer the provided date, but only once it is a clean YYYY-MM-DD
+        # string: the range filters below compare dates as strings, which is
+        # only meaningful for that format.
+        date = _coerce_provided_date(result.get("date"))
+        if date:
+            date_confidence = "med"
+        else:
+            # Try to extract date from URL/snippet/title; yields
+            # (None, "low") when nothing is found.
+            date, date_confidence = extract_date_signals(url, snippet, title)
+
+        # Hard filter: if we found a date and it's too old, skip
+        if date and from_date and date < from_date:
+            continue  # DROP - verified old content
+
+        # Hard filter: if date is in the future, skip (parsing error)
+        if date and to_date and date > to_date:
+            continue  # DROP - future date
+
+        # Get relevance if provided, default to 0.5, clamped to [0.0, 1.0]
+        relevance = result.get("relevance", 0.5)
+        try:
+            relevance = min(1.0, max(0.0, float(relevance)))
+        except (TypeError, ValueError):
+            relevance = 0.5
+
+        items.append({
+            "id": f"W{i+1}",
+            "title": title[:200],  # Truncate long titles
+            "url": url,
+            "source_domain": extract_domain(url),
+            "snippet": snippet[:500],  # Truncate long snippets
+            "date": date,
+            "date_confidence": date_confidence,
+            "relevance": relevance,
+            "why_relevant": str(result.get("why_relevant") or "").strip(),
+        })
+
+    return items
+
+
+def normalize_websearch_items(
+    items: List[Dict[str, Any]],
+    from_date: str,
+    to_date: str,
+) -> List[schema.WebSearchItem]:
+    """Build a WebSearchItem for each parsed result dict, preserving order.
+
+    The keys id, title, url, source_domain and snippet are required on every
+    dict. The remaining fields fall back to defaults when absent:
+    date -> None, date_confidence -> "low", relevance -> 0.5,
+    why_relevant -> "".
+
+    Args:
+        items: List of parsed item dicts
+        from_date: Start of date range (YYYY-MM-DD); not used by this function
+        to_date: End of date range (YYYY-MM-DD); not used by this function
+
+    Returns:
+        List of WebSearchItem objects
+    """
+    return [
+        schema.WebSearchItem(
+            id=entry["id"],
+            title=entry["title"],
+            url=entry["url"],
+            source_domain=entry["source_domain"],
+            snippet=entry["snippet"],
+            date=entry.get("date"),
+            date_confidence=entry.get("date_confidence", "low"),
+            relevance=entry.get("relevance", 0.5),
+            why_relevant=entry.get("why_relevant", ""),
+        )
+        for entry in items
+    ]
+
+
+def _websearch_url_key(url: Any) -> Tuple[str, ...]:
+    """Build the comparison key that dedupe_websearch uses for one URL."""
+    text = str(url or "").strip()
+    try:
+        parts = urlparse(text)
+    except ValueError:
+        # Unparseable URL: compare it verbatim (minus any trailing slash).
+        return (text.rstrip("/"),)
+    # Only scheme and host are case-insensitive; path and query are kept
+    # verbatim so resources that differ only by case (video IDs, short
+    # links) are not merged.
+    return (
+        parts.scheme.lower(),
+        parts.netloc.lower(),
+        parts.path.rstrip("/"),
+        parts.params,
+        parts.query,
+        parts.fragment,
+    )
+
+
+def dedupe_websearch(items: List[schema.WebSearchItem]) -> List[schema.WebSearchItem]:
+    """Remove duplicate WebSearch items.
+
+    Deduplication is based on URL. Two URLs are considered the same when they
+    match after lowercasing the scheme and host and removing a trailing slash
+    from the path; path, query and fragment are otherwise compared
+    case-sensitively. The first occurrence of each URL is kept and the input
+    order is preserved.
+
+    Args:
+        items: List of WebSearchItem objects
+
+    Returns:
+        Deduplicated list
+    """
+    seen_keys = set()
+    unique = []
+
+    for item in items:
+        key = _websearch_url_key(item.url)
+        if key not in seen_keys:
+            seen_keys.add(key)
+            unique.append(item)
+
+    return unique