feat: integrate last30days and daily-news-report skills

2026-01-26 19:05:37 +01:00
parent d2569f2107
commit c7f7f23bd7
45 changed files with 7632 additions and 0 deletions
--- a/skills/last30days/scripts/lib/reddit_enrich.py
+++ b/skills/last30days/scripts/lib/reddit_enrich.py
@@ -0,0 +1,232 @@
+"""Reddit thread enrichment with real engagement metrics."""
+
+import re
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+from . import http, dates
+
+
+def extract_reddit_path(url: str) -> Optional[str]:
+    """Extract the path from a Reddit URL.
+
+    Args:
+        url: Reddit URL
+
+    Returns:
+        Path component or None
+    """
+    try:
+        parsed = urlparse(url)
+        if "reddit.com" not in parsed.netloc:
+            return None
+        return parsed.path
+    except:
+        return None
+
+
+def fetch_thread_data(url: str, mock_data: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
+    """Fetch Reddit thread JSON data.
+
+    Args:
+        url: Reddit thread URL
+        mock_data: Mock data for testing
+
+    Returns:
+        Thread data dict or None on failure
+    """
+    if mock_data is not None:
+        return mock_data
+
+    path = extract_reddit_path(url)
+    if not path:
+        return None
+
+    try:
+        data = http.get_reddit_json(path)
+        return data
+    except http.HTTPError:
+        return None
+
+
+def parse_thread_data(data: Any) -> Dict[str, Any]:
+    """Parse Reddit thread JSON into structured data.
+
+    Args:
+        data: Raw Reddit JSON response
+
+    Returns:
+        Dict with submission and comments data
+    """
+    result = {
+        "submission": None,
+        "comments": [],
+    }
+
+    if not isinstance(data, list) or len(data) < 1:
+        return result
+
+    # First element is submission listing
+    submission_listing = data[0]
+    if isinstance(submission_listing, dict):
+        children = submission_listing.get("data", {}).get("children", [])
+        if children:
+            sub_data = children[0].get("data", {})
+            result["submission"] = {
+                "score": sub_data.get("score"),
+                "num_comments": sub_data.get("num_comments"),
+                "upvote_ratio": sub_data.get("upvote_ratio"),
+                "created_utc": sub_data.get("created_utc"),
+                "permalink": sub_data.get("permalink"),
+                "title": sub_data.get("title"),
+                "selftext": sub_data.get("selftext", "")[:500],  # Truncate
+            }
+
+    # Second element is comments listing
+    if len(data) >= 2:
+        comments_listing = data[1]
+        if isinstance(comments_listing, dict):
+            children = comments_listing.get("data", {}).get("children", [])
+            for child in children:
+                if child.get("kind") != "t1":  # t1 = comment
+                    continue
+                c_data = child.get("data", {})
+                if not c_data.get("body"):
+                    continue
+
+                comment = {
+                    "score": c_data.get("score", 0),
+                    "created_utc": c_data.get("created_utc"),
+                    "author": c_data.get("author", "[deleted]"),
+                    "body": c_data.get("body", "")[:300],  # Truncate
+                    "permalink": c_data.get("permalink"),
+                }
+                result["comments"].append(comment)
+
+    return result
+
+
+def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]:
+    """Get top comments sorted by score.
+
+    Args:
+        comments: List of comment dicts
+        limit: Maximum number to return
+
+    Returns:
+        Top comments sorted by score
+    """
+    # Filter out deleted/removed
+    valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")]
+
+    # Sort by score descending
+    sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True)
+
+    return sorted_comments[:limit]
+
+
+def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]:
+    """Extract key insights from top comments.
+
+    Uses simple heuristics to identify valuable comments:
+    - Has substantive text
+    - Contains actionable information
+    - Not just agreement/disagreement
+
+    Args:
+        comments: Top comments
+        limit: Max insights to extract
+
+    Returns:
+        List of insight strings
+    """
+    insights = []
+
+    for comment in comments[:limit * 2]:  # Look at more comments than we need
+        body = comment.get("body", "").strip()
+        if not body or len(body) < 30:
+            continue
+
+        # Skip low-value patterns
+        skip_patterns = [
+            r'^(this|same|agreed|exactly|yep|nope|yes|no|thanks|thank you)\.?$',
+            r'^lol|lmao|haha',
+            r'^\[deleted\]',
+            r'^\[removed\]',
+        ]
+        if any(re.match(p, body.lower()) for p in skip_patterns):
+            continue
+
+        # Truncate to first meaningful sentence or ~150 chars
+        insight = body[:150]
+        if len(body) > 150:
+            # Try to find a sentence boundary
+            for i, char in enumerate(insight):
+                if char in '.!?' and i > 50:
+                    insight = insight[:i+1]
+                    break
+            else:
+                insight = insight.rstrip() + "..."
+
+        insights.append(insight)
+        if len(insights) >= limit:
+            break
+
+    return insights
+
+
+def enrich_reddit_item(
+    item: Dict[str, Any],
+    mock_thread_data: Optional[Dict] = None,
+) -> Dict[str, Any]:
+    """Enrich a Reddit item with real engagement data.
+
+    Args:
+        item: Reddit item dict
+        mock_thread_data: Mock data for testing
+
+    Returns:
+        Enriched item dict
+    """
+    url = item.get("url", "")
+
+    # Fetch thread data
+    thread_data = fetch_thread_data(url, mock_thread_data)
+    if not thread_data:
+        return item
+
+    parsed = parse_thread_data(thread_data)
+    submission = parsed.get("submission")
+    comments = parsed.get("comments", [])
+
+    # Update engagement metrics
+    if submission:
+        item["engagement"] = {
+            "score": submission.get("score"),
+            "num_comments": submission.get("num_comments"),
+            "upvote_ratio": submission.get("upvote_ratio"),
+        }
+
+        # Update date from actual data
+        created_utc = submission.get("created_utc")
+        if created_utc:
+            item["date"] = dates.timestamp_to_date(created_utc)
+
+    # Get top comments
+    top_comments = get_top_comments(comments)
+    item["top_comments"] = []
+    for c in top_comments:
+        permalink = c.get("permalink", "")
+        comment_url = f"https://reddit.com{permalink}" if permalink else ""
+        item["top_comments"].append({
+            "score": c.get("score", 0),
+            "date": dates.timestamp_to_date(c.get("created_utc")),
+            "author": c.get("author", ""),
+            "excerpt": c.get("body", "")[:200],
+            "url": comment_url,
+        })
+
+    # Extract insights
+    item["comment_insights"] = extract_comment_insights(top_comments)
+
+    return item