feat: integrate last30days and daily-news-report skills

2026-01-26 19:05:37 +01:00
parent d2569f2107
commit c7f7f23bd7
45 changed files with 7632 additions and 0 deletions
--- a/skills/last30days/scripts/lib/dedupe.py
+++ b/skills/last30days/scripts/lib/dedupe.py
@@ -0,0 +1,120 @@
+"""Near-duplicate detection for last30days skill."""
+
+import re
+from typing import List, Set, Tuple, Union
+
+from . import schema
+
+
+def normalize_text(text: str) -> str:
+    """Normalize text for comparison.
+
+    - Lowercase
+    - Remove punctuation
+    - Collapse whitespace
+    """
+    text = text.lower()
+    text = re.sub(r'[^\w\s]', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+
+
+def get_ngrams(text: str, n: int = 3) -> Set[str]:
+    """Get character n-grams from text."""
+    text = normalize_text(text)
+    if len(text) < n:
+        return {text}
+    return {text[i:i+n] for i in range(len(text) - n + 1)}
+
+
+def jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
+    """Compute Jaccard similarity between two sets."""
+    if not set1 or not set2:
+        return 0.0
+    intersection = len(set1 & set2)
+    union = len(set1 | set2)
+    return intersection / union if union > 0 else 0.0
+
+
+def get_item_text(item: Union[schema.RedditItem, schema.XItem]) -> str:
+    """Get comparable text from an item."""
+    if isinstance(item, schema.RedditItem):
+        return item.title
+    else:
+        return item.text
+
+
+def find_duplicates(
+    items: List[Union[schema.RedditItem, schema.XItem]],
+    threshold: float = 0.7,
+) -> List[Tuple[int, int]]:
+    """Find near-duplicate pairs in items.
+
+    Args:
+        items: List of items to check
+        threshold: Similarity threshold (0-1)
+
+    Returns:
+        List of (i, j) index pairs where i < j and items are similar
+    """
+    duplicates = []
+
+    # Pre-compute n-grams
+    ngrams = [get_ngrams(get_item_text(item)) for item in items]
+
+    for i in range(len(items)):
+        for j in range(i + 1, len(items)):
+            similarity = jaccard_similarity(ngrams[i], ngrams[j])
+            if similarity >= threshold:
+                duplicates.append((i, j))
+
+    return duplicates
+
+
+def dedupe_items(
+    items: List[Union[schema.RedditItem, schema.XItem]],
+    threshold: float = 0.7,
+) -> List[Union[schema.RedditItem, schema.XItem]]:
+    """Remove near-duplicates, keeping highest-scored item.
+
+    Args:
+        items: List of items (should be pre-sorted by score descending)
+        threshold: Similarity threshold
+
+    Returns:
+        Deduplicated items
+    """
+    if len(items) <= 1:
+        return items
+
+    # Find duplicate pairs
+    dup_pairs = find_duplicates(items, threshold)
+
+    # Mark indices to remove (always remove the lower-scored one)
+    # Since items are pre-sorted by score, the second index is always lower
+    to_remove = set()
+    for i, j in dup_pairs:
+        # Keep the higher-scored one (lower index in sorted list)
+        if items[i].score >= items[j].score:
+            to_remove.add(j)
+        else:
+            to_remove.add(i)
+
+    # Return items not marked for removal
+    return [item for idx, item in enumerate(items) if idx not in to_remove]
+
+
+def dedupe_reddit(
+    items: List[schema.RedditItem],
+    threshold: float = 0.7,
+) -> List[schema.RedditItem]:
+    """Dedupe Reddit items."""
+    return dedupe_items(items, threshold)
+
+
+def dedupe_x(
+    items: List[schema.XItem],
+    threshold: float = 0.7,
+) -> List[schema.XItem]:
+    """Dedupe X items."""
+    return dedupe_items(items, threshold)