feat: integrate last30days and daily-news-report skills
This commit is contained in:
120
skills/last30days/scripts/lib/dedupe.py
Normal file
120
skills/last30days/scripts/lib/dedupe.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Near-duplicate detection for last30days skill."""
|
||||
|
||||
import re
|
||||
from typing import List, Set, Tuple, Union
|
||||
|
||||
from . import schema
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
|
||||
"""Normalize text for comparison.
|
||||
|
||||
- Lowercase
|
||||
- Remove punctuation
|
||||
- Collapse whitespace
|
||||
"""
|
||||
text = text.lower()
|
||||
text = re.sub(r'[^\w\s]', ' ', text)
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def get_ngrams(text: str, n: int = 3) -> Set[str]:
|
||||
"""Get character n-grams from text."""
|
||||
text = normalize_text(text)
|
||||
if len(text) < n:
|
||||
return {text}
|
||||
return {text[i:i+n] for i in range(len(text) - n + 1)}
|
||||
|
||||
|
||||
def jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
|
||||
"""Compute Jaccard similarity between two sets."""
|
||||
if not set1 or not set2:
|
||||
return 0.0
|
||||
intersection = len(set1 & set2)
|
||||
union = len(set1 | set2)
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
def get_item_text(item: Union[schema.RedditItem, schema.XItem]) -> str:
|
||||
"""Get comparable text from an item."""
|
||||
if isinstance(item, schema.RedditItem):
|
||||
return item.title
|
||||
else:
|
||||
return item.text
|
||||
|
||||
|
||||
def find_duplicates(
|
||||
items: List[Union[schema.RedditItem, schema.XItem]],
|
||||
threshold: float = 0.7,
|
||||
) -> List[Tuple[int, int]]:
|
||||
"""Find near-duplicate pairs in items.
|
||||
|
||||
Args:
|
||||
items: List of items to check
|
||||
threshold: Similarity threshold (0-1)
|
||||
|
||||
Returns:
|
||||
List of (i, j) index pairs where i < j and items are similar
|
||||
"""
|
||||
duplicates = []
|
||||
|
||||
# Pre-compute n-grams
|
||||
ngrams = [get_ngrams(get_item_text(item)) for item in items]
|
||||
|
||||
for i in range(len(items)):
|
||||
for j in range(i + 1, len(items)):
|
||||
similarity = jaccard_similarity(ngrams[i], ngrams[j])
|
||||
if similarity >= threshold:
|
||||
duplicates.append((i, j))
|
||||
|
||||
return duplicates
|
||||
|
||||
|
||||
def dedupe_items(
|
||||
items: List[Union[schema.RedditItem, schema.XItem]],
|
||||
threshold: float = 0.7,
|
||||
) -> List[Union[schema.RedditItem, schema.XItem]]:
|
||||
"""Remove near-duplicates, keeping highest-scored item.
|
||||
|
||||
Args:
|
||||
items: List of items (should be pre-sorted by score descending)
|
||||
threshold: Similarity threshold
|
||||
|
||||
Returns:
|
||||
Deduplicated items
|
||||
"""
|
||||
if len(items) <= 1:
|
||||
return items
|
||||
|
||||
# Find duplicate pairs
|
||||
dup_pairs = find_duplicates(items, threshold)
|
||||
|
||||
# Mark indices to remove (always remove the lower-scored one)
|
||||
# Since items are pre-sorted by score, the second index is always lower
|
||||
to_remove = set()
|
||||
for i, j in dup_pairs:
|
||||
# Keep the higher-scored one (lower index in sorted list)
|
||||
if items[i].score >= items[j].score:
|
||||
to_remove.add(j)
|
||||
else:
|
||||
to_remove.add(i)
|
||||
|
||||
# Return items not marked for removal
|
||||
return [item for idx, item in enumerate(items) if idx not in to_remove]
|
||||
|
||||
|
||||
def dedupe_reddit(
|
||||
items: List[schema.RedditItem],
|
||||
threshold: float = 0.7,
|
||||
) -> List[schema.RedditItem]:
|
||||
"""Dedupe Reddit items."""
|
||||
return dedupe_items(items, threshold)
|
||||
|
||||
|
||||
def dedupe_x(
|
||||
items: List[schema.XItem],
|
||||
threshold: float = 0.7,
|
||||
) -> List[schema.XItem]:
|
||||
"""Dedupe X items."""
|
||||
return dedupe_items(items, threshold)
|
||||
Reference in New Issue
Block a user