feat(filters): add reverse option to URLPatternFilter

Adds a new 'reverse' parameter (default False) to URLPatternFilter that inverts the filter's result. When reverse=True, URLs that match any configured pattern are rejected, and URLs that match none are accepted.

Also removes unused 'scraped_html' from WebScrapingStrategy output to reduce memory usage.

BREAKING CHANGE: WebScrapingStrategy no longer returns 'scraped_html' in its output dictionary; callers that read this key will no longer find it.
This commit is contained in:
UncleCode
2025-03-08 18:54:41 +08:00
parent 4aeb7ef9ad
commit c6a605ccce
2 changed files with 19 additions and 12 deletions

View File

@@ -848,7 +848,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return { return {
# **markdown_content, # **markdown_content,
"scraped_html": html, # "scraped_html": html,
"cleaned_html": cleaned_html, "cleaned_html": cleaned_html,
"success": success, "success": success,
"media": media, "media": media,

View File

@@ -124,6 +124,7 @@ class URLPatternFilter(URLFilter):
"_simple_prefixes", "_simple_prefixes",
"_domain_patterns", "_domain_patterns",
"_path_patterns", "_path_patterns",
"_reverse",
) )
PATTERN_TYPES = { PATTERN_TYPES = {
@@ -138,8 +139,10 @@ class URLPatternFilter(URLFilter):
self, self,
patterns: Union[str, Pattern, List[Union[str, Pattern]]], patterns: Union[str, Pattern, List[Union[str, Pattern]]],
use_glob: bool = True, use_glob: bool = True,
reverse: bool = False,
): ):
super().__init__() super().__init__()
self._reverse = reverse
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
self._simple_suffixes = set() self._simple_suffixes = set()
@@ -205,36 +208,40 @@ class URLPatternFilter(URLFilter):
@lru_cache(maxsize=10000) @lru_cache(maxsize=10000)
def apply(self, url: str) -> bool: def apply(self, url: str) -> bool:
"""Hierarchical pattern matching"""
# Quick suffix check (*.html) # Quick suffix check (*.html)
if self._simple_suffixes: if self._simple_suffixes:
path = url.split("?")[0] path = url.split("?")[0]
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes: if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
self._update_stats(True) result = True
return True self._update_stats(result)
return not result if self._reverse else result
# Domain check # Domain check
if self._domain_patterns: if self._domain_patterns:
for pattern in self._domain_patterns: for pattern in self._domain_patterns:
if pattern.match(url): if pattern.match(url):
self._update_stats(True) result = True
return True self._update_stats(result)
return not result if self._reverse else result
# Prefix check (/foo/*) # Prefix check (/foo/*)
if self._simple_prefixes: if self._simple_prefixes:
path = url.split("?")[0] path = url.split("?")[0]
if any(path.startswith(p) for p in self._simple_prefixes): if any(path.startswith(p) for p in self._simple_prefixes):
self._update_stats(True) result = True
return True self._update_stats(result)
return not result if self._reverse else result
# Complex patterns # Complex patterns
if self._path_patterns: if self._path_patterns:
if any(p.search(url) for p in self._path_patterns): if any(p.search(url) for p in self._path_patterns):
self._update_stats(True) result = True
return True self._update_stats(result)
return not result if self._reverse else result
self._update_stats(False) result = False
return False self._update_stats(result)
return not result if self._reverse else result
class ContentTypeFilter(URLFilter): class ContentTypeFilter(URLFilter):