feat(filters): add reverse option to URLPatternFilter
Adds a new 'reverse' parameter to URLPatternFilter that inverts the filter's logic: when reverse=True, URLs that would normally match are rejected, and vice versa. Also removes the unused 'scraped_html' key from the WebScrapingStrategy output to reduce memory usage. BREAKING CHANGE: WebScrapingStrategy no longer returns 'scraped_html' in its output dictionary.
This commit is contained in:
@@ -848,7 +848,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
# **markdown_content,
|
# **markdown_content,
|
||||||
"scraped_html": html,
|
# "scraped_html": html,
|
||||||
"cleaned_html": cleaned_html,
|
"cleaned_html": cleaned_html,
|
||||||
"success": success,
|
"success": success,
|
||||||
"media": media,
|
"media": media,
|
||||||
|
|||||||
@@ -124,6 +124,7 @@ class URLPatternFilter(URLFilter):
|
|||||||
"_simple_prefixes",
|
"_simple_prefixes",
|
||||||
"_domain_patterns",
|
"_domain_patterns",
|
||||||
"_path_patterns",
|
"_path_patterns",
|
||||||
|
"_reverse",
|
||||||
)
|
)
|
||||||
|
|
||||||
PATTERN_TYPES = {
|
PATTERN_TYPES = {
|
||||||
@@ -138,8 +139,10 @@ class URLPatternFilter(URLFilter):
|
|||||||
self,
|
self,
|
||||||
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
||||||
use_glob: bool = True,
|
use_glob: bool = True,
|
||||||
|
reverse: bool = False,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self._reverse = reverse
|
||||||
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
||||||
|
|
||||||
self._simple_suffixes = set()
|
self._simple_suffixes = set()
|
||||||
@@ -205,36 +208,40 @@ class URLPatternFilter(URLFilter):
|
|||||||
|
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def apply(self, url: str) -> bool:
|
def apply(self, url: str) -> bool:
|
||||||
"""Hierarchical pattern matching"""
|
|
||||||
# Quick suffix check (*.html)
|
# Quick suffix check (*.html)
|
||||||
if self._simple_suffixes:
|
if self._simple_suffixes:
|
||||||
path = url.split("?")[0]
|
path = url.split("?")[0]
|
||||||
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
|
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
|
||||||
self._update_stats(True)
|
result = True
|
||||||
return True
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
# Domain check
|
# Domain check
|
||||||
if self._domain_patterns:
|
if self._domain_patterns:
|
||||||
for pattern in self._domain_patterns:
|
for pattern in self._domain_patterns:
|
||||||
if pattern.match(url):
|
if pattern.match(url):
|
||||||
self._update_stats(True)
|
result = True
|
||||||
return True
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
# Prefix check (/foo/*)
|
# Prefix check (/foo/*)
|
||||||
if self._simple_prefixes:
|
if self._simple_prefixes:
|
||||||
path = url.split("?")[0]
|
path = url.split("?")[0]
|
||||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
if any(path.startswith(p) for p in self._simple_prefixes):
|
||||||
self._update_stats(True)
|
result = True
|
||||||
return True
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
# Complex patterns
|
# Complex patterns
|
||||||
if self._path_patterns:
|
if self._path_patterns:
|
||||||
if any(p.search(url) for p in self._path_patterns):
|
if any(p.search(url) for p in self._path_patterns):
|
||||||
self._update_stats(True)
|
result = True
|
||||||
return True
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
self._update_stats(False)
|
result = False
|
||||||
return False
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
|
|
||||||
class ContentTypeFilter(URLFilter):
|
class ContentTypeFilter(URLFilter):
|
||||||
|
|||||||
Reference in New Issue
Block a user