# Patch summary (text before the first "diff --git" header is commentary only):
#   * content_scraping_strategy.py: comments out the "scraped_html" entry of
#     the returned dict.
#   * deep_crawling/filters.py: URLPatternFilter gains a `_reverse` slot and a
#     `reverse: bool = False` constructor argument; every return in `apply`
#     now yields `not result` when `self._reverse` is set.
# NOTE(review): `_update_stats` is still called with the raw match flag, not
#   the (possibly inverted) value that `apply` returns -- confirm intended.
# NOTE(review): `apply` is wrapped in `lru_cache`, so its body (including the
#   `_update_stats` call) only runs on a cache miss -- confirm intended.
# NOTE(review): the hunk below also removes the `apply` docstring.
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index ba70dc11..1b80c566 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -848,7 +848,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 
         return {
             # **markdown_content,
-            "scraped_html": html,
+            # "scraped_html": html,
             "cleaned_html": cleaned_html,
             "success": success,
             "media": media,
diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 9fd8a72a..122be482 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -124,6 +124,7 @@ class URLPatternFilter(URLFilter):
         "_simple_prefixes",
         "_domain_patterns",
         "_path_patterns",
+        "_reverse",
     )
 
     PATTERN_TYPES = {
@@ -138,8 +139,10 @@ class URLPatternFilter(URLFilter):
         self,
         patterns: Union[str, Pattern, List[Union[str, Pattern]]],
         use_glob: bool = True,
+        reverse: bool = False,
     ):
         super().__init__()
+        self._reverse = reverse
         patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
 
         self._simple_suffixes = set()
@@ -205,36 +208,40 @@ class URLPatternFilter(URLFilter):
 
     @lru_cache(maxsize=10000)
     def apply(self, url: str) -> bool:
-        """Hierarchical pattern matching"""
         # Quick suffix check (*.html)
         if self._simple_suffixes:
             path = url.split("?")[0]
             if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
-                self._update_stats(True)
-                return True
+                result = True
+                self._update_stats(result)
+                return not result if self._reverse else result
 
         # Domain check
         if self._domain_patterns:
             for pattern in self._domain_patterns:
                 if pattern.match(url):
-                    self._update_stats(True)
-                    return True
+                    result = True
+                    self._update_stats(result)
+                    return not result if self._reverse else result
 
         # Prefix check (/foo/*)
         if self._simple_prefixes:
             path = url.split("?")[0]
             if any(path.startswith(p) for p in self._simple_prefixes):
-                self._update_stats(True)
-                return True
+                result = True
+                self._update_stats(result)
+                return not result if self._reverse else result
 
         # Complex patterns
         if self._path_patterns:
             if any(p.search(url) for p in self._path_patterns):
-                self._update_stats(True)
-                return True
+                result = True
+                self._update_stats(result)
+                return not result if self._reverse else result
 
-        self._update_stats(False)
-        return False
+        result = False
+        self._update_stats(result)
+        return not result if self._reverse else result
 
 
 class ContentTypeFilter(URLFilter):