From e95e8e1a974ebee12ba42a21387fd7ecc7d8fec9 Mon Sep 17 00:00:00 2001
From: Chris Murphy
Date: Mon, 1 Dec 2025 16:16:31 -0500
Subject: [PATCH] generalized query in ContentRelevanceFilter to be a str or list

---
 crawl4ai/deep_crawling/filters.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 981cbcd8..c075cb7d 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
 class ContentRelevanceFilter(URLFilter):
     """BM25-based relevance filter using head section content"""
 
-    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")
 
     def __init__(
         self,
-        query: str,
+        query: Union[str, List[str]],
         threshold: float,
         k1: float = 1.2,
         b: float = 0.75,
         avgdl: int = 1000,
     ):
         super().__init__(name="BM25RelevanceFilter")
-        self.query_terms = self._tokenize(query)
+        if isinstance(query, list):
+            self.query = " ".join(query)
+        else:
+            self.query = query
+        self.query_terms = self._tokenize(self.query)
         self.threshold = threshold
         self.k1 = k1  # TF saturation parameter
         self.b = b  # Length normalization parameter