From 8b215e17afcd15015ee5a9b47d5e60e7c0c62c35 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 8 Jun 2025 06:57:37 +0200 Subject: [PATCH] Add use_stemming option to BM25ContentFilter (#1192) --- crawl4ai/cli.py | 3 ++- crawl4ai/content_filter_strategy.py | 25 +++++++++++++++++-------- docs/md_v2/core/markdown-generation.md | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 51477d6b..33b313bc 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: crawler_cfg.markdown_generator = DefaultMarkdownGenerator( content_filter = BM25ContentFilter( user_query=filter_conf.get("query"), - bm25_threshold=filter_conf.get("threshold", 1.0) + bm25_threshold=filter_conf.get("threshold", 1.0), + use_stemming=filter_conf.get("use_stemming", True), ) ) elif filter_conf["type"] == "pruning": diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 4102cbad..1e764f74 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter): user_query: str = None, bm25_threshold: float = 1.0, language: str = "english", + use_stemming: bool = True, ): """ Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. @@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter): user_query (str): User query for filtering (optional). bm25_threshold (float): BM25 threshold for filtering (default: 1.0). language (str): Language for stemming (default: 'english'). + use_stemming (bool): Whether to apply stemming (default: True). 
""" super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold + self.use_stemming = use_stemming self.priority_tags = { "h1": 5.0, "h2": 4.0, @@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter): "pre": 1.5, "th": 1.5, # Table headers } - self.stemmer = stemmer(language) + self.stemmer = stemmer(language) if use_stemming else None def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: """ @@ -479,13 +482,19 @@ class BM25ContentFilter(RelevantContentFilter): # for _, chunk, _, _ in candidates] # tokenized_query = [ps.stem(word) for word in query.lower().split()] - tokenized_corpus = [ - [self.stemmer.stemWord(word) for word in chunk.lower().split()] - for _, chunk, _, _ in candidates - ] - tokenized_query = [ - self.stemmer.stemWord(word) for word in query.lower().split() - ] + if self.use_stemming: + tokenized_corpus = [ + [self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates + ] + tokenized_query = [ + self.stemmer.stemWord(word) for word in query.lower().split() + ] + else: + tokenized_corpus = [ + chunk.lower().split() for _, chunk, _, _ in candidates + ] + tokenized_query = query.lower().split() # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] # for _, chunk, _, _ in candidates] diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index d4cad79b..eccb115a 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator) - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. -- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). 
+- **`use_stemming`** *(default `True`)*: If enabled, words are reduced to their stems before scoring, so regular variations of a word match (e.g., “learn,” “learning,” “learned”). **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.