From 8b215e17afcd15015ee5a9b47d5e60e7c0c62c35 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Sun, 8 Jun 2025 06:57:37 +0200
Subject: [PATCH] Add use_stemming option to BM25ContentFilter (#1192)

---
 crawl4ai/cli.py                        |  3 ++-
 crawl4ai/content_filter_strategy.py    | 25 +++++++++++++++++--------
 docs/md_v2/core/markdown-generation.md |  2 +-
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51477d6b..33b313bc 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                 crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                     content_filter = BM25ContentFilter(
                         user_query=filter_conf.get("query"),
-                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                        bm25_threshold=filter_conf.get("threshold", 1.0),
+                        use_stemming=filter_conf.get("use_stemming", True),
                     )
                 )
             elif filter_conf["type"] == "pruning":
diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 4102cbad..1e764f74 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter):
         user_query: str = None,
         bm25_threshold: float = 1.0,
         language: str = "english",
+        use_stemming: bool = True,
     ):
         """
         Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
@@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter):
             user_query (str): User query for filtering (optional).
             bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
             language (str): Language for stemming (default: 'english').
+            use_stemming (bool): Whether to apply stemming (default: True).
         """
         super().__init__(user_query=user_query)
         self.bm25_threshold = bm25_threshold
+        self.use_stemming = use_stemming
         self.priority_tags = {
             "h1": 5.0,
             "h2": 4.0,
@@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter):
             "pre": 1.5,
             "th": 1.5,  # Table headers
         }
-        self.stemmer = stemmer(language)
+        self.stemmer = stemmer(language) if use_stemming else None
 
     def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
         """
@@ -479,13 +482,19 @@ class BM25ContentFilter(RelevantContentFilter):
         #                 for _, chunk, _, _ in candidates]
         # tokenized_query = [ps.stem(word) for word in query.lower().split()]
 
-        tokenized_corpus = [
-            [self.stemmer.stemWord(word) for word in chunk.lower().split()]
-            for _, chunk, _, _ in candidates
-        ]
-        tokenized_query = [
-            self.stemmer.stemWord(word) for word in query.lower().split()
-        ]
+        if self.use_stemming:
+            tokenized_corpus = [
+                [self.stemmer.stemWord(word) for word in chunk.lower().split()]
+                for _, chunk, _, _ in candidates
+            ]
+            tokenized_query = [
+                self.stemmer.stemWord(word) for word in query.lower().split()
+            ]
+        else:
+            tokenized_corpus = [
+                chunk.lower().split() for _, chunk, _, _ in candidates
+            ]
+            tokenized_query = query.lower().split()
 
         # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
         #            for _, chunk, _, _ in candidates]
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md
index d4cad79b..eccb115a 100644
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
-- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`use_stemming`** *(default `True`)*: If enabled, query and content words are reduced to their stems before scoring, so variations of a word match (e.g., “learn,” “learning,” “learnt”).
 
 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.