Add use_stemming option to BM25ContentFilter

2025-06-08 06:56:33 +02:00
3 changed files with 20 additions and 10 deletions
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                    content_filter = BM25ContentFilter(
                        user_query=filter_conf.get("query"),
-                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                        bm25_threshold=filter_conf.get("threshold", 1.0),
+                        use_stemming=filter_conf.get("use_stemming", True),
                    )
                )
            elif filter_conf["type"] == "pruning":
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter):
        user_query: str = None,
        bm25_threshold: float = 1.0,
        language: str = "english",
+        use_stemming: bool = True,
    ):
        """
        Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
@@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter):
            user_query (str): User query for filtering (optional).
            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
            language (str): Language for stemming (default: 'english').
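+            use_stemming (bool): Whether to stem query and content words before scoring (default: True). If False, no stemmer is created and the lowercased, whitespace-split words are compared as-is.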
        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
+        self.use_stemming = use_stemming
        self.priority_tags = {
            "h1": 5.0,
            "h2": 4.0,
@@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter):
            "pre": 1.5,
            "th": 1.5,  # Table headers
        }
-        self.stemmer = stemmer(language)
+        self.stemmer = stemmer(language) if use_stemming else None

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
@@ -479,6 +482,7 @@ class BM25ContentFilter(RelevantContentFilter):
        #                 for _, chunk, _, _ in candidates]
        # tokenized_query = [ps.stem(word) for word in query.lower().split()]

+        if self.use_stemming:
            tokenized_corpus = [
                [self.stemmer.stemWord(word) for word in chunk.lower().split()]
                for _, chunk, _, _ in candidates
@@ -486,6 +490,11 @@ class BM25ContentFilter(RelevantContentFilter):
            tokenized_query = [
                self.stemmer.stemWord(word) for word in query.lower().split()
            ]
+        else:
+            tokenized_corpus = [
+                chunk.lower().split() for _, chunk, _, _ in candidates
+            ]
+            tokenized_query = query.lower().split()

        # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
        #            for _, chunk, _, _ in candidates]
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)

 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
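+- **`use_stemming`** *(default `True`)*: If enabled, query and content words are stemmed so that variations of a word match (e.g., “learn,” “learning,” “learned”). Set it to `False` to match only the exact (lowercased) words.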

 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.