Compare commits

...

1 Commits

Author SHA1 Message Date
UncleCode
5100dd28be Add use_stemming option to BM25ContentFilter 2025-06-08 06:56:33 +02:00
3 changed files with 20 additions and 10 deletions

View File

@@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
content_filter = BM25ContentFilter(
user_query=filter_conf.get("query"),
bm25_threshold=filter_conf.get("threshold", 1.0)
bm25_threshold=filter_conf.get("threshold", 1.0),
use_stemming=filter_conf.get("use_stemming", True),
)
)
elif filter_conf["type"] == "pruning":

View File

@@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter):
user_query: str = None,
bm25_threshold: float = 1.0,
language: str = "english",
use_stemming: bool = True,
):
"""
Initializes the BM25ContentFilter class. If no user query is provided, falls back to page metadata.
@@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter):
user_query (str): User query for filtering (optional).
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
language (str): Language for stemming (default: 'english').
use_stemming (bool): Whether to apply stemming (default: True).
"""
super().__init__(user_query=user_query)
self.bm25_threshold = bm25_threshold
self.use_stemming = use_stemming
self.priority_tags = {
"h1": 5.0,
"h2": 4.0,
@@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter):
"pre": 1.5,
"th": 1.5, # Table headers
}
self.stemmer = stemmer(language)
self.stemmer = stemmer(language) if use_stemming else None
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
"""
@@ -479,6 +482,7 @@ class BM25ContentFilter(RelevantContentFilter):
# for _, chunk, _, _ in candidates]
# tokenized_query = [ps.stem(word) for word in query.lower().split()]
if self.use_stemming:
tokenized_corpus = [
[self.stemmer.stemWord(word) for word in chunk.lower().split()]
for _, chunk, _, _ in candidates
@@ -486,6 +490,11 @@ class BM25ContentFilter(RelevantContentFilter):
tokenized_query = [
self.stemmer.stemWord(word) for word in query.lower().split()
]
else:
tokenized_corpus = [
chunk.lower().split() for _, chunk, _, _ in candidates
]
tokenized_query = query.lower().split()
# tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
# for _, chunk, _, _ in candidates]

View File

@@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
**No query provided?** BM25 tries to glean context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with a low generic score. Realistically, you want to supply a query for best results.