Compare commits
1 Commits
feat/ahmed
...
devin/1748
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c8d28316b9 |
@@ -137,7 +137,7 @@ if __name__ == "__main__":
|
||||
- Higher → fewer chunks but more relevant.
|
||||
- Lower → more inclusive.
|
||||
|
||||
> In more advanced scenarios, you might see parameters like `use_stemming`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted.
|
||||
> In more advanced scenarios, you might see parameters like `language`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted.
|
||||
|
||||
---
|
||||
|
||||
@@ -242,4 +242,4 @@ class MyCustomFilter(RelevantContentFilter):
|
||||
|
||||
With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
|
||||
|
||||
- Last Updated: 2025-01-01
|
||||
- Last Updated: 2025-01-01
|
||||
|
||||
@@ -187,7 +187,7 @@ from crawl4ai import CrawlerRunConfig
|
||||
bm25_filter = BM25ContentFilter(
|
||||
user_query="machine learning",
|
||||
bm25_threshold=1.2,
|
||||
use_stemming=True
|
||||
language="english"
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
|
||||
Reference in New Issue
Block a user