Merge branch 'vr0.5.0.post5'

This commit is contained in:
UncleCode
2025-03-25 12:24:07 +08:00
2 changed files with 21 additions and 9 deletions

View File

@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post4"
+__version__ = "0.5.0.post6"

View File

@@ -24,6 +24,7 @@ from crawl4ai import (
     BM25ContentFilter,
     PruningContentFilter,
     BrowserProfiler,
+    DefaultMarkdownGenerator,
     LLMConfig
 )
 from litellm import completion
@@ -614,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
 crawler_cfg = crawler_cfg.clone(**crawler)
 # Handle content filter config
-if filter_config:
-    filter_conf = load_config_file(filter_config)
+if filter_config or output in ["markdown-fit", "md-fit"]:
+    if filter_config:
+        filter_conf = load_config_file(filter_config)
+    elif not filter_config and output in ["markdown-fit", "md-fit"]:
+        filter_conf = {
+            "type": "pruning",
+            "query": "",
+            "threshold": 0.48
+        }
     if filter_conf["type"] == "bm25":
-        crawler_cfg.content_filter = BM25ContentFilter(
-            user_query=filter_conf.get("query"),
-            bm25_threshold=filter_conf.get("threshold", 1.0)
+        crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+            content_filter = BM25ContentFilter(
+                user_query=filter_conf.get("query"),
+                bm25_threshold=filter_conf.get("threshold", 1.0)
+            )
         )
     elif filter_conf["type"] == "pruning":
-        crawler_cfg.content_filter = PruningContentFilter(
-            user_query=filter_conf.get("query"),
-            threshold=filter_conf.get("threshold", 0.48)
+        crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+            content_filter = PruningContentFilter(
+                user_query=filter_conf.get("query"),
+                threshold=filter_conf.get("threshold", 0.48)
+            )
         )
# Handle extraction strategy # Handle extraction strategy