Merge branch 'vr0.5.0.post5'
This commit is contained in:
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0.post4"
|
__version__ = "0.5.0.post6"
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from crawl4ai import (
|
|||||||
BM25ContentFilter,
|
BM25ContentFilter,
|
||||||
PruningContentFilter,
|
PruningContentFilter,
|
||||||
BrowserProfiler,
|
BrowserProfiler,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
LLMConfig
|
LLMConfig
|
||||||
)
|
)
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
@@ -614,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
|||||||
crawler_cfg = crawler_cfg.clone(**crawler)
|
crawler_cfg = crawler_cfg.clone(**crawler)
|
||||||
|
|
||||||
# Handle content filter config
|
# Handle content filter config
|
||||||
if filter_config:
|
if filter_config or output in ["markdown-fit", "md-fit"]:
|
||||||
filter_conf = load_config_file(filter_config)
|
if filter_config:
|
||||||
|
filter_conf = load_config_file(filter_config)
|
||||||
|
elif not filter_config and output in ["markdown-fit", "md-fit"]:
|
||||||
|
filter_conf = {
|
||||||
|
"type": "pruning",
|
||||||
|
"query": "",
|
||||||
|
"threshold": 0.48
|
||||||
|
}
|
||||||
if filter_conf["type"] == "bm25":
|
if filter_conf["type"] == "bm25":
|
||||||
crawler_cfg.content_filter = BM25ContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = BM25ContentFilter(
|
||||||
bm25_threshold=filter_conf.get("threshold", 1.0)
|
user_query=filter_conf.get("query"),
|
||||||
|
bm25_threshold=filter_conf.get("threshold", 1.0)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
elif filter_conf["type"] == "pruning":
|
elif filter_conf["type"] == "pruning":
|
||||||
crawler_cfg.content_filter = PruningContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = PruningContentFilter(
|
||||||
threshold=filter_conf.get("threshold", 0.48)
|
user_query=filter_conf.get("query"),
|
||||||
|
threshold=filter_conf.get("threshold", 0.48)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle extraction strategy
|
# Handle extraction strategy
|
||||||
|
|||||||
Reference in New Issue
Block a user