feat(cli): enhance markdown generation with default content filters

Add DefaultMarkdownGenerator integration and automatic content filtering for markdown output formats. When using 'markdown-fit' or 'md-fit' output formats, automatically apply PruningContentFilter with default settings if no filter config is provided.

This change improves the user experience by providing sensible defaults for markdown generation while maintaining the ability to customize filtering behavior.
This commit is contained in:
UncleCode
2025-03-25 11:56:00 +08:00
parent f78c46446b
commit 1107fa1d62
2 changed files with 22 additions and 9 deletions

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.5.0.post4"
__version__ = "0.5.0.post5"

View File

@@ -2,6 +2,7 @@ import click
import os
import time
from fastapi.datastructures import Default
import humanize
from typing import Dict, Any, Optional, List
import json
@@ -24,6 +25,7 @@ from crawl4ai import (
BM25ContentFilter,
PruningContentFilter,
BrowserProfiler,
DefaultMarkdownGenerator,
LLMConfig
)
from litellm import completion
@@ -614,17 +616,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
crawler_cfg = crawler_cfg.clone(**crawler)
# Handle content filter config
if filter_config:
filter_conf = load_config_file(filter_config)
if filter_config or output in ["markdown-fit", "md-fit"]:
if filter_config:
filter_conf = load_config_file(filter_config)
elif not filter_config and output in ["markdown-fit", "md-fit"]:
filter_conf = {
"type": "pruning",
"query": "",
"threshold": 0.48
}
if filter_conf["type"] == "bm25":
crawler_cfg.content_filter = BM25ContentFilter(
user_query=filter_conf.get("query"),
bm25_threshold=filter_conf.get("threshold", 1.0)
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
content_filter = BM25ContentFilter(
user_query=filter_conf.get("query"),
bm25_threshold=filter_conf.get("threshold", 1.0)
)
)
elif filter_conf["type"] == "pruning":
crawler_cfg.content_filter = PruningContentFilter(
user_query=filter_conf.get("query"),
threshold=filter_conf.get("threshold", 0.48)
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
content_filter = PruningContentFilter(
user_query=filter_conf.get("query"),
threshold=filter_conf.get("threshold", 0.48)
)
)
# Handle extraction strategy