feat(cli): enhance markdown generation with default content filters
Integrate DefaultMarkdownGenerator into the CLI and add automatic content filtering for markdown output formats. Content filters (BM25 or pruning) are now wrapped in a DefaultMarkdownGenerator and assigned to the crawler config's markdown_generator. When the output format is 'markdown-fit' or 'md-fit' and no filter config is provided, a PruningContentFilter with default settings (empty query, threshold 0.48) is applied automatically. This change improves the user experience by providing sensible defaults for markdown generation while maintaining the ability to customize filtering behavior.
This commit is contained in:
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0.post4"
|
__version__ = "0.5.0.post5"
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import click
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from fastapi.datastructures import Default
|
||||||
import humanize
|
import humanize
|
||||||
from typing import Dict, Any, Optional, List
|
from typing import Dict, Any, Optional, List
|
||||||
import json
|
import json
|
||||||
@@ -24,6 +25,7 @@ from crawl4ai import (
|
|||||||
BM25ContentFilter,
|
BM25ContentFilter,
|
||||||
PruningContentFilter,
|
PruningContentFilter,
|
||||||
BrowserProfiler,
|
BrowserProfiler,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
LLMConfig
|
LLMConfig
|
||||||
)
|
)
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
@@ -614,17 +616,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
|||||||
crawler_cfg = crawler_cfg.clone(**crawler)
|
crawler_cfg = crawler_cfg.clone(**crawler)
|
||||||
|
|
||||||
# Handle content filter config
|
# Handle content filter config
|
||||||
if filter_config:
|
if filter_config or output in ["markdown-fit", "md-fit"]:
|
||||||
filter_conf = load_config_file(filter_config)
|
if filter_config:
|
||||||
|
filter_conf = load_config_file(filter_config)
|
||||||
|
elif not filter_config and output in ["markdown-fit", "md-fit"]:
|
||||||
|
filter_conf = {
|
||||||
|
"type": "pruning",
|
||||||
|
"query": "",
|
||||||
|
"threshold": 0.48
|
||||||
|
}
|
||||||
if filter_conf["type"] == "bm25":
|
if filter_conf["type"] == "bm25":
|
||||||
crawler_cfg.content_filter = BM25ContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = BM25ContentFilter(
|
||||||
bm25_threshold=filter_conf.get("threshold", 1.0)
|
user_query=filter_conf.get("query"),
|
||||||
|
bm25_threshold=filter_conf.get("threshold", 1.0)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
elif filter_conf["type"] == "pruning":
|
elif filter_conf["type"] == "pruning":
|
||||||
crawler_cfg.content_filter = PruningContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = PruningContentFilter(
|
||||||
threshold=filter_conf.get("threshold", 0.48)
|
user_query=filter_conf.get("query"),
|
||||||
|
threshold=filter_conf.get("threshold", 0.48)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle extraction strategy
|
# Handle extraction strategy
|
||||||
|
|||||||
Reference in New Issue
Block a user