From 1107fa1d62f3a4157fa56bdfc148f4fc0d9a766c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 25 Mar 2025 11:56:00 +0800 Subject: [PATCH 1/2] feat(cli): enhance markdown generation with default content filters Add DefaultMarkdownGenerator integration and automatic content filtering for markdown output formats. When using 'markdown-fit' or 'md-fit' output formats, automatically apply PruningContentFilter with default settings if no filter config is provided. This change improves the user experience by providing sensible defaults for markdown generation while maintaining the ability to customize filtering behavior. --- crawl4ai/__version__.py | 2 +- crawl4ai/cli.py | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 9477177b..4cf6a154 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post4" +__version__ = "0.5.0.post5" diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index cb6e7063..0399a866 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -2,6 +2,7 @@ import click import os import time +from fastapi.datastructures import Default import humanize from typing import Dict, Any, Optional, List import json @@ -24,6 +25,7 @@ from crawl4ai import ( BM25ContentFilter, PruningContentFilter, BrowserProfiler, + DefaultMarkdownGenerator, LLMConfig ) from litellm import completion @@ -614,17 +616,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: crawler_cfg = crawler_cfg.clone(**crawler) # Handle content filter config - if filter_config: - filter_conf = load_config_file(filter_config) + if filter_config or output in ["markdown-fit", "md-fit"]: + if filter_config: + filter_conf = load_config_file(filter_config) + elif not filter_config and output in ["markdown-fit", "md-fit"]: + filter_conf = { + "type": "pruning", + "query": "", + "threshold": 0.48 + } if filter_conf["type"] == "bm25": - crawler_cfg.content_filter = BM25ContentFilter( - user_query=filter_conf.get("query"), - bm25_threshold=filter_conf.get("threshold", 1.0) + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( + content_filter = BM25ContentFilter( + user_query=filter_conf.get("query"), + bm25_threshold=filter_conf.get("threshold", 1.0) + ) ) elif filter_conf["type"] == "pruning": - crawler_cfg.content_filter = PruningContentFilter( - user_query=filter_conf.get("query"), - threshold=filter_conf.get("threshold", 0.48) + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( + content_filter = PruningContentFilter( + user_query=filter_conf.get("query"), + threshold=filter_conf.get("threshold", 0.48) + ) ) # Handle extraction strategy From bdd9db579ab6b6df959b4c57ab6f1c74347ae7bb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 25 Mar 2025 12:01:36 +0800 Subject: [PATCH 2/2] chore(version): bump version to 0.5.0.post6 refactor(cli): remove unused import from FastAPI --- crawl4ai/__version__.py | 2 +- crawl4ai/cli.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 4cf6a154..0f5d9ee7 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post5" +__version__ = "0.5.0.post6" diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 0399a866..212e44b1 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -2,7 +2,6 @@ import click import os import time -from fastapi.datastructures import Default import humanize from typing import Dict, Any, Optional, List import json