From 2b17f234f8354dca893063b68aa3ec41431c5d3c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 15:20:36 +0530 Subject: [PATCH] docs: update direct passing of content_filter to CrawlerRunConfig and instead pass it via MarkdownGenerator. Ref: #603 --- deploy/docker/c4ai-doc-context.md | 11 ++++++++--- docs/md_v2/core/markdown-generation.md | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..081f29b7 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -403,7 +403,7 @@ async def main(): md_generator = DefaultMarkdownGenerator( content_filter=filter, - options={"ignore_links": True} + options={"ignore_links": True}) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( @@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -4175,8 +4175,13 @@ async def main(): verbose=True ) + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator ) async with AsyncWebCrawler() as crawler: diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index e6f5e12a..4a6e9218 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -233,7 +233,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -255,9 +255,12 @@ async def main(): chunk_token_threshold=4096, # Adjust based on your needs verbose=True ) - + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator, ) async with AsyncWebCrawler() as crawler: