Merge branch '2025-MAY-2' into next-MAY

2025-07-08 11:46:13 +02:00
parent 1a73fb60db 414f16e975
commit 0f210f6e02
28 changed files with 448 additions and 154 deletions
--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
 from crawl4ai import JsonCssExtractionStrategy

 async def main():
@@ -298,7 +298,7 @@ async def main():
    # 3) Example LLM content filtering

    gemini_config = LLMConfig(
-        provider="gemini/gemini-1.5-pro" 
+        provider="gemini/gemini-1.5-pro", 
        api_token = "env:GEMINI_API_TOKEN"
    )

@@ -322,8 +322,9 @@ async def main():
    )

    md_generator = DefaultMarkdownGenerator(
-    content_filter=filter,
-    options={"ignore_links": True}
+        content_filter=filter,
+        options={"ignore_links": True}
+    )

    # 4) Crawler run config: skip cache, use extraction
    run_conf = CrawlerRunConfig(
--- a/docs/md_v2/core/cli.md
+++ b/docs/md_v2/core/cli.md
@@ -17,6 +17,9 @@
 - [Configuration Reference](#configuration-reference)
 - [Best Practices & Tips](#best-practices--tips)

+## Installation
+The Crawl4AI CLI (`crwl`) is installed automatically when you install the library.
+
 ## Basic Usage

 The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
--- a/docs/md_v2/core/local-files.md
+++ b/docs/md_v2/core/local-files.md
@@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/apple", 
@@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`.

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
    local_file_path = "/path/to/apple.html"  # Replace with your file path
    file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=file_url, config=config)
@@ -93,7 +93,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -104,7 +104,7 @@ async def main():
    async with AsyncWebCrawler() as crawler:
        # Step 1: Crawl the Web URL
        print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(url=wikipedia_url, config=web_config)

        if not result.success:
@@ -119,7 +119,7 @@ async def main():
        # Step 2: Crawl from the Local HTML File
        print("=== Step 2: Crawling from the Local HTML File ===")
        file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        local_result = await crawler.arun(url=file_url, config=file_config)

        if not local_result.success:
@@ -135,7 +135,7 @@ async def main():
        with open(html_file_path, 'r', encoding='utf-8') as f:
            raw_html_content = f.read()
        raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

        if not raw_result.success:
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -201,6 +201,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
 - **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`language`** *(`str`, default `'english'`)*: The language used for stemming.

 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.

@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +256,12 @@ async def main():
        chunk_token_threshold=4096,  # Adjust based on your needs
        verbose=True
    )
-
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
    config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
    )

    async with AsyncWebCrawler() as crawler: