From 91a5fea11fc21942c32b55517391f178b47bbe73 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Mon, 10 Feb 2025 16:58:52 +0800
Subject: [PATCH] feat(cli): add command line interface with comprehensive
 features

Implements a full-featured CLI for Crawl4AI with the following capabilities:
- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:
- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
---
 crawl4ai/async_configs.py         |   6 +-
 crawl4ai/cli.py                   | 404 ++++++++++++++++++++++++++++++
 crawl4ai/config.py                |   2 +
 crawl4ai/extraction_strategy.py   |  14 +-
 crawl4ai/install.py               |  33 +++
 docs/examples/cli/browser.yml     |  13 +
 docs/examples/cli/crawler.yml     |  13 +
 docs/examples/cli/css_schema.json |  27 ++
 docs/examples/cli/extract.yml     |  11 +
 docs/examples/cli/extract_css.yml |   3 +
 docs/examples/cli/llm_schema.json |  26 ++
 docs/md_v2/core/cli.md            | 304 ++++++++++++++++++++++
 mkdocs.yml                        |   1 +
 tests/cli/test_cli.py             | 133 ++++++++++
 14 files changed, 983 insertions(+), 7 deletions(-)
 create mode 100644 docs/examples/cli/browser.yml
 create mode 100644 docs/examples/cli/crawler.yml
 create mode 100644 docs/examples/cli/css_schema.json
 create mode 100644 docs/examples/cli/extract.yml
 create mode 100644 docs/examples/cli/extract_css.yml
 create mode 100644 docs/examples/cli/llm_schema.json
 create mode 100644 docs/md_v2/core/cli.md
 create mode 100644 tests/cli/test_cli.py

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 6c3b294d..b60cc468 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -361,7 +361,8 @@ class BrowserConfig():
     @staticmethod
     def load( data: dict) -> "BrowserConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else BrowserConfig()
+        config = from_serializable_dict(data)
+        return BrowserConfig.from_kwargs(config)


 class CrawlerRunConfig():
@@ -811,7 +812,8 @@ class CrawlerRunConfig():
     @staticmethod
     def load(data: dict) -> "CrawlerRunConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else CrawlerRunConfig()
+        config = from_serializable_dict(data)
+        return CrawlerRunConfig.from_kwargs(config)

     def to_dict(self):
         return {
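The reworked `load()` pairs with the configs' `dump()` (used later in the CLI's `run_crawler`) to round-trip configurations through plain dictionaries, which is what lets the CLI hydrate `BrowserConfig` and `CrawlerRunConfig` from YAML/JSON files. A minimal sketch of the intended round-trip, assuming `dump()` emits the serializable dict that `from_serializable_dict` consumes:

```python
from crawl4ai import BrowserConfig

cfg = BrowserConfig(headless=True, viewport_width=1280)
data = cfg.dump()                    # plain dict, safe to write out as YAML/JSON
restored = BrowserConfig.load(data)  # from_serializable_dict + from_kwargs
assert restored.viewport_width == 1280
```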
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index e69de29b..d1fb60d2 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -0,0 +1,404 @@
+import click
+import os
+from typing import Dict, Any, Optional
+import json
+import yaml
+import anyio
+from crawl4ai import (
+    CacheMode,
+    AsyncWebCrawler,
+    CrawlResult,
+    BrowserConfig,
+    CrawlerRunConfig,
+    LLMExtractionStrategy,
+    JsonCssExtractionStrategy,
+    JsonXPathExtractionStrategy,
+    BM25ContentFilter,
+    PruningContentFilter
+)
+from litellm import completion
+from pathlib import Path
+
+def get_global_config() -> dict:
+    config_dir = Path.home() / ".crawl4ai"
+    config_file = config_dir / "global.yml"
+
+    if not config_file.exists():
+        config_dir.mkdir(parents=True, exist_ok=True)
+        return {}
+
+    with open(config_file) as f:
+        return yaml.safe_load(f) or {}
+
+def save_global_config(config: dict):
+    config_file = Path.home() / ".crawl4ai" / "global.yml"
+    with open(config_file, "w") as f:
+        yaml.dump(config, f)
+
+def setup_llm_config() -> tuple[str, str]:
+    config = get_global_config()
+    provider = config.get("DEFAULT_LLM_PROVIDER")
+    token = config.get("DEFAULT_LLM_PROVIDER_TOKEN")
+
+    if not provider:
+        click.echo("\nNo default LLM provider configured.")
+        click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
+        click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
+        provider = click.prompt("Enter provider")
+
+    if not provider.startswith("ollama/"):
+        if not token:
+            token = click.prompt("Enter API token for " + provider, hide_input=True)
+    else:
+        token = "no-token"
+
+    if not config.get("DEFAULT_LLM_PROVIDER") or not config.get("DEFAULT_LLM_PROVIDER_TOKEN"):
+        config["DEFAULT_LLM_PROVIDER"] = provider
+        config["DEFAULT_LLM_PROVIDER_TOKEN"] = token
+        save_global_config(config)
+        click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")
+
+    return provider, token
+
+async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
+    response = completion(
+        model=provider,
+        api_key=token,
+        messages=[
+            {
+                "content": f"You are the Crawl4ai assistant, answering the user's question based on the provided context, which was crawled from {url}.",
+                "role": "system"
+            },
+            {
+                "content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
+                "role": "user"
+            },
+        ],
+        stream=True,
+    )
+
+    for chunk in response:
+        if content := chunk["choices"][0]["delta"].get("content"):
+            print(content, end="", flush=True)
+    print()  # New line at end
+
+def split_pairs(value: str) -> list:
+    """Split on commas that sit outside [...] or {...} so list/JSON values survive."""
+    pairs, depth, current = [], 0, ""
+    for ch in value:
+        if ch in "[{":
+            depth += 1
+        elif ch in "]}":
+            depth -= 1
+        if ch == "," and depth == 0:
+            pairs.append(current)
+            current = ""
+        else:
+            current += ch
+    if current:
+        pairs.append(current)
+    return pairs
+
+def parse_key_values(ctx, param, value) -> Dict[str, Any]:
+    if not value:
+        return {}
+    result = {}
+    for pair in split_pairs(value):
+        try:
+            k, v = pair.split('=', 1)
+            # Handle common value types
+            if v.lower() == 'true': v = True
+            elif v.lower() == 'false': v = False
+            elif v.isdigit(): v = int(v)
+            elif v.replace('.', '', 1).isdigit(): v = float(v)
+            elif v.startswith('[') and v.endswith(']'):
+                v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
+            elif v.startswith('{') and v.endswith('}'):
+                try:
+                    v = json.loads(v)
+                except json.JSONDecodeError:
+                    raise click.BadParameter(f'Invalid JSON object: {v}')
+            result[k.strip()] = v
+        except ValueError:
+            raise click.BadParameter(f'Invalid key=value pair: {pair}')
+    return result
+
+def load_config_file(path: Optional[str]) -> dict:
+    if not path:
+        return {}
+
+    try:
+        with open(path) as f:
+            if path.endswith((".yaml", ".yml")):
+                return yaml.safe_load(f)
+            return json.load(f)
+    except Exception as e:
+        raise click.BadParameter(f'Error loading config file {path}: {str(e)}')
+
+def load_schema_file(path: Optional[str]) -> Optional[dict]:
+    if not path:
+        return None
+    return load_config_file(path)
+
+async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
+    if verbose:
+        click.echo("Starting crawler with configurations:")
+        click.echo(f"Browser config: {browser_cfg.dump()}")
+        click.echo(f"Crawler config: {crawler_cfg.dump()}")
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        try:
+            result = await crawler.arun(url=url, config=crawler_cfg)
+            return result
+        except Exception as e:
+            raise click.ClickException(f"Crawling failed: {str(e)}")
+
+def show_examples():
+    examples = """
+🚀 Crawl4AI CLI Examples
+
+1️⃣ Basic Usage:
+    # Simple crawl with default settings
+    crwl https://example.com
+
+    # Get markdown output
+    crwl https://example.com -o markdown
+
+    # Verbose JSON output with cache bypass
+    crwl https://example.com -o json -v --bypass-cache
+
+2️⃣ Using Config Files:
+    # Using browser and crawler configs
+    crwl https://example.com -B browser.yml -C crawler.yml
+
+    # CSS-based extraction
+    crwl https://example.com -e extract_css.yml -s css_schema.json -o json
+
+    # LLM-based extraction
+    crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
+
+3️⃣ Direct Parameters:
+    # Browser settings
+    crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+
+    # Crawler settings
+    crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+
+4️⃣ Sample Config Files:
+
+browser.yml:
+    headless: true
+    viewport_width: 1280
+    user_agent_mode: "random"
+    verbose: true
+    ignore_https_errors: true
+
+extract_css.yml:
+    type: "json-css"
+    params:
+        verbose: true
+
+css_schema.json:
+    {
+      "name": "ArticleExtractor",
+      "baseSelector": ".article",
+      "fields": [
+        {
+          "name": "title",
+          "selector": "h1.title",
+          "type": "text"
+        },
+        {
+          "name": "link",
+          "selector": "a.read-more",
+          "type": "attribute",
+          "attribute": "href"
+        }
+      ]
+    }
+
+extract_llm.yml:
+    type: "llm"
+    provider: "openai/gpt-4"
+    instruction: "Extract all articles with their titles and links"
+    api_token: "your-token"
+    params:
+        temperature: 0.3
+        max_tokens: 1000
+
+llm_schema.json:
+    {
+      "title": "Article",
+      "type": "object",
+      "properties": {
+        "title": {
+          "type": "string",
+          "description": "The title of the article"
+        },
+        "link": {
+          "type": "string",
+          "description": "URL to the full article"
+        }
+      }
+    }
+
+5️⃣ Advanced Usage:
+    # Combine configs with direct parameters
+    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
+
+    # Full extraction pipeline
+    crwl https://example.com \\
+        -B browser.yml \\
+        -C crawler.yml \\
+        -e extract_llm.yml \\
+        -s llm_schema.json \\
+        -o json \\
+        -v
+
+    # Content filtering with BM25
+    crwl https://example.com \\
+        -f filter_bm25.yml \\
+        -o markdown-fit
+
+6️⃣ Q&A with LLM:
+    # Ask a question about the content
+    crwl https://example.com -q "What is the main topic discussed?"
+
+    # First view content, then ask questions
+    crwl https://example.com -o markdown  # See the crawled content first
+    crwl https://example.com -q "Summarize the key points"
+    crwl https://example.com -q "What are the conclusions?"
+
+    # Advanced crawling with Q&A
+    crwl https://example.com \\
+        -B browser.yml \\
+        -c "css_selector=article,scan_full_page=true" \\
+        -q "What are the pros and cons mentioned?"
+
+    Note: The first use of -q prompts for an LLM provider and API token.
+    These are saved in ~/.crawl4ai/global.yml for future use.
+
+    Supported provider format: 'company/model'
+    Examples:
+    - ollama/llama3.3
+    - openai/gpt-4
+    - anthropic/claude-3-sonnet
+    - cohere/command
+    - google/gemini-pro
+
+    See the full list of providers: https://docs.litellm.ai/docs/providers
+
+For more documentation visit: https://github.com/unclecode/crawl4ai
+"""
+    click.echo(examples)
+
+@click.command(context_settings={"help_option_names": ["-h", "--help"]})
+@click.argument("url", required=False)
+@click.option("--example", is_flag=True, help="Show usage examples")
+@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
+@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
+@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
+@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
+@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
+@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
+@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all", help="Output format")
+@click.option("--bypass-cache", is_flag=True, default=False, help="Bypass cache when crawling")
+@click.option("--question", "-q", help="Ask a question about the crawled content")
+@click.option("--verbose", "-v", is_flag=True)
+def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
+        extraction_config: str, schema: str, browser: Dict, crawler: Dict,
+        output: str, bypass_cache: bool, question: str, verbose: bool):
+    """Crawl4AI CLI - Web content extraction tool
+
+    Simple Usage:
+        crwl https://example.com
+
+    Run with --example to see detailed usage examples."""
+
+    if example:
+        show_examples()
+        return
+
+    if not url:
+        raise click.UsageError("URL argument is required unless using --example")
+
+    try:
+        # Load base configurations
+        browser_cfg = BrowserConfig.load(load_config_file(browser_config))
+        crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))
+
+        # Override with CLI params
+        if browser:
+            browser_cfg = browser_cfg.clone(**browser)
+        if crawler:
+            crawler_cfg = crawler_cfg.clone(**crawler)
+
+        # Handle content filter config
+        if filter_config:
+            filter_conf = load_config_file(filter_config)
+            if filter_conf["type"] == "bm25":
+                crawler_cfg.content_filter = BM25ContentFilter(
+                    user_query=filter_conf.get("query"),
+                    bm25_threshold=filter_conf.get("threshold", 1.0)
+                )
+            elif filter_conf["type"] == "pruning":
+                crawler_cfg.content_filter = PruningContentFilter(
+                    user_query=filter_conf.get("query"),
+                    threshold=filter_conf.get("threshold", 0.48)
+                )
+
+        # Handle extraction strategy
+        if extraction_config:
+            extract_conf = load_config_file(extraction_config)
+            schema_data = load_schema_file(schema)
+
+            # Validate the extraction type before building a strategy
+            if not extract_conf.get("type"):
+                raise click.ClickException("Extraction type not specified")
+            if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
+                raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")
+
+            if extract_conf["type"] == "llm":
+                # LLM extraction requires both a provider and an API token
+                if not extract_conf.get("provider") or not extract_conf.get("api_token"):
+                    raise click.ClickException("LLM provider and API token are required for LLM extraction")
+
+                crawler_cfg.extraction_strategy = LLMExtractionStrategy(
+                    provider=extract_conf["provider"],
+                    instruction=extract_conf["instruction"],
+                    api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
+                    schema=schema_data,
+                    **extract_conf.get("params", {})
+                )
+            elif extract_conf["type"] == "json-css":
+                crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
+                    schema=schema_data
+                )
+            elif extract_conf["type"] == "json-xpath":
+                crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
+                    schema=schema_data
+                )
+
+        # No cache
+        if bypass_cache:
+            crawler_cfg.cache_mode = CacheMode.BYPASS
+
+        # Run crawler
+        result: CrawlResult = anyio.run(
+            run_crawler,
+            url,
+            browser_cfg,
+            crawler_cfg,
+            verbose
+        )
+
+        # Handle question
+        if question:
+            provider, token = setup_llm_config()
+            markdown = result.markdown_v2.raw_markdown
+            anyio.run(stream_llm_response, url, markdown, question, provider, token)
+            return
+
+        # Handle output
+        if output == "all":
+            click.echo(json.dumps(result.model_dump(), indent=2))
+        elif output == "json":
+            click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
+        elif output in ["markdown", "md"]:
+            click.echo(result.markdown_v2.raw_markdown)
+        elif output in ["markdown-fit", "md-fit"]:
+            click.echo(result.markdown_v2.fit_markdown)
+
+    except Exception as e:
+        raise click.ClickException(str(e))
+
+if __name__ == "__main__":
+    cli()
\ No newline at end of file
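Taken together, the `-b`/`-c` overrides are just typed key=value dicts applied on top of the loaded configs with `clone()`. A minimal sketch of that path, calling the callback directly the way the test suite below does (the literal values are illustrative):

```python
from crawl4ai import BrowserConfig
from crawl4ai.cli import parse_key_values

# "-b headless=true,viewport_width=1920" arrives as a raw string;
# the callback turns it into typed Python values...
overrides = parse_key_values(None, None, "headless=true,viewport_width=1920")
assert overrides == {"headless": True, "viewport_width": 1920}

# ...which clone() then layers onto whatever the YAML/JSON config produced.
cfg = BrowserConfig(headless=False).clone(**overrides)
```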
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 3e26514a..51fe4434 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -15,6 +15,8 @@ PROVIDER_MODELS = {
     "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
     "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
     "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini-high": os.getenv("OPENAI_API_KEY"),
     "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 8b9dc9cf..90b726bc 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -533,11 +533,15 @@ class LLMExtractionStrategy(ExtractionStrategy):
         """
         super().__init__(**kwargs)
         self.provider = provider
-        self.api_token = (
-            api_token
-            or PROVIDER_MODELS.get(provider, "no-token")
-            or os.getenv("OPENAI_API_KEY")
-        )
+        if api_token and not api_token.startswith("env:"):
+            self.api_token = api_token
+        elif api_token and api_token.startswith("env:"):
+            self.api_token = os.getenv(api_token[4:])
+        else:
+            self.api_token = (
+                PROVIDER_MODELS.get(provider, "no-token")
+                or os.getenv("OPENAI_API_KEY")
+            )
         self.instruction = instruction
         self.extract_type = extraction_type
         self.schema = schema
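The `env:` prefix gives config files a way to reference tokens without embedding them; the example files added below rely on it (`api_token: "env:OPENAI_API_KEY"`). A standalone rewrite of the resolution rules the branch above implements, for illustration only (not the library code itself):

```python
import os

def resolve_api_token(api_token, provider, provider_models):
    if api_token and not api_token.startswith("env:"):
        return api_token                    # literal token wins
    if api_token and api_token.startswith("env:"):
        return os.getenv(api_token[4:])     # "env:OPENAI_API_KEY" -> $OPENAI_API_KEY
    # fall back to the per-provider defaults, then the OpenAI env var
    return provider_models.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")

assert resolve_api_token("sk-literal", "openai/gpt-4o", {}) == "sk-literal"
os.environ["MY_KEY"] = "sk-from-env"
assert resolve_api_token("env:MY_KEY", "openai/gpt-4o", {}) == "sk-from-env"
```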
diff --git a/crawl4ai/install.py b/crawl4ai/install.py
index 139be591..05179a71 100644
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -2,14 +2,47 @@ import subprocess
 import sys
 import asyncio
 from .async_logger import AsyncLogger, LogLevel
+from pathlib import Path
+import os
+import shutil

 # Initialize logger
 logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

+def setup_home_directory():
+    """Set up the .crawl4ai folder structure in the user's home directory."""
+    base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
+    crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
+    crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
+    crawl4ai_config = crawl4ai_folder / "global.yml"
+    cache_folder = crawl4ai_folder / "cache"
+    content_folders = [
+        "html_content",
+        "cleaned_html",
+        "markdown_content",
+        "extracted_content",
+        "screenshots",
+    ]
+
+    # Clean up old cache if it exists
+    if cache_folder.exists():
+        shutil.rmtree(cache_folder)
+
+    # Create new folder structure
+    crawl4ai_folder.mkdir(exist_ok=True)
+    cache_folder.mkdir(exist_ok=True)
+    for folder in content_folders:
+        (crawl4ai_folder / folder).mkdir(exist_ok=True)
+
+    # If the config file does not exist, create it
+    if not crawl4ai_config.exists():
+        with open(crawl4ai_config, "w") as f:
+            f.write("")
+
 def post_install():
     """Run all post-installation tasks"""
     logger.info("Running post-installation setup...", tag="INIT")
+    setup_home_directory()
     install_playwright()
     run_migration()
     logger.success("Post-installation setup completed!", tag="COMPLETE")
diff --git a/docs/examples/cli/browser.yml b/docs/examples/cli/browser.yml
new file mode 100644
index 00000000..dd6caf60
--- /dev/null
+++ b/docs/examples/cli/browser.yml
@@ -0,0 +1,13 @@
+browser_type: "chromium"
+headless: true
+viewport_width: 1280
+viewport_height: 800
+user_agent_mode: "random"
+verbose: true
+text_mode: false
+light_mode: false
+ignore_https_errors: true
+java_script_enabled: true
+extra_args:
+  - "--disable-gpu"
+  - "--no-sandbox"
\ No newline at end of file
diff --git a/docs/examples/cli/crawler.yml b/docs/examples/cli/crawler.yml
new file mode 100644
index 00000000..61bd6670
--- /dev/null
+++ b/docs/examples/cli/crawler.yml
@@ -0,0 +1,13 @@
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+exclude_external_links: true
+exclude_social_media_links: true
\ No newline at end of file
diff --git a/docs/examples/cli/css_schema.json b/docs/examples/cli/css_schema.json
new file mode 100644
index 00000000..935efeb8
--- /dev/null
+++ b/docs/examples/cli/css_schema.json
@@ -0,0 +1,27 @@
+{
+    "name": "ArticleExtractor",
+    "baseSelector": ".cards[data-tax=news] .card__data",
+    "fields": [
+        {
+            "name": "title",
+            "selector": "h4.card__title",
+            "type": "text"
+        },
+        {
+            "name": "link",
+            "selector": "h4.card__title a",
+            "type": "attribute",
+            "attribute": "href"
+        },
+        {
+            "name": "details",
+            "selector": ".card__details",
+            "type": "text"
+        },
+        {
+            "name": "topics",
+            "selector": ".card__topics.topics",
+            "type": "text"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/docs/examples/cli/extract.yml b/docs/examples/cli/extract.yml
new file mode 100644
index 00000000..be22dd5e
--- /dev/null
+++ b/docs/examples/cli/extract.yml
@@ -0,0 +1,11 @@
+type: "llm"
+provider: "openai/gpt-4o-mini"
+api_token: "env:OPENAI_API_KEY"
+instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
+params:
+  chunk_token_threshold: 4096
+  overlap_rate: 0.1
+  word_token_rate: 0.75
+  temperature: 0.3
+  max_tokens: 1000
+  verbose: true
\ No newline at end of file
diff --git a/docs/examples/cli/extract_css.yml b/docs/examples/cli/extract_css.yml
new file mode 100644
index 00000000..a4004a3e
--- /dev/null
+++ b/docs/examples/cli/extract_css.yml
@@ -0,0 +1,3 @@
+type: "json-css"
+params:
+  verbose: true
\ No newline at end of file
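css_schema.json above has the same shape that `JsonCssExtractionStrategy` consumes directly, so the CLI's `-e extract_css.yml -s css_schema.json` path boils down to roughly this sketch (the file path and URL handling are illustrative):

```python
import json
from crawl4ai import JsonCssExtractionStrategy, CrawlerRunConfig

with open("docs/examples/cli/css_schema.json") as f:
    schema = json.load(f)

# The CLI attaches the strategy to the run config before crawling.
config = CrawlerRunConfig(
    extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
# config is then passed to AsyncWebCrawler.arun(url=..., config=config)
```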
diff --git a/docs/examples/cli/llm_schema.json b/docs/examples/cli/llm_schema.json
new file mode 100644
index 00000000..a6969ccd
--- /dev/null
+++ b/docs/examples/cli/llm_schema.json
@@ -0,0 +1,26 @@
+{
+    "title": "NewsArticle",
+    "type": "object",
+    "properties": {
+        "title": {
+            "type": "string",
+            "description": "The title/headline of the news article"
+        },
+        "link": {
+            "type": "string",
+            "description": "The URL or link to the full article"
+        },
+        "details": {
+            "type": "string",
+            "description": "Brief summary or details about the article content"
+        },
+        "topics": {
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "description": "List of topics or categories associated with the article"
+        }
+    },
+    "required": ["title", "details"]
+}
\ No newline at end of file
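llm_schema.json is a plain JSON Schema handed to `LLMExtractionStrategy`; combined with extract.yml above, the CLI path is roughly equivalent to this sketch (provider and `env:` token values mirror the example files; the instruction is illustrative):

```python
import json
from crawl4ai import LLMExtractionStrategy, CrawlerRunConfig

with open("docs/examples/cli/llm_schema.json") as f:
    schema = json.load(f)

strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token="env:OPENAI_API_KEY",  # resolved via the env: branch added above
    instruction="Extract all articles with their titles and topics",
    schema=schema,
)
config = CrawlerRunConfig(extraction_strategy=strategy)
```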
diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md
new file mode 100644
index 00000000..ff4bf658
--- /dev/null
+++ b/docs/md_v2/core/cli.md
@@ -0,0 +1,304 @@
+# Crawl4AI CLI Guide
+
+## Table of Contents
+- [Basic Usage](#basic-usage)
+- [Quick Example of Advanced Usage](#quick-example-of-advanced-usage)
+- [Configuration](#configuration)
+  - [Browser Configuration](#browser-configuration)
+  - [Crawler Configuration](#crawler-configuration)
+  - [Extraction Configuration](#extraction-configuration)
+- [Advanced Features](#advanced-features)
+  - [LLM Q&A](#llm-qa)
+  - [Structured Data Extraction](#structured-data-extraction)
+  - [Content Filtering](#content-filtering)
+- [Output Formats](#output-formats)
+- [Complete Examples](#complete-examples)
+- [Best Practices & Tips](#best-practices--tips)
+
+## Basic Usage
+
+The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
+
+```bash
+# Basic crawling
+crwl https://example.com
+
+# Get markdown output
+crwl https://example.com -o markdown
+
+# Verbose JSON output with cache bypass
+crwl https://example.com -o json -v --bypass-cache
+
+# See usage examples
+crwl --example
+```
+
+## Quick Example of Advanced Usage
+
+If you clone the repository, the following command returns the page content as JSON, structured according to the JSON-CSS schema:
+
+```bash
+crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json
+```
+
+## Configuration
+
+### Browser Configuration
+
+Browser settings can be configured via a YAML file or command line parameters:
+
+```yaml
+# browser.yml
+headless: true
+viewport_width: 1280
+user_agent_mode: "random"
+verbose: true
+ignore_https_errors: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -B browser.yml
+
+# Using direct parameters
+crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+```
+
+### Crawler Configuration
+
+Control crawling behavior:
+
+```yaml
+# crawler.yml
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -C crawler.yml
+
+# Using direct parameters
+crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+```
+
+### Extraction Configuration
+
+Two types of extraction are supported:
+
+1. CSS/XPath-based extraction:
+```yaml
+# extract_css.yml
+type: "json-css"
+params:
+  verbose: true
+```
+
+```json
+// css_schema.json
+{
+  "name": "ArticleExtractor",
+  "baseSelector": ".article",
+  "fields": [
+    {
+      "name": "title",
+      "selector": "h1.title",
+      "type": "text"
+    },
+    {
+      "name": "link",
+      "selector": "a.read-more",
+      "type": "attribute",
+      "attribute": "href"
+    }
+  ]
+}
```

+
+2. LLM-based extraction:
+```yaml
+# extract_llm.yml
+type: "llm"
+provider: "openai/gpt-4"
+instruction: "Extract all articles with their titles and links"
+api_token: "your-token"
+params:
+  temperature: 0.3
+  max_tokens: 1000
+```
+
+```json
+// llm_schema.json
+{
+  "title": "Article",
+  "type": "object",
+  "properties": {
+    "title": {
+      "type": "string",
+      "description": "The title of the article"
+    },
+    "link": {
+      "type": "string",
+      "description": "URL to the full article"
+    }
+  }
+}
+```
+
+## Advanced Features
+
+### LLM Q&A
+
+Ask questions about crawled content:
+
+```bash
+# Simple question
+crwl https://example.com -q "What is the main topic discussed?"
+
+# View content then ask questions
+crwl https://example.com -o markdown  # See content first
+crwl https://example.com -q "Summarize the key points"
+crwl https://example.com -q "What are the conclusions?"
+
+# Combined with advanced crawling
+crwl https://example.com \
+    -B browser.yml \
+    -c "css_selector=article,scan_full_page=true" \
+    -q "What are the pros and cons mentioned?"
+```
+
+First-time setup:
+- Prompts for LLM provider and API token
+- Saves configuration in `~/.crawl4ai/global.yml`
+- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
+- `ollama` providers do not require an API token
+- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for the full list
+
+### Structured Data Extraction
+
+Extract structured data using CSS selectors:
+
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json
+```
+
+Or using LLM-based extraction:
+
+```bash
+crwl https://example.com \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -o json
+```
+
+### Content Filtering
+
+Filter content for relevance:
+
+```yaml
+# filter_bm25.yml
+type: "bm25"
+query: "target content"
+threshold: 1.0
+
+# filter_pruning.yml
+type: "pruning"
+query: "focus topic"
+threshold: 0.48
+```
+
+```bash
+crwl https://example.com -f filter_bm25.yml -o markdown-fit
+```
+
+## Output Formats
+
+- `all` - Full crawl result including metadata
+- `json` - Extracted structured data (when using extraction)
+- `markdown` / `md` - Raw markdown output
+- `markdown-fit` / `md-fit` - Filtered markdown for better readability
+
+## Complete Examples
+
+1. Basic Extraction:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -C crawler.yml \
+    -o json
+```
+
+2. Structured Data Extraction:
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json \
+    -v
+```
+
+3. LLM Extraction with Filtering:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -f filter_bm25.yml \
+    -o json
+```
+
+4. Interactive Q&A:
+```bash
+# First crawl and view
+crwl https://example.com -o markdown
+
+# Then ask questions
+crwl https://example.com -q "What are the main points?"
+crwl https://example.com -q "Summarize the conclusions"
+```
+
+## Best Practices & Tips
+
+1. **Configuration Management**:
+   - Keep common configurations in YAML files
+   - Use CLI parameters for quick overrides
+   - Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
+
+2. **Performance Optimization**:
+   - Use `--bypass-cache` for fresh content
+   - Enable `scan_full_page` for infinite scroll pages
+   - Adjust `delay_before_return_html` for dynamic content
+
+3. **Content Extraction**:
+   - Use CSS extraction for structured content
+   - Use LLM extraction for unstructured content
+   - Combine with filters for focused results
+
+4. **Q&A Workflow**:
+   - View content first with `-o markdown`
+   - Ask specific questions
+   - Use broader context with appropriate selectors
+
+## Recap
+
+The Crawl4AI CLI provides:
+- Flexible configuration via files and parameters
+- Multiple extraction strategies (CSS, XPath, LLM)
+- Content filtering and optimization
+- Interactive Q&A capabilities
+- Various output formats
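For readers mapping CLI flags onto the library, a rough Python equivalent of `crwl https://example.com -B browser.yml -C crawler.yml -o markdown` under the configs documented above (file names are the example files; error handling omitted, and the `load()` path is the one reworked in async_configs.py):

```python
import anyio
import yaml
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    with open("browser.yml") as f:
        browser_cfg = BrowserConfig.load(yaml.safe_load(f))
    with open("crawler.yml") as f:
        crawler_cfg = CrawlerRunConfig.load(yaml.safe_load(f))

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url="https://example.com", config=crawler_cfg)
        print(result.markdown_v2.raw_markdown)

anyio.run(main)
```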
diff --git a/mkdocs.yml b/mkdocs.yml
index 16f44b05..7b2f18c6 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,6 +15,7 @@ nav:
     - "Blog Home": "blog/index.md"
     - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
   - Core:
+    - "Command Line Interface": "core/cli.md"
     - "Simple Crawling": "core/simple-crawling.md"
     - "Crawler Result": "core/crawler-result.md"
     - "Browser & Crawler Config": "core/browser-crawler-config.md"
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
new file mode 100644
index 00000000..b7416dc2
--- /dev/null
+++ b/tests/cli/test_cli.py
@@ -0,0 +1,133 @@
+import pytest
+from click.testing import CliRunner
+from pathlib import Path
+import json
+import yaml
+from crawl4ai.cli import cli, load_config_file, parse_key_values
+import tempfile
+import os
+import click
+
+@pytest.fixture
+def runner():
+    return CliRunner()
+
+@pytest.fixture
+def temp_config_dir():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        old_home = os.environ.get('HOME')
+        os.environ['HOME'] = tmpdir
+        yield Path(tmpdir)
+        if old_home:
+            os.environ['HOME'] = old_home
+        else:
+            del os.environ['HOME']
+
+@pytest.fixture
+def sample_configs(temp_config_dir):
+    configs = {
+        'browser.yml': {
+            'headless': True,
+            'viewport_width': 1280,
+            'user_agent_mode': 'random'
+        },
+        'crawler.yml': {
+            'cache_mode': 'bypass',
+            'wait_until': 'networkidle',
+            'scan_full_page': True
+        },
+        'extract_css.yml': {
+            'type': 'json-css',
+            'params': {'verbose': True}
+        },
+        'css_schema.json': {
+            'name': 'ArticleExtractor',
+            'baseSelector': '.article',
+            'fields': [
+                {'name': 'title', 'selector': 'h1.title', 'type': 'text'},
+                {'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
+            ]
+        }
+    }
+
+    for filename, content in configs.items():
+        path = temp_config_dir / filename
+        with open(path, 'w') as f:
+            if filename.endswith('.yml'):
+                yaml.dump(content, f)
+            else:
+                json.dump(content, f)
+
+    return {name: str(temp_config_dir / name) for name in configs}
+
+class TestCLIBasics:
+    def test_help(self, runner):
+        result = runner.invoke(cli, ['--help'])
+        assert result.exit_code == 0
+        assert 'Crawl4AI CLI' in result.output
+
+    def test_examples(self, runner):
+        result = runner.invoke(cli, ['--example'])
+        assert result.exit_code == 0
+        assert 'Examples' in result.output
+
+    def test_missing_url(self, runner):
+        result = runner.invoke(cli)
+        assert result.exit_code != 0
+        assert 'URL argument is required' in result.output
+
+class TestConfigParsing:
+    def test_parse_key_values_basic(self):
+        result = parse_key_values(None, None, "key1=value1,key2=true")
+        assert result == {'key1': 'value1', 'key2': True}
+
+    def test_parse_key_values_invalid(self):
+        with pytest.raises(click.BadParameter):
+            parse_key_values(None, None, "invalid_format")
+
+class TestConfigLoading:
+    def test_load_yaml_config(self, sample_configs):
+        config = load_config_file(sample_configs['browser.yml'])
+        assert config['headless'] is True
+        assert config['viewport_width'] == 1280
+
+    def test_load_json_config(self, sample_configs):
+        config = load_config_file(sample_configs['css_schema.json'])
+        assert config['name'] == 'ArticleExtractor'
+        assert len(config['fields']) == 2
+
+    def test_load_nonexistent_config(self):
+        with pytest.raises(click.BadParameter):
+            load_config_file('nonexistent.yml')
+
+class TestLLMConfig:
+    def test_llm_config_creation(self, temp_config_dir, runner):
+        def input_simulation(inputs):
+            return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
+                                 input='\n'.join(inputs))
+
+class TestCrawlingFeatures:
+    def test_basic_crawl(self, runner):
+        result = runner.invoke(cli, ['https://example.com'])
+        assert result.exit_code == 0
+
+class TestErrorHandling:
+    def test_invalid_config_file(self, runner):
+        result = runner.invoke(cli, [
+            'https://example.com',
+            '--browser-config', 'nonexistent.yml'
+        ])
+        assert result.exit_code != 0
+
+    def test_invalid_schema(self, runner, temp_config_dir):
+        invalid_schema = temp_config_dir / 'invalid_schema.json'
+        with open(invalid_schema, 'w') as f:
+            f.write('invalid json')
+
+        result = runner.invoke(cli, [
+            'https://example.com',
+            '--schema', str(invalid_schema)
+        ])
+        assert result.exit_code != 0
+
+if __name__ == '__main__':
+    pytest.main(['-v', '-s', '--tb=native', __file__])
\ No newline at end of file
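One case worth covering beyond the tests above: list values that contain commas, which depend on the bracket-aware splitting in `parse_key_values`. A sketch of such a test in the file's existing style (a hypothetical addition, not part of the patch itself):

```python
# Hypothetical extra case for TestConfigParsing: a bracketed list value
# must survive the comma split and come back as a Python list.
from crawl4ai.cli import parse_key_values

def test_parse_key_values_list():
    result = parse_key_values(
        None, None, "headless=true,extra_args=[--disable-gpu,--no-sandbox]"
    )
    assert result["headless"] is True
    assert result["extra_args"] == ["--disable-gpu", "--no-sandbox"]
```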