From 91a5fea11fc21942c32b55517391f178b47bbe73 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Mon, 10 Feb 2025 16:58:52 +0800
Subject: [PATCH] feat(cli): add command line interface with comprehensive
 features

Implements a full-featured CLI for Crawl4AI with the following capabilities:
- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:
- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
---
 crawl4ai/async_configs.py         |   6 +-
 crawl4ai/cli.py                   | 404 ++++++++++++++++++++++++++++++
 crawl4ai/config.py                |   2 +
 crawl4ai/extraction_strategy.py   |  14 +-
 crawl4ai/install.py               |  33 +++
 docs/examples/cli/browser.yml     |  13 +
 docs/examples/cli/crawler.yml     |  13 +
 docs/examples/cli/css_schema.json |  27 ++
 docs/examples/cli/extract.yml     |  11 +
 docs/examples/cli/extract_css.yml |   3 +
 docs/examples/cli/llm_schema.json |  26 ++
 docs/md_v2/core/cli.md            | 304 ++++++++++++++++++++++
 mkdocs.yml                        |   1 +
 tests/cli/test_cli.py             | 133 ++++++++++
 14 files changed, 983 insertions(+), 7 deletions(-)
 create mode 100644 docs/examples/cli/browser.yml
 create mode 100644 docs/examples/cli/crawler.yml
 create mode 100644 docs/examples/cli/css_schema.json
 create mode 100644 docs/examples/cli/extract.yml
 create mode 100644 docs/examples/cli/extract_css.yml
 create mode 100644 docs/examples/cli/llm_schema.json
 create mode 100644 docs/md_v2/core/cli.md
 create mode 100644 tests/cli/test_cli.py

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 6c3b294d..b60cc468 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -361,7 +361,8 @@ class BrowserConfig():
     @staticmethod
     def load( data: dict) -> "BrowserConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else BrowserConfig()
+        config = from_serializable_dict(data)
+        return BrowserConfig.from_kwargs(config)


 class CrawlerRunConfig():
@@ -811,7 +812,8 @@ class CrawlerRunConfig():
     @staticmethod
     def load(data: dict) -> "CrawlerRunConfig":
         # Deserialize the object from a dictionary
-        return from_serializable_dict(data) if data else CrawlerRunConfig()
+        config = from_serializable_dict(data)
+        return CrawlerRunConfig.from_kwargs(config)

     def to_dict(self):
         return {
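The reworked `load()` pairs with the configs' `dump()` (used later in the CLI's `run_crawler`) to round-trip configurations through plain dictionaries, which is what lets the CLI hydrate `BrowserConfig` and `CrawlerRunConfig` from YAML/JSON files. A minimal sketch of the intended round-trip, assuming `dump()` emits the serializable dict that `from_serializable_dict` consumes:

```python
from crawl4ai import BrowserConfig

cfg = BrowserConfig(headless=True, viewport_width=1280)
data = cfg.dump()                    # plain dict, safe to write out as YAML/JSON
restored = BrowserConfig.load(data)  # from_serializable_dict + from_kwargs
assert restored.viewport_width == 1280
```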
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index e69de29b..d1fb60d2 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -0,0 +1,404 @@
+import click
+import os
+from typing import Dict, Any, Optional
+import json
+import yaml
+import anyio
+from crawl4ai import (
+    CacheMode,
+    AsyncWebCrawler,
+    CrawlResult,
+    BrowserConfig,
+    CrawlerRunConfig,
+    LLMExtractionStrategy,
+    JsonCssExtractionStrategy,
+    JsonXPathExtractionStrategy,
+    BM25ContentFilter,
+    PruningContentFilter
+)
+from litellm import completion
+from pathlib import Path
+
+def get_global_config() -> dict:
+    config_dir = Path.home() / ".crawl4ai"
+    config_file = config_dir / "global.yml"
+
+    if not config_file.exists():
+        config_dir.mkdir(parents=True, exist_ok=True)
+        return {}
+
+    with open(config_file) as f:
+        return yaml.safe_load(f) or {}
+
+def save_global_config(config: dict):
+    config_file = Path.home() / ".crawl4ai" / "global.yml"
+    with open(config_file, "w") as f:
+        yaml.dump(config, f)
+
+def setup_llm_config() -> tuple[str, str]:
+    config = get_global_config()
+    provider = config.get("DEFAULT_LLM_PROVIDER")
+    token = config.get("DEFAULT_LLM_PROVIDER_TOKEN")
+
+    if not provider:
+        click.echo("\nNo default LLM provider configured.")
+        click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
+        click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
+        provider = click.prompt("Enter provider")
+
+    if not provider.startswith("ollama/"):
+        if not token:
+            token = click.prompt("Enter API token for " + provider, hide_input=True)
+    else:
+        token = "no-token"
+
+    if not config.get("DEFAULT_LLM_PROVIDER") or not config.get("DEFAULT_LLM_PROVIDER_TOKEN"):
+        config["DEFAULT_LLM_PROVIDER"] = provider
+        config["DEFAULT_LLM_PROVIDER_TOKEN"] = token
+        save_global_config(config)
+        click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")
+
+    return provider, token
+
+async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
+    response = completion(
+        model=provider,
+        api_key=token,
+        messages=[
+            {
+                "content": f"You are the Crawl4ai assistant, answering the user's question based on the provided context, which was crawled from {url}.",
+                "role": "system"
+            },
+            {
+                "content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
+                "role": "user"
+            },
+        ],
+        stream=True,
+    )
+
+    for chunk in response:
+        if content := chunk["choices"][0]["delta"].get("content"):
+            print(content, end="", flush=True)
+    print()  # New line at end
+
+def split_pairs(value: str) -> list:
+    """Split on commas that sit outside [...] or {...} so list/JSON values survive."""
+    pairs, depth, current = [], 0, ""
+    for ch in value:
+        if ch in "[{":
+            depth += 1
+        elif ch in "]}":
+            depth -= 1
+        if ch == "," and depth == 0:
+            pairs.append(current)
+            current = ""
+        else:
+            current += ch
+    if current:
+        pairs.append(current)
+    return pairs
+
+def parse_key_values(ctx, param, value) -> Dict[str, Any]:
+    if not value:
+        return {}
+    result = {}
+    for pair in split_pairs(value):
+        try:
+            k, v = pair.split('=', 1)
+            # Handle common value types
+            if v.lower() == 'true': v = True
+            elif v.lower() == 'false': v = False
+            elif v.isdigit(): v = int(v)
+            elif v.replace('.', '', 1).isdigit(): v = float(v)
+            elif v.startswith('[') and v.endswith(']'):
+                v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
+            elif v.startswith('{') and v.endswith('}'):
+                try:
+                    v = json.loads(v)
+                except json.JSONDecodeError:
+                    raise click.BadParameter(f'Invalid JSON object: {v}')
+            result[k.strip()] = v
+        except ValueError:
+            raise click.BadParameter(f'Invalid key=value pair: {pair}')
+    return result
+
+def load_config_file(path: Optional[str]) -> dict:
+    if not path:
+        return {}
+
+    try:
+        with open(path) as f:
+            if path.endswith((".yaml", ".yml")):
+                return yaml.safe_load(f)
+            return json.load(f)
+    except Exception as e:
+        raise click.BadParameter(f'Error loading config file {path}: {str(e)}')
+
+def load_schema_file(path: Optional[str]) -> Optional[dict]:
+    if not path:
+        return None
+    return load_config_file(path)
+
+async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
+    if verbose:
+        click.echo("Starting crawler with configurations:")
+        click.echo(f"Browser config: {browser_cfg.dump()}")
+        click.echo(f"Crawler config: {crawler_cfg.dump()}")
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        try:
+            result = await crawler.arun(url=url, config=crawler_cfg)
+            return result
+        except Exception as e:
+            raise click.ClickException(f"Crawling failed: {str(e)}")
+
+def show_examples():
+    examples = """
+🚀 Crawl4AI CLI Examples
+
+1️⃣ Basic Usage:
+    # Simple crawl with default settings
+    crwl https://example.com
+
+    # Get markdown output
+    crwl https://example.com -o markdown
+
+    # Verbose JSON output with cache bypass
+    crwl https://example.com -o json -v --bypass-cache
+
+2️⃣ Using Config Files:
+    # Using browser and crawler configs
+    crwl https://example.com -B browser.yml -C crawler.yml
+
+    # CSS-based extraction
+    crwl https://example.com -e extract_css.yml -s css_schema.json -o json
+
+    # LLM-based extraction
+    crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
+
+3️⃣ Direct Parameters:
+    # Browser settings
+    crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+
+    # Crawler settings
+    crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+
+4️⃣ Sample Config Files:
+
+browser.yml:
+    headless: true
+    viewport_width: 1280
+    user_agent_mode: "random"
+    verbose: true
+    ignore_https_errors: true
+
+extract_css.yml:
+    type: "json-css"
+    params:
+        verbose: true
+
+css_schema.json:
+    {
+      "name": "ArticleExtractor",
+      "baseSelector": ".article",
+      "fields": [
+        {
+          "name": "title",
+          "selector": "h1.title",
+          "type": "text"
+        },
+        {
+          "name": "link",
+          "selector": "a.read-more",
+          "type": "attribute",
+          "attribute": "href"
+        }
+      ]
+    }
+
+extract_llm.yml:
+    type: "llm"
+    provider: "openai/gpt-4"
+    instruction: "Extract all articles with their titles and links"
+    api_token: "your-token"
+    params:
+        temperature: 0.3
+        max_tokens: 1000
+
+llm_schema.json:
+    {
+      "title": "Article",
+      "type": "object",
+      "properties": {
+        "title": {
+          "type": "string",
+          "description": "The title of the article"
+        },
+        "link": {
+          "type": "string",
+          "description": "URL to the full article"
+        }
+      }
+    }
+
+5️⃣ Advanced Usage:
+    # Combine configs with direct parameters
+    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
+
+    # Full extraction pipeline
+    crwl https://example.com \\
+        -B browser.yml \\
+        -C crawler.yml \\
+        -e extract_llm.yml \\
+        -s llm_schema.json \\
+        -o json \\
+        -v
+
+    # Content filtering with BM25
+    crwl https://example.com \\
+        -f filter_bm25.yml \\
+        -o markdown-fit
+
+6️⃣ Q&A with LLM:
+    # Ask a question about the content
+    crwl https://example.com -q "What is the main topic discussed?"
+
+    # First view content, then ask questions
+    crwl https://example.com -o markdown  # See the crawled content first
+    crwl https://example.com -q "Summarize the key points"
+    crwl https://example.com -q "What are the conclusions?"
+
+    # Advanced crawling with Q&A
+    crwl https://example.com \\
+        -B browser.yml \\
+        -c "css_selector=article,scan_full_page=true" \\
+        -q "What are the pros and cons mentioned?"
+
+    Note: The first use of -q prompts for an LLM provider and API token.
+    These are saved in ~/.crawl4ai/global.yml for future use.
+
+    Supported provider format: 'company/model'
+    Examples:
+    - ollama/llama3.3
+    - openai/gpt-4
+    - anthropic/claude-3-sonnet
+    - cohere/command
+    - google/gemini-pro
+
+    See the full list of providers: https://docs.litellm.ai/docs/providers
+
+For more documentation visit: https://github.com/unclecode/crawl4ai
+"""
+    click.echo(examples)
+
+@click.command(context_settings={"help_option_names": ["-h", "--help"]})
+@click.argument("url", required=False)
+@click.option("--example", is_flag=True, help="Show usage examples")
+@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
+@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
+@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
+@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
+@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
+@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
+@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all", help="Output format")
+@click.option("--bypass-cache", is_flag=True, default=False, help="Bypass cache when crawling")
+@click.option("--question", "-q", help="Ask a question about the crawled content")
+@click.option("--verbose", "-v", is_flag=True)
+def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
+        extraction_config: str, schema: str, browser: Dict, crawler: Dict,
+        output: str, bypass_cache: bool, question: str, verbose: bool):
+    """Crawl4AI CLI - Web content extraction tool
+
+    Simple Usage:
+        crwl https://example.com
+
+    Run with --example to see detailed usage examples."""
+
+    if example:
+        show_examples()
+        return
+
+    if not url:
+        raise click.UsageError("URL argument is required unless using --example")
+
+    try:
+        # Load base configurations
+        browser_cfg = BrowserConfig.load(load_config_file(browser_config))
+        crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))
+
+        # Override with CLI params
+        if browser:
+            browser_cfg = browser_cfg.clone(**browser)
+        if crawler:
+            crawler_cfg = crawler_cfg.clone(**crawler)
+
+        # Handle content filter config
+        if filter_config:
+            filter_conf = load_config_file(filter_config)
+            if filter_conf["type"] == "bm25":
+                crawler_cfg.content_filter = BM25ContentFilter(
+                    user_query=filter_conf.get("query"),
+                    bm25_threshold=filter_conf.get("threshold", 1.0)
+                )
+            elif filter_conf["type"] == "pruning":
+                crawler_cfg.content_filter = PruningContentFilter(
+                    user_query=filter_conf.get("query"),
+                    threshold=filter_conf.get("threshold", 0.48)
+                )
+
+        # Handle extraction strategy
+        if extraction_config:
+            extract_conf = load_config_file(extraction_config)
+            schema_data = load_schema_file(schema)
+
+            # Validate the extraction type before building a strategy
+            if not extract_conf.get("type"):
+                raise click.ClickException("Extraction type not specified")
+            if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
+                raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")
+
+            if extract_conf["type"] == "llm":
+                # LLM extraction requires both a provider and an API token
+                if not extract_conf.get("provider") or not extract_conf.get("api_token"):
+                    raise click.ClickException("LLM provider and API token are required for LLM extraction")
+
+                crawler_cfg.extraction_strategy = LLMExtractionStrategy(
+                    provider=extract_conf["provider"],
+                    instruction=extract_conf["instruction"],
+                    api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
+                    schema=schema_data,
+                    **extract_conf.get("params", {})
+                )
+            elif extract_conf["type"] == "json-css":
+                crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
+                    schema=schema_data
+                )
+            elif extract_conf["type"] == "json-xpath":
+                crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
+                    schema=schema_data
+                )
+
+        # No cache
+        if bypass_cache:
+            crawler_cfg.cache_mode = CacheMode.BYPASS
+
+        # Run crawler
+        result: CrawlResult = anyio.run(
+            run_crawler,
+            url,
+            browser_cfg,
+            crawler_cfg,
+            verbose
+        )
+
+        # Handle question
+        if question:
+            provider, token = setup_llm_config()
+            markdown = result.markdown_v2.raw_markdown
+            anyio.run(stream_llm_response, url, markdown, question, provider, token)
+            return
+
+        # Handle output
+        if output == "all":
+            click.echo(json.dumps(result.model_dump(), indent=2))
+        elif output == "json":
+            click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
+        elif output in ["markdown", "md"]:
+            click.echo(result.markdown_v2.raw_markdown)
+        elif output in ["markdown-fit", "md-fit"]:
+            click.echo(result.markdown_v2.fit_markdown)
+
+    except Exception as e:
+        raise click.ClickException(str(e))
+
+if __name__ == "__main__":
+    cli()
\ No newline at end of file
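Taken together, the `-b`/`-c` overrides are just typed key=value dicts applied on top of the loaded configs with `clone()`. A minimal sketch of that path, calling the callback directly the way the test suite below does (the literal values are illustrative):

```python
from crawl4ai import BrowserConfig
from crawl4ai.cli import parse_key_values

# "-b headless=true,viewport_width=1920" arrives as a raw string;
# the callback turns it into typed Python values...
overrides = parse_key_values(None, None, "headless=true,viewport_width=1920")
assert overrides == {"headless": True, "viewport_width": 1920}

# ...which clone() then layers onto whatever the YAML/JSON config produced.
cfg = BrowserConfig(headless=False).clone(**overrides)
```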
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 3e26514a..51fe4434 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -15,6 +15,8 @@ PROVIDER_MODELS = {
     "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
     "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
     "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini": os.getenv("OPENAI_API_KEY"),
+    "openai/o3-mini-high": os.getenv("OPENAI_API_KEY"),
     "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
     "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 8b9dc9cf..90b726bc 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -533,11 +533,15 @@ class LLMExtractionStrategy(ExtractionStrategy):
         """
         super().__init__(**kwargs)
         self.provider = provider
-        self.api_token = (
-            api_token
-            or PROVIDER_MODELS.get(provider, "no-token")
-            or os.getenv("OPENAI_API_KEY")
-        )
+        if api_token and not api_token.startswith("env:"):
+            self.api_token = api_token
+        elif api_token and api_token.startswith("env:"):
+            self.api_token = os.getenv(api_token[4:])
+        else:
+            self.api_token = (
+                PROVIDER_MODELS.get(provider, "no-token")
+                or os.getenv("OPENAI_API_KEY")
+            )
         self.instruction = instruction
         self.extract_type = extraction_type
         self.schema = schema
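The `env:` prefix gives config files a way to reference tokens without embedding them; the example files added below rely on it (`api_token: "env:OPENAI_API_KEY"`). A standalone rewrite of the resolution rules the branch above implements, for illustration only (not the library code itself):

```python
import os

def resolve_api_token(api_token, provider, provider_models):
    if api_token and not api_token.startswith("env:"):
        return api_token                    # literal token wins
    if api_token and api_token.startswith("env:"):
        return os.getenv(api_token[4:])     # "env:OPENAI_API_KEY" -> $OPENAI_API_KEY
    # fall back to the per-provider defaults, then the OpenAI env var
    return provider_models.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")

assert resolve_api_token("sk-literal", "openai/gpt-4o", {}) == "sk-literal"
os.environ["MY_KEY"] = "sk-from-env"
assert resolve_api_token("env:MY_KEY", "openai/gpt-4o", {}) == "sk-from-env"
```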
diff --git a/crawl4ai/install.py b/crawl4ai/install.py
index 139be591..05179a71 100644
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -2,14 +2,47 @@ import subprocess
 import sys
 import asyncio
 from .async_logger import AsyncLogger, LogLevel
+from pathlib import Path
+import os
+import shutil

 # Initialize logger
 logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

+def setup_home_directory():
+    """Set up the .crawl4ai folder structure in the user's home directory."""
+    base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
+    crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
+    crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
+    crawl4ai_config = crawl4ai_folder / "global.yml"
+    cache_folder = crawl4ai_folder / "cache"
+    content_folders = [
+        "html_content",
+        "cleaned_html",
+        "markdown_content",
+        "extracted_content",
+        "screenshots",
+    ]
+
+    # Clean up old cache if it exists
+    if cache_folder.exists():
+        shutil.rmtree(cache_folder)
+
+    # Create new folder structure
+    crawl4ai_folder.mkdir(exist_ok=True)
+    cache_folder.mkdir(exist_ok=True)
+    for folder in content_folders:
+        (crawl4ai_folder / folder).mkdir(exist_ok=True)
+
+    # If the config file does not exist, create it
+    if not crawl4ai_config.exists():
+        with open(crawl4ai_config, "w") as f:
+            f.write("")
+
 def post_install():
     """Run all post-installation tasks"""
     logger.info("Running post-installation setup...", tag="INIT")
+    setup_home_directory()
     install_playwright()
     run_migration()
     logger.success("Post-installation setup completed!", tag="COMPLETE")
diff --git a/docs/examples/cli/browser.yml b/docs/examples/cli/browser.yml
new file mode 100644
index 00000000..dd6caf60
--- /dev/null
+++ b/docs/examples/cli/browser.yml
@@ -0,0 +1,13 @@
+browser_type: "chromium"
+headless: true
+viewport_width: 1280
+viewport_height: 800
+user_agent_mode: "random"
+verbose: true
+text_mode: false
+light_mode: false
+ignore_https_errors: true
+java_script_enabled: true
+extra_args:
+  - "--disable-gpu"
+  - "--no-sandbox"
\ No newline at end of file
diff --git a/docs/examples/cli/crawler.yml b/docs/examples/cli/crawler.yml
new file mode 100644
index 00000000..61bd6670
--- /dev/null
+++ b/docs/examples/cli/crawler.yml
@@ -0,0 +1,13 @@
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+exclude_external_links: true
+exclude_social_media_links: true
\ No newline at end of file
diff --git a/docs/examples/cli/css_schema.json b/docs/examples/cli/css_schema.json
new file mode 100644
index 00000000..935efeb8
--- /dev/null
+++ b/docs/examples/cli/css_schema.json
@@ -0,0 +1,27 @@
+{
+    "name": "ArticleExtractor",
+    "baseSelector": ".cards[data-tax=news] .card__data",
+    "fields": [
+        {
+            "name": "title",
+            "selector": "h4.card__title",
+            "type": "text"
+        },
+        {
+            "name": "link",
+            "selector": "h4.card__title a",
+            "type": "attribute",
+            "attribute": "href"
+        },
+        {
+            "name": "details",
+            "selector": ".card__details",
+            "type": "text"
+        },
+        {
+            "name": "topics",
+            "selector": ".card__topics.topics",
+            "type": "text"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/docs/examples/cli/extract.yml b/docs/examples/cli/extract.yml
new file mode 100644
index 00000000..be22dd5e
--- /dev/null
+++ b/docs/examples/cli/extract.yml
@@ -0,0 +1,11 @@
+type: "llm"
+provider: "openai/gpt-4o-mini"
+api_token: "env:OPENAI_API_KEY"
+instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
+params:
+  chunk_token_threshold: 4096
+  overlap_rate: 0.1
+  word_token_rate: 0.75
+  temperature: 0.3
+  max_tokens: 1000
+  verbose: true
\ No newline at end of file
diff --git a/docs/examples/cli/extract_css.yml b/docs/examples/cli/extract_css.yml
new file mode 100644
index 00000000..a4004a3e
--- /dev/null
+++ b/docs/examples/cli/extract_css.yml
@@ -0,0 +1,3 @@
+type: "json-css"
+params:
+  verbose: true
\ No newline at end of file
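css_schema.json above has the same shape that `JsonCssExtractionStrategy` consumes directly, so the CLI's `-e extract_css.yml -s css_schema.json` path boils down to roughly this sketch (the file path and URL handling are illustrative):

```python
import json
from crawl4ai import JsonCssExtractionStrategy, CrawlerRunConfig

with open("docs/examples/cli/css_schema.json") as f:
    schema = json.load(f)

# The CLI attaches the strategy to the run config before crawling.
config = CrawlerRunConfig(
    extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
# config is then passed to AsyncWebCrawler.arun(url=..., config=config)
```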
diff --git a/docs/examples/cli/llm_schema.json b/docs/examples/cli/llm_schema.json
new file mode 100644
index 00000000..a6969ccd
--- /dev/null
+++ b/docs/examples/cli/llm_schema.json
@@ -0,0 +1,26 @@
+{
+    "title": "NewsArticle",
+    "type": "object",
+    "properties": {
+        "title": {
+            "type": "string",
+            "description": "The title/headline of the news article"
+        },
+        "link": {
+            "type": "string",
+            "description": "The URL or link to the full article"
+        },
+        "details": {
+            "type": "string",
+            "description": "Brief summary or details about the article content"
+        },
+        "topics": {
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "description": "List of topics or categories associated with the article"
+        }
+    },
+    "required": ["title", "details"]
+}
\ No newline at end of file
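llm_schema.json is a plain JSON Schema handed to `LLMExtractionStrategy`; combined with extract.yml above, the CLI path is roughly equivalent to this sketch (provider and `env:` token values mirror the example files; the instruction is illustrative):

```python
import json
from crawl4ai import LLMExtractionStrategy, CrawlerRunConfig

with open("docs/examples/cli/llm_schema.json") as f:
    schema = json.load(f)

strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token="env:OPENAI_API_KEY",  # resolved via the env: branch added above
    instruction="Extract all articles with their titles and topics",
    schema=schema,
)
config = CrawlerRunConfig(extraction_strategy=strategy)
```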
diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md
new file mode 100644
index 00000000..ff4bf658
--- /dev/null
+++ b/docs/md_v2/core/cli.md
@@ -0,0 +1,304 @@
+# Crawl4AI CLI Guide
+
+## Table of Contents
+- [Basic Usage](#basic-usage)
+- [Quick Example of Advanced Usage](#quick-example-of-advanced-usage)
+- [Configuration](#configuration)
+  - [Browser Configuration](#browser-configuration)
+  - [Crawler Configuration](#crawler-configuration)
+  - [Extraction Configuration](#extraction-configuration)
+- [Advanced Features](#advanced-features)
+  - [LLM Q&A](#llm-qa)
+  - [Structured Data Extraction](#structured-data-extraction)
+  - [Content Filtering](#content-filtering)
+- [Output Formats](#output-formats)
+- [Complete Examples](#complete-examples)
+- [Best Practices & Tips](#best-practices--tips)
+
+## Basic Usage
+
+The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
+
+```bash
+# Basic crawling
+crwl https://example.com
+
+# Get markdown output
+crwl https://example.com -o markdown
+
+# Verbose JSON output with cache bypass
+crwl https://example.com -o json -v --bypass-cache
+
+# See usage examples
+crwl --example
+```
+
+## Quick Example of Advanced Usage
+
+If you clone the repository, the following command returns the page content as JSON, structured according to the JSON-CSS schema:
+
+```bash
+crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json
+```
+
+## Configuration
+
+### Browser Configuration
+
+Browser settings can be configured via a YAML file or command line parameters:
+
+```yaml
+# browser.yml
+headless: true
+viewport_width: 1280
+user_agent_mode: "random"
+verbose: true
+ignore_https_errors: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -B browser.yml
+
+# Using direct parameters
+crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+```
+
+### Crawler Configuration
+
+Control crawling behavior:
+
+```yaml
+# crawler.yml
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -C crawler.yml
+
+# Using direct parameters
+crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+```
+
+### Extraction Configuration
+
+Two types of extraction are supported:
+
+1. CSS/XPath-based extraction:
+```yaml
+# extract_css.yml
+type: "json-css"
+params:
+  verbose: true
+```
+
+```json
+// css_schema.json
+{
+  "name": "ArticleExtractor",
+  "baseSelector": ".article",
+  "fields": [
+    {
+      "name": "title",
+      "selector": "h1.title",
+      "type": "text"
+    },
+    {
+      "name": "link",
+      "selector": "a.read-more",
+      "type": "attribute",
+      "attribute": "href"
+    }
+  ]
+}
```

+
+2. LLM-based extraction:
+```yaml
+# extract_llm.yml
+type: "llm"
+provider: "openai/gpt-4"
+instruction: "Extract all articles with their titles and links"
+api_token: "your-token"
+params:
+  temperature: 0.3
+  max_tokens: 1000
+```
+
+```json
+// llm_schema.json
+{
+  "title": "Article",
+  "type": "object",
+  "properties": {
+    "title": {
+      "type": "string",
+      "description": "The title of the article"
+    },
+    "link": {
+      "type": "string",
+      "description": "URL to the full article"
+    }
+  }
+}
+```
+
+## Advanced Features
+
+### LLM Q&A
+
+Ask questions about crawled content:
+
+```bash
+# Simple question
+crwl https://example.com -q "What is the main topic discussed?"
+
+# View content then ask questions
+crwl https://example.com -o markdown  # See content first
+crwl https://example.com -q "Summarize the key points"
+crwl https://example.com -q "What are the conclusions?"
+
+# Combined with advanced crawling
+crwl https://example.com \
+    -B browser.yml \
+    -c "css_selector=article,scan_full_page=true" \
+    -q "What are the pros and cons mentioned?"
+```
+
+First-time setup:
+- Prompts for LLM provider and API token
+- Saves configuration in `~/.crawl4ai/global.yml`
+- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
+- `ollama` providers do not require an API token
+- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for the full list
+
+### Structured Data Extraction
+
+Extract structured data using CSS selectors:
+
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json
+```
+
+Or using LLM-based extraction:
+
+```bash
+crwl https://example.com \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -o json
+```
+
+### Content Filtering
+
+Filter content for relevance:
+
+```yaml
+# filter_bm25.yml
+type: "bm25"
+query: "target content"
+threshold: 1.0
+
+# filter_pruning.yml
+type: "pruning"
+query: "focus topic"
+threshold: 0.48
+```
+
+```bash
+crwl https://example.com -f filter_bm25.yml -o markdown-fit
+```
+
+## Output Formats
+
+- `all` - Full crawl result including metadata
+- `json` - Extracted structured data (when using extraction)
+- `markdown` / `md` - Raw markdown output
+- `markdown-fit` / `md-fit` - Filtered markdown for better readability
+
+## Complete Examples
+
+1. Basic Extraction:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -C crawler.yml \
+    -o json
+```
+
+2. Structured Data Extraction:
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json \
+    -v
+```
+
+3. LLM Extraction with Filtering:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -f filter_bm25.yml \
+    -o json
+```
+
+4. Interactive Q&A:
+```bash
+# First crawl and view
+crwl https://example.com -o markdown
+
+# Then ask questions
+crwl https://example.com -q "What are the main points?"
+crwl https://example.com -q "Summarize the conclusions"
+```
+
+## Best Practices & Tips
+
+1. **Configuration Management**:
+   - Keep common configurations in YAML files
+   - Use CLI parameters for quick overrides
+   - Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
+
+2. **Performance Optimization**:
+   - Use `--bypass-cache` for fresh content
+   - Enable `scan_full_page` for infinite scroll pages
+   - Adjust `delay_before_return_html` for dynamic content
+
+3. **Content Extraction**:
+   - Use CSS extraction for structured content
+   - Use LLM extraction for unstructured content
+   - Combine with filters for focused results
+
+4. **Q&A Workflow**:
+   - View content first with `-o markdown`
+   - Ask specific questions
+   - Use broader context with appropriate selectors
+
+## Recap
+
+The Crawl4AI CLI provides:
+- Flexible configuration via files and parameters
+- Multiple extraction strategies (CSS, XPath, LLM)
+- Content filtering and optimization
+- Interactive Q&A capabilities
+- Various output formats
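For readers mapping CLI flags onto the library, a rough Python equivalent of `crwl https://example.com -B browser.yml -C crawler.yml -o markdown` under the configs documented above (file names are the example files; error handling omitted, and the `load()` path is the one reworked in async_configs.py):

```python
import anyio
import yaml
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    with open("browser.yml") as f:
        browser_cfg = BrowserConfig.load(yaml.safe_load(f))
    with open("crawler.yml") as f:
        crawler_cfg = CrawlerRunConfig.load(yaml.safe_load(f))

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url="https://example.com", config=crawler_cfg)
        print(result.markdown_v2.raw_markdown)

anyio.run(main)
```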
diff --git a/mkdocs.yml b/mkdocs.yml
index 16f44b05..7b2f18c6 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,6 +15,7 @@ nav:
     - "Blog Home": "blog/index.md"
     - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
   - Core:
+    - "Command Line Interface": "core/cli.md"
     - "Simple Crawling": "core/simple-crawling.md"
     - "Crawler Result": "core/crawler-result.md"
     - "Browser & Crawler Config": "core/browser-crawler-config.md"
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
new file mode 100644
index 00000000..b7416dc2
--- /dev/null
+++ b/tests/cli/test_cli.py
@@ -0,0 +1,133 @@
+import pytest
+from click.testing import CliRunner
+from pathlib import Path
+import json
+import yaml
+from crawl4ai.cli import cli, load_config_file, parse_key_values
+import tempfile
+import os
+import click
+
+@pytest.fixture
+def runner():
+    return CliRunner()
+
+@pytest.fixture
+def temp_config_dir():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        old_home = os.environ.get('HOME')
+        os.environ['HOME'] = tmpdir
+        yield Path(tmpdir)
+        if old_home:
+            os.environ['HOME'] = old_home
+        else:
+            del os.environ['HOME']
+
+@pytest.fixture
+def sample_configs(temp_config_dir):
+    configs = {
+        'browser.yml': {
+            'headless': True,
+            'viewport_width': 1280,
+            'user_agent_mode': 'random'
+        },
+        'crawler.yml': {
+            'cache_mode': 'bypass',
+            'wait_until': 'networkidle',
+            'scan_full_page': True
+        },
+        'extract_css.yml': {
+            'type': 'json-css',
+            'params': {'verbose': True}
+        },
+        'css_schema.json': {
+            'name': 'ArticleExtractor',
+            'baseSelector': '.article',
+            'fields': [
+                {'name': 'title', 'selector': 'h1.title', 'type': 'text'},
+                {'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
+            ]
+        }
+    }
+
+    for filename, content in configs.items():
+        path = temp_config_dir / filename
+        with open(path, 'w') as f:
+            if filename.endswith('.yml'):
+                yaml.dump(content, f)
+            else:
+                json.dump(content, f)
+
+    return {name: str(temp_config_dir / name) for name in configs}
+
+class TestCLIBasics:
+    def test_help(self, runner):
+        result = runner.invoke(cli, ['--help'])
+        assert result.exit_code == 0
+        assert 'Crawl4AI CLI' in result.output
+
+    def test_examples(self, runner):
+        result = runner.invoke(cli, ['--example'])
+        assert result.exit_code == 0
+        assert 'Examples' in result.output
+
+    def test_missing_url(self, runner):
+        result = runner.invoke(cli)
+        assert result.exit_code != 0
+        assert 'URL argument is required' in result.output
+
+class TestConfigParsing:
+    def test_parse_key_values_basic(self):
+        result = parse_key_values(None, None, "key1=value1,key2=true")
+        assert result == {'key1': 'value1', 'key2': True}
+
+    def test_parse_key_values_invalid(self):
+        with pytest.raises(click.BadParameter):
+            parse_key_values(None, None, "invalid_format")
+
+class TestConfigLoading:
+    def test_load_yaml_config(self, sample_configs):
+        config = load_config_file(sample_configs['browser.yml'])
+        assert config['headless'] is True
+        assert config['viewport_width'] == 1280
+
+    def test_load_json_config(self, sample_configs):
+        config = load_config_file(sample_configs['css_schema.json'])
+        assert config['name'] == 'ArticleExtractor'
+        assert len(config['fields']) == 2
+
+    def test_load_nonexistent_config(self):
+        with pytest.raises(click.BadParameter):
+            load_config_file('nonexistent.yml')
+
+class TestLLMConfig:
+    def test_llm_config_creation(self, temp_config_dir, runner):
+        def input_simulation(inputs):
+            return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
+                                 input='\n'.join(inputs))
+
+class TestCrawlingFeatures:
+    def test_basic_crawl(self, runner):
+        result = runner.invoke(cli, ['https://example.com'])
+        assert result.exit_code == 0
+
+class TestErrorHandling:
+    def test_invalid_config_file(self, runner):
+        result = runner.invoke(cli, [
+            'https://example.com',
+            '--browser-config', 'nonexistent.yml'
+        ])
+        assert result.exit_code != 0
+
+    def test_invalid_schema(self, runner, temp_config_dir):
+        invalid_schema = temp_config_dir / 'invalid_schema.json'
+        with open(invalid_schema, 'w') as f:
+            f.write('invalid json')
+
+        result = runner.invoke(cli, [
+            'https://example.com',
+            '--schema', str(invalid_schema)
+        ])
+        assert result.exit_code != 0
+
+if __name__ == '__main__':
+    pytest.main(['-v', '-s', '--tb=native', __file__])
\ No newline at end of file
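One case worth covering beyond the tests above: list values that contain commas, which depend on the bracket-aware splitting in `parse_key_values`. A sketch of such a test in the file's existing style (a hypothetical addition, not part of the patch itself):

```python
# Hypothetical extra case for TestConfigParsing: a bracketed list value
# must survive the comma split and come back as a Python list.
from crawl4ai.cli import parse_key_values

def test_parse_key_values_list():
    result = parse_key_values(
        None, None, "headless=true,extra_args=[--disable-gpu,--no-sandbox]"
    )
    assert result["headless"] is True
    assert result["extra_args"] == ["--disable-gpu", "--no-sandbox"]
```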