feat(cli): add command line interface with comprehensive features

Implements a full-featured CLI for Crawl4AI with the following capabilities:

- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:

- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
2025-02-10 16:58:52 +08:00
parent 467be9ac76
commit 91a5fea11f
14 changed files with 983 additions and 7 deletions
--- a/docs/examples/cli/browser.yml
+++ b/docs/examples/cli/browser.yml
@@ -0,0 +1,13 @@
+browser_type: "chromium"
+headless: true
+viewport_width: 1280
+viewport_height: 800
+user_agent_mode: "random"
+verbose: true
+text_mode: false
+light_mode: false
+ignore_https_errors: true
+java_script_enabled: true
+extra_args:
+  - "--disable-gpu"
+  - "--no-sandbox"
--- a/docs/examples/cli/crawler.yml
+++ b/docs/examples/cli/crawler.yml
@@ -0,0 +1,13 @@
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+exclude_external_links: true
+exclude_social_media_links: true
--- a/docs/examples/cli/css_schema.json
+++ b/docs/examples/cli/css_schema.json
@@ -0,0 +1,27 @@
+{
+  "name": "ArticleExtractor",
+  "baseSelector": ".cards[data-tax=news] .card__data",
+  "fields": [
+    {
+      "name": "title",
+      "selector": "h4.card__title",
+      "type": "text"
+    },
+    {
+      "name": "link",
+      "selector": "h4.card__title a", 
+      "type": "attribute",
+      "attribute": "href"
+    },
+    {
+      "name": "details",
+      "selector": ".card__details",
+      "type": "text"
+    },
+    {
+      "name": "topics",
+      "selector": ".card__topics.topics",
+      "type": "text"
+    }
+  ]
+}
--- a/docs/examples/cli/extract.yml
+++ b/docs/examples/cli/extract.yml
@@ -0,0 +1,11 @@
+type: "llm"
+provider: "openai/gpt-4o-mini"
+api_token: "env:OPENAI_API_KEY"
+instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
+params:
+  chunk_token_threshold: 4096
+  overlap_rate: 0.1
+  word_token_rate: 0.75
+  temperature: 0.3
+  max_tokens: 1000
+  verbose: true
--- a/docs/examples/cli/extract_css.yml
+++ b/docs/examples/cli/extract_css.yml
@@ -0,0 +1,3 @@
+type: "json-css"
+params:
+  verbose: true 
--- a/docs/examples/cli/llm_schema.json
+++ b/docs/examples/cli/llm_schema.json
@@ -0,0 +1,26 @@
+{
+  "title": "NewsArticle",
+  "type": "object",
+  "properties": {
+    "title": {
+      "type": "string",
+      "description": "The title/headline of the news article"
+    },
+    "link": {
+      "type": "string",
+      "description": "The URL or link to the full article"
+    },
+    "details": {
+      "type": "string", 
+      "description": "Brief summary or details about the article content"
+    },
+    "topics": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      },
+      "description": "List of topics or categories associated with the article"
+    }
+  },
+  "required": ["title", "details"]
+}