feat(cli): add command line interface with comprehensive features
Implements a full-featured CLI for Crawl4AI with the following capabilities:

- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:

- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
This commit is contained in:
13
docs/examples/cli/browser.yml
Normal file
13
docs/examples/cli/browser.yml
Normal file
@@ -0,0 +1,13 @@
# Example browser configuration for the Crawl4AI CLI
# (docs/examples/cli/browser.yml).
browser_type: "chromium"
headless: true

# Viewport size; presumably in pixels -- confirm against the browser docs.
viewport_width: 1280
viewport_height: 800

user_agent_mode: "random"
verbose: true
text_mode: false
light_mode: false

# NOTE(review): presumably disables HTTPS certificate validation. Acceptable
# in an example; reconsider before crawling sites where that matters.
ignore_https_errors: true
java_script_enabled: true

# Extra command-line flags; presumably forwarded to the browser at launch.
extra_args:
  - "--disable-gpu"
  # NOTE(review): --no-sandbox turns off Chromium's process sandbox; it is
  # typically only needed when running as root inside a container.
  - "--no-sandbox"
13
docs/examples/cli/crawler.yml
Normal file
13
docs/examples/cli/crawler.yml
Normal file
@@ -0,0 +1,13 @@
# Example crawler run configuration for the Crawl4AI CLI
# (docs/examples/cli/crawler.yml).
cache_mode: "bypass"
wait_until: "networkidle"

# Presumably milliseconds (30000 = 30 s) -- TODO confirm.
page_timeout: 30000
# Presumably seconds -- TODO confirm.
delay_before_return_html: 0.5

word_count_threshold: 100

scan_full_page: true
# Presumably seconds between scroll steps when scan_full_page is on -- verify.
scroll_delay: 0.3

process_iframes: false
remove_overlay_elements: true
magic: true
verbose: true

exclude_external_links: true
exclude_social_media_links: true
27
docs/examples/cli/css_schema.json
Normal file
27
docs/examples/cli/css_schema.json
Normal file
@@ -0,0 +1,27 @@
{
  "name": "ArticleExtractor",
  "baseSelector": ".cards[data-tax=news] .card__data",
  "fields": [
    {
      "name": "title",
      "selector": "h4.card__title",
      "type": "text"
    },
    {
      "name": "link",
      "selector": "h4.card__title a",
      "type": "attribute",
      "attribute": "href"
    },
    {
      "name": "details",
      "selector": ".card__details",
      "type": "text"
    },
    {
      "name": "topics",
      "selector": ".card__topics.topics",
      "type": "text"
    }
  ]
}
11
docs/examples/cli/extract.yml
Normal file
11
docs/examples/cli/extract.yml
Normal file
@@ -0,0 +1,11 @@
# Example LLM extraction-strategy configuration for the Crawl4AI CLI
# (docs/examples/cli/extract.yml).
type: "llm"
provider: "openai/gpt-4o-mini"

# The "env:" prefix presumably tells the CLI to read the token from the named
# environment variable, so no secret is stored in this file -- confirm.
api_token: "env:OPENAI_API_KEY"

instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"

# Strategy parameters. These keys must be indented under `params`; at column 0
# they would become top-level keys and `params` itself would be null.
# NOTE(review): nesting reconstructed from a flattened listing -- confirm that
# `verbose` belongs here rather than at the top level.
params:
  chunk_token_threshold: 4096
  overlap_rate: 0.1
  word_token_rate: 0.75
  temperature: 0.3
  max_tokens: 1000
  verbose: true
3
docs/examples/cli/extract_css.yml
Normal file
3
docs/examples/cli/extract_css.yml
Normal file
@@ -0,0 +1,3 @@
# Example CSS-based extraction-strategy configuration for the Crawl4AI CLI
# (docs/examples/cli/extract_css.yml).
type: "json-css"

# `verbose` must be indented under `params`; at column 0 it would become a
# top-level key and `params` would be null.
params:
  verbose: true
26
docs/examples/cli/llm_schema.json
Normal file
26
docs/examples/cli/llm_schema.json
Normal file
@@ -0,0 +1,26 @@
{
  "title": "NewsArticle",
  "type": "object",
  "properties": {
    "title": {
      "type": "string",
      "description": "The title/headline of the news article"
    },
    "link": {
      "type": "string",
      "description": "The URL or link to the full article"
    },
    "details": {
      "type": "string",
      "description": "Brief summary or details about the article content"
    },
    "topics": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "List of topics or categories associated with the article"
    }
  },
  "required": ["title", "details"]
}
Reference in New Issue
Block a user