feat(cli): add command-line interface with comprehensive features

Implements a full-featured CLI for Crawl4AI with the following capabilities:
- Basic and advanced web crawling
- Configuration management via YAML/JSON files
- Multiple extraction strategies (CSS, XPath, LLM)
- Content filtering and optimization
- Interactive Q&A capabilities
- Various output formats
- Comprehensive documentation and examples

Also includes:
- Home directory setup for configuration and cache
- Environment variable support for API tokens
- Test suite for CLI functionality
This commit is contained in:
UncleCode
2025-02-10 16:58:52 +08:00
parent 467be9ac76
commit 91a5fea11f
14 changed files with 983 additions and 7 deletions

View File

@@ -0,0 +1,13 @@
# Example browser settings for the Crawl4AI CLI.
# NOTE(review): these keys presumably map one-to-one onto the library's browser
# configuration options; confirm against the loader before renaming any of them.
browser_type: "chromium"
headless: true
# Viewport dimensions; presumably in pixels (TODO confirm).
viewport_width: 1280
viewport_height: 800
user_agent_mode: "random"
verbose: true
text_mode: false
light_mode: false
# NOTE(review): judging by the name, this makes the browser accept invalid TLS
# certificates. Convenient for a demo, but reconsider before reusing this file
# against untrusted sites.
ignore_https_errors: true
java_script_enabled: true
# Additional command-line switches for the browser (presumably applied at
# launch; confirm).
# NOTE(review): "--no-sandbox" is the Chromium switch that turns off its process
# sandbox. It is usually only needed inside containers/CI; drop it elsewhere.
extra_args:
- "--disable-gpu"
- "--no-sandbox"

View File

@@ -0,0 +1,13 @@
# Example per-crawl settings for the Crawl4AI CLI.
# NOTE(review): these keys presumably map onto the library's crawler run
# configuration options; confirm against the loader before renaming any of them.
cache_mode: "bypass"
wait_until: "networkidle"
# NOTE(review): units are not stated in this file. 30000 looks like
# milliseconds, while the fractional delays below look like seconds (TODO confirm).
page_timeout: 30000
delay_before_return_html: 0.5
word_count_threshold: 100
scan_full_page: true
scroll_delay: 0.3
process_iframes: false
remove_overlay_elements: true
magic: true
verbose: true
# Link filtering switches.
exclude_external_links: true
exclude_social_media_links: true

View File

@@ -0,0 +1,27 @@
{
  "name": "ArticleExtractor",
  "baseSelector": ".cards[data-tax=news] .card__data",
  "fields": [
    {
      "name": "title",
      "selector": "h4.card__title",
      "type": "text"
    },
    {
      "name": "link",
      "selector": "h4.card__title a",
      "type": "attribute",
      "attribute": "href"
    },
    {
      "name": "details",
      "selector": ".card__details",
      "type": "text"
    },
    {
      "name": "topics",
      "selector": ".card__topics.topics",
      "type": "text"
    }
  ]
}

View File

@@ -0,0 +1,11 @@
# Example LLM-based extraction strategy for the Crawl4AI CLI.
type: "llm"
provider: "openai/gpt-4o-mini"
# The "env:" prefix names an environment variable instead of embedding the
# secret in this file (presumably resolved by the CLI when loading; confirm).
api_token: "env:OPENAI_API_KEY"
instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
# Strategy options. They must be nested under "params": with the entries at
# column 0, "params" parses as null and every option becomes a top-level key.
# NOTE(review): nesting of all six entries is reconstructed from the empty
# "params:" key; confirm none of them is meant to be top-level.
params:
  chunk_token_threshold: 4096
  overlap_rate: 0.1
  word_token_rate: 0.75
  temperature: 0.3
  max_tokens: 1000
  verbose: true

View File

@@ -0,0 +1,3 @@
# Example JSON/CSS extraction strategy for the Crawl4AI CLI.
type: "json-css"
# Strategy options. "verbose" must be nested under "params": at column 0,
# "params" parses as null and "verbose" becomes an unrelated top-level key.
params:
  verbose: true

View File

@@ -0,0 +1,26 @@
{
  "title": "NewsArticle",
  "type": "object",
  "properties": {
    "title": {
      "type": "string",
      "description": "The title/headline of the news article"
    },
    "link": {
      "type": "string",
      "description": "The URL or link to the full article"
    },
    "details": {
      "type": "string",
      "description": "Brief summary or details about the article content"
    },
    "topics": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "List of topics or categories associated with the article"
    }
  },
  "required": [
    "title",
    "details"
  ]
}