feat(cli): add browser profile management functionality
Adds a new interactive browser profile management system that allows users to: - Create and manage browser profiles for authenticated crawling - List existing profiles with detailed information - Delete unused profiles - Use profiles during crawling with the new -p/--profile flag. Also restructures the CLI to use Click groups and adds the humanize dependency for better size formatting.
This commit is contained in:
331
crawl4ai/cli.py
331
crawl4ai/cli.py
@@ -1,9 +1,20 @@
|
||||
import click
|
||||
import os
|
||||
from typing import Dict, Any, Optional
|
||||
import time
|
||||
import datetime
|
||||
import sys
|
||||
import shutil
|
||||
import humanize
|
||||
from typing import Dict, Any, Optional, List
|
||||
import json
|
||||
import yaml
|
||||
import anyio
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.prompt import Prompt, Confirm
|
||||
from rich.style import Style
|
||||
|
||||
from crawl4ai import (
|
||||
CacheMode,
|
||||
AsyncWebCrawler,
|
||||
@@ -14,13 +25,17 @@ from crawl4ai import (
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
PruningContentFilter
|
||||
PruningContentFilter,
|
||||
BrowserProfiler
|
||||
)
|
||||
from litellm import completion
|
||||
from pathlib import Path
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
# Initialize rich console
|
||||
console = Console()
|
||||
|
||||
def get_global_config() -> dict:
|
||||
config_dir = Path.home() / ".crawl4ai"
|
||||
config_file = config_dir / "global.yml"
|
||||
@@ -172,7 +187,21 @@ def show_examples():
|
||||
# Crawler settings
|
||||
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
|
||||
|
||||
4️⃣ Sample Config Files:
|
||||
4️⃣ Profile Management for Identity-Based Crawling:
|
||||
# Launch interactive profile manager
|
||||
crwl profiles
|
||||
|
||||
# Create, list, and delete browser profiles for identity-based crawling
|
||||
# Use a profile for crawling (keeps you logged in)
|
||||
crwl https://example.com -p my-profile-name
|
||||
|
||||
# Example: Crawl a site that requires login
|
||||
# 1. First create a profile and log in:
|
||||
crwl profiles
|
||||
# 2. Then use that profile to crawl the authenticated site:
|
||||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||||
|
||||
5️⃣ Sample Config Files:
|
||||
|
||||
browser.yml:
|
||||
headless: true
|
||||
@@ -230,7 +259,7 @@ llm_schema.json:
|
||||
}
|
||||
}
|
||||
|
||||
5️⃣ Advanced Usage:
|
||||
6️⃣ Advanced Usage:
|
||||
# Combine configs with direct parameters
|
||||
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
|
||||
|
||||
@@ -248,9 +277,15 @@ llm_schema.json:
|
||||
-f filter_bm25.yml \\
|
||||
-o markdown-fit
|
||||
|
||||
# Authenticated crawling with profile
|
||||
crwl https://login-required-site.com \\
|
||||
-p my-authenticated-profile \\
|
||||
-c "css_selector=.dashboard-content" \\
|
||||
-o markdown
|
||||
|
||||
For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
|
||||
6️⃣ Q&A with LLM:
|
||||
7️⃣ Q&A with LLM:
|
||||
# Ask a question about the content
|
||||
crwl https://example.com -q "What is the main topic discussed?"
|
||||
|
||||
@@ -277,12 +312,173 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
- google/gemini-pro
|
||||
|
||||
See full list of providers: https://docs.litellm.ai/docs/providers
|
||||
|
||||
8️⃣ Profile Management:
|
||||
# Launch interactive profile manager
|
||||
crwl profiles
|
||||
|
||||
# Create a profile and use it for crawling
|
||||
crwl profiles # Create and set up your profile interactively
|
||||
crwl https://example.com -p my-profile-name # Use profile for crawling
|
||||
|
||||
# Example workflow for authenticated site
|
||||
# 1. First create a profile and log in to the site:
|
||||
crwl profiles # Select "Create new profile" option
|
||||
# 2. Then use that profile to crawl authenticated content:
|
||||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||||
"""
|
||||
click.echo(examples)
|
||||
|
||||
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
|
||||
@click.argument("url", required=False)
|
||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||
def get_directory_size(path: str) -> int:
    """Calculate the total size of a directory in bytes.

    Walks the tree rooted at ``path`` with ``os.walk`` and sums the size of
    every file that is not a symbolic link.

    Args:
        path: Directory to measure.

    Returns:
        The summed size in bytes. The result is best-effort: a file whose
        size cannot be read (for example because it was removed or became
        unreadable while the walk was in progress) is skipped instead of
        aborting the whole calculation. A path that ``os.walk`` cannot
        traverse yields 0.
    """
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            if os.path.islink(file_path):
                # Symlinks are not counted (same as before).
                continue
            try:
                total_size += os.path.getsize(file_path)
            except OSError:
                # The file disappeared or cannot be stat'ed; one bad entry
                # should not prevent reporting the rest of the directory.
                continue
    return total_size
|
||||
|
||||
def display_profiles_table(profiles: List[Dict[str, Any]]):
    """Print the given browser profiles as a rich table on the module console.

    Each profile dict is read through the keys ``"name"``, ``"path"``,
    ``"created"`` and ``"type"``. ``"created"`` must support ``strftime``
    (presumably a datetime -- confirm against BrowserProfiler.list_profiles).
    When ``profiles`` is empty a notice panel is printed instead of a table.
    """
    if not profiles:
        empty_notice = Panel(
            "[yellow]No profiles found. Create one with the 'create' command.[/yellow]",
            title="Browser Profiles",
            border_style="blue",
        )
        console.print(empty_notice)
        return

    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")

    # (header, add_column keyword arguments), in display order.
    column_specs = (
        ("#", {"style": "dim", "width": 4}),
        ("Name", {"style": "cyan", "no_wrap": True}),
        ("Path", {"style": "green"}),
        ("Created", {"style": "yellow"}),
        ("Browser", {"style": "magenta"}),
        ("Size", {"style": "blue", "justify": "right"}),
    )
    for header, column_options in column_specs:
        table.add_column(header, **column_options)

    for row_number, entry in enumerate(profiles, start=1):
        # Size on disk of the profile folder, rendered human-readably.
        size_label = humanize.naturalsize(get_directory_size(entry["path"]))

        # Creation timestamp, minute precision.
        created_label = entry["created"].strftime("%Y-%m-%d %H:%M")

        table.add_row(
            str(row_number),
            entry["name"],
            entry["path"],
            created_label,
            entry["type"].capitalize(),
            size_label,
        )

    console.print(table)
|
||||
|
||||
async def create_profile_interactive(profiler: BrowserProfiler):
    """Prompt for a profile name and create that profile through ``profiler``.

    Prints an introduction panel, asks for a name (the default is
    ``profile_<current unix time>``), then awaits
    ``profiler.create_profile(name)``. A truthy result is reported as the
    created profile's location, a falsy one as a failure; any exception is
    caught and printed rather than propagated.
    """
    intro_text = (
        "[bold cyan]Create Browser Profile[/bold cyan]\n"
        "This will open a browser window for you to set up your identity.\n"
        "Log in to sites, adjust settings, then press 'q' to save."
    )
    console.print(Panel(intro_text, border_style="cyan"))

    suggested_name = f"profile_{int(time.time())}"
    profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=suggested_name)

    console.print("[cyan]Creating profile...[/cyan]")
    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")

    try:
        created_path = await profiler.create_profile(profile_name)

        if not created_path:
            console.print("[red]Failed to create profile.[/red]")
        else:
            console.print(f"[green]Profile successfully created at:[/green] {created_path}")
    except Exception as exc:
        # Keep the interactive session alive: report the problem and return.
        console.print(f"[red]Error creating profile: {exc}[/red]")
|
||||
|
||||
def delete_profile_interactive(profiler: BrowserProfiler):
    """Let the user pick one existing profile by number and delete it.

    Lists the profiles returned by ``profiler.list_profiles()``, asks for a
    1-based row number, asks for confirmation, and then calls
    ``profiler.delete_profile(path)``, printing whether it reported success.
    Returns early with a notice when there are no profiles.
    """
    profiles = profiler.list_profiles()
    if not profiles:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return

    # Show the numbered table the user will choose from.
    display_profiles_table(profiles)

    valid_numbers = [str(number) for number in range(1, len(profiles) + 1)]
    answer = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=valid_numbers,
        show_choices=False,
    )

    try:
        # Displayed numbers are 1-based; the list is 0-based.
        selected = profiles[int(answer) - 1]
        selected_name = selected["name"]

        if not Confirm.ask(f"[red]Are you sure you want to delete profile '{selected_name}'?[/red]"):
            return

        if profiler.delete_profile(selected["path"]):
            console.print(f"[green]Profile '{selected_name}' deleted successfully.[/green]")
        else:
            console.print(f"[red]Failed to delete profile '{selected_name}'.[/red]")
    except (ValueError, IndexError):
        console.print("[red]Invalid selection.[/red]")
|
||||
|
||||
async def manage_profiles():
    """Run the interactive profile manager menu until the user chooses Exit.

    Creates a ``BrowserProfiler`` and loops over a four-entry menu: list,
    create, delete, exit. A blank separator is printed after every action
    except exit.
    """
    profiler = BrowserProfiler()

    menu = {
        "1": "List profiles",
        "2": "Create new profile",
        "3": "Delete profile",
        "4": "Exit",
    }
    # Colour of each menu key; keys not listed here are shown in cyan.
    key_colors = {"1": "green", "2": "yellow", "3": "red"}

    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))

        for key, label in menu.items():
            color = key_colors.get(key, "cyan")
            console.print(f"[{color}]{key}[/{color}]. {label}")

        choice = Prompt.ask("Enter choice", choices=list(menu), default="1")

        if choice == "4":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break

        if choice == "1":
            display_profiles_table(profiler.list_profiles())
        elif choice == "2":
            await create_profile_interactive(profiler)
        elif choice == "3":
            delete_profile_interactive(profiler)

        # Visual gap before the menu is shown again.
        console.print("\n")
|
||||
|
||||
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    # Intentionally empty: this group only hosts the subcommands registered
    # on it via @cli.command; the docstring above is the group's help text.
|
||||
|
||||
@cli.command("crawl")
|
||||
@click.argument("url", required=True)
|
||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
|
||||
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
|
||||
@@ -291,25 +487,43 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
|
||||
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
|
||||
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
|
||||
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool):
|
||||
"""Crawl4AI CLI - Web content extraction tool
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
"""Crawl a website and extract content
|
||||
|
||||
Simple Usage:
|
||||
crwl https://example.com
|
||||
crwl crawl https://example.com
|
||||
"""
|
||||
|
||||
Run with --example to see detailed usage examples."""
|
||||
# Handle profile option
|
||||
if profile:
|
||||
profiler = BrowserProfiler()
|
||||
profile_path = profiler.get_profile_path(profile)
|
||||
|
||||
if example:
|
||||
show_examples()
|
||||
return
|
||||
if not profile_path:
|
||||
profiles = profiler.list_profiles()
|
||||
|
||||
if not url:
|
||||
raise click.UsageError("URL argument is required unless using --example")
|
||||
if profiles:
|
||||
console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
|
||||
display_profiles_table(profiles)
|
||||
else:
|
||||
console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")
|
||||
|
||||
return
|
||||
|
||||
# Include the profile in browser config
|
||||
if not browser:
|
||||
browser = {}
|
||||
browser["user_data_dir"] = profile_path
|
||||
browser["use_managed_browser"] = True
|
||||
|
||||
if verbose:
|
||||
console.print(f"[green]Using browser profile:[/green] {profile}")
|
||||
|
||||
try:
|
||||
# Load base configurations
|
||||
@@ -401,5 +615,82 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
|
||||
@cli.command("examples")
def examples_cmd():
    """Show usage examples"""
    # Thin subcommand wrapper: the examples text is printed by show_examples().
    show_examples()
|
||||
|
||||
@cli.command("profiles")
def profiles_cmd():
    """Manage browser profiles interactively

    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    """
    # manage_profiles is a coroutine function, so drive it to completion
    # with anyio from this synchronous Click callback.
    anyio.run(manage_profiles)
|
||||
|
||||
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
            extraction_config: str, schema: str, browser: Dict, crawler: Dict,
            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
        crwl https://example.com

    Run with --example to see detailed usage examples.

    Other commands:
        crwl profiles - Manage browser profiles for identity-based crawling
        crwl crawl - Crawl a website with advanced options
        crwl examples - Show more usage examples
    """
    # NOTE(review): --bypass-cache is a flag whose default is already True, so
    # passing it does not appear to change anything -- confirm the intent.
    # NOTE(review): this callback is registered as an ordinary subcommand of
    # the `cli` group; confirm that a bare `crwl <url>` really reaches it.

    # --example short-circuits everything else.
    if example:
        show_examples()
        return

    ctx = click.get_current_context()

    if not url:
        # No URL given: print the help text without raising a usage error.
        click.echo(ctx.get_help())
        return

    # Hand every crawl-related option over to the `crawl` subcommand
    # (--example is consumed here and is not forwarded).
    forwarded_options = {
        "url": url,
        "browser_config": browser_config,
        "crawler_config": crawler_config,
        "filter_config": filter_config,
        "extraction_config": extraction_config,
        "schema": schema,
        "browser": browser,
        "crawler": crawler,
        "output": output,
        "bypass_cache": bypass_cache,
        "question": question,
        "verbose": verbose,
        "profile": profile,
    }
    ctx.invoke(crawl_cmd, **forwarded_options)
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: hand control to the Click command group.
    cli()
|
||||
@@ -41,7 +41,8 @@ dependencies = [
|
||||
"click>=8.1.7",
|
||||
"pyperclip>=1.8.2",
|
||||
"faust-cchardet>=2.1.19",
|
||||
"aiohttp>=3.11.11"
|
||||
"aiohttp>=3.11.11",
|
||||
"humanize>=4.10.0"
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
|
||||
Reference in New Issue
Block a user