feat(cli): add browser profile management functionality

Adds a new interactive browser profile management system that allows users to:
- Create and manage browser profiles for authenticated crawling
- List existing profiles with detailed information
- Delete unused profiles
- Use profiles during crawling with the new -p/--profile flag

Also restructures the CLI to use Click groups and adds the humanize dependency for better size formatting.
2025-03-02 20:54:45 +08:00
parent cba4a466e5
commit 95175cb394
2 changed files with 316 additions and 24 deletions
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1,9 +1,20 @@
 import click
 import os
-from typing import Dict, Any, Optional
+import time
 import datetime
 import sys
 import shutil
 import humanize
 from typing import Dict, Any, Optional, List
 import json
 import yaml
 import anyio
 from rich.console import Console
 from rich.table import Table
 from rich.panel import Panel
 from rich.prompt import Prompt, Confirm
 from rich.style import Style
 from crawl4ai import (
    CacheMode,
    AsyncWebCrawler, 
@@ -14,13 +25,17 @@ from crawl4ai import (
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    BM25ContentFilter, 
-    PruningContentFilter
+    PruningContentFilter,
    BrowserProfiler
 )
 from litellm import completion
 from pathlib import Path
 from crawl4ai.async_configs import LlmConfig
 # Initialize rich console
 console = Console()
 def get_global_config() -> dict:
    config_dir = Path.home() / ".crawl4ai"
    config_file = config_dir / "global.yml"
@@ -172,7 +187,21 @@ def show_examples():
    # Crawler settings
    crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
-4️⃣  Sample Config Files:
+4️⃣  Profile Management for Identity-Based Crawling:
    # Launch interactive profile manager
    crwl profiles
    # Create, list, and delete browser profiles for identity-based crawling
    # Use a profile for crawling (keeps you logged in)
    crwl https://example.com -p my-profile-name
    # Example: Crawl a site that requires login
    # 1. First create a profile and log in:
    crwl profiles
    # 2. Then use that profile to crawl the authenticated site:
    crwl https://site-requiring-login.com/dashboard -p my-profile-name
 5️⃣  Sample Config Files:
 browser.yml:
    headless: true
@@ -230,7 +259,7 @@ llm_schema.json:
      }
    }
-5️⃣  Advanced Usage:
+6️⃣  Advanced Usage:
    # Combine configs with direct parameters
    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
@@ -248,9 +277,15 @@ llm_schema.json:
        -f filter_bm25.yml \\
        -o markdown-fit
    # Authenticated crawling with profile
    crwl https://login-required-site.com \\
        -p my-authenticated-profile \\
        -c "css_selector=.dashboard-content" \\
        -o markdown
 For more documentation visit: https://github.com/unclecode/crawl4ai
-6️⃣  Q&A with LLM:
+7️⃣  Q&A with LLM:
    # Ask a question about the content
    crwl https://example.com -q "What is the main topic discussed?"
@@ -277,12 +312,173 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
      - google/gemini-pro
    See full list of providers: https://docs.litellm.ai/docs/providers
 8️⃣ Profile Management:
    # Launch interactive profile manager
    crwl profiles
    # Create a profile and use it for crawling
    crwl profiles  # Create and set up your profile interactively
    crwl https://example.com -p my-profile-name  # Use profile for crawling
    # Example workflow for authenticated site
    # 1. First create a profile and log in to the site:
    crwl profiles  # Select "Create new profile" option
    # 2. Then use that profile to crawl authenticated content:
    crwl https://site-requiring-login.com/dashboard -p my-profile-name
 """
    click.echo(examples)
-@click.command(context_settings={"help_option_names": ["-h", "--help"]})
+def get_directory_size(path: str) -> int:
-@click.argument("url", required=False)
+    """Calculate the total size of a directory in bytes"""
-@click.option("--example", is_flag=True, help="Show usage examples")
+    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size
 def display_profiles_table(profiles: List[Dict[str, Any]]):
    """Render browser profiles as a rich table on the module-level console.

    Args:
        profiles: Profile records. Each record is read for the keys
            ``name``, ``path``, ``type`` and ``created``; ``created`` must
            support ``strftime``.

    When ``profiles`` is empty, a hint panel is printed instead of a table.
    """
    if not profiles:
        # Point at the command that actually exists: profiles are created
        # from the interactive manager started with `crwl profiles`; the
        # visible CLI does not register a 'create' command.
        console.print(
            Panel(
                "[yellow]No profiles found. Create one with 'crwl profiles'.[/yellow]",
                title="Browser Profiles",
                border_style="blue",
            )
        )
        return
    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")
    table.add_column("#", style="dim", width=4)
    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="green")
    table.add_column("Created", style="yellow")
    table.add_column("Browser", style="magenta")
    table.add_column("Size", style="blue", justify="right")
    for row_number, profile in enumerate(profiles, start=1):
        # Calculate folder size. A file can disappear or be unreadable while
        # the directory is being walked; one bad profile should not abort
        # the whole listing, so fall back to a placeholder.
        try:
            human_size = humanize.naturalsize(get_directory_size(profile["path"]))
        except OSError:
            human_size = "unknown"
        # Format creation date
        created = profile["created"].strftime("%Y-%m-%d %H:%M")
        table.add_row(
            str(row_number),
            profile["name"],
            profile["path"],
            created,
            profile["type"].capitalize(),
            human_size,
        )
    console.print(table)
 async def create_profile_interactive(profiler: BrowserProfiler):
    """Walk the user through creating a browser profile.

    Prints an introduction panel, prompts for a profile name (defaulting to
    ``profile_<unix timestamp>``), then awaits ``profiler.create_profile``
    and reports the outcome on the console. Any exception raised while
    creating or reporting is caught and printed rather than propagated.
    """
    intro_text = (
        "[bold cyan]Create Browser Profile[/bold cyan]\n"
        "This will open a browser window for you to set up your identity.\n"
        "Log in to sites, adjust settings, then press 'q' to save."
    )
    console.print(Panel(intro_text, border_style="cyan"))
    suggested_name = f"profile_{int(time.time())}"
    profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=suggested_name)
    for notice in (
        "[cyan]Creating profile...[/cyan]",
        "[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]",
    ):
        console.print(notice)
    # Create the profile; a falsy return value is treated as a failure.
    try:
        created_at = await profiler.create_profile(profile_name)
        if not created_at:
            console.print("[red]Failed to create profile.[/red]")
        else:
            console.print(f"[green]Profile successfully created at:[/green] {created_at}")
    except Exception as exc:
        console.print(f"[red]Error creating profile: {str(exc)}[/red]")
 def delete_profile_interactive(profiler: BrowserProfiler):
    """Let the user pick one existing profile and delete it after confirmation.

    Lists the profiles returned by ``profiler.list_profiles()``, asks for the
    1-based number of the profile to remove, confirms, and then calls
    ``profiler.delete_profile`` with the chosen profile's path. The result is
    reported on the console.
    """
    profiles = profiler.list_profiles()
    if not profiles:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return
    # Show the numbered table so the user can choose by row number.
    display_profiles_table(profiles)
    allowed_numbers = [str(number) for number, _ in enumerate(profiles, start=1)]
    answer = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=allowed_numbers,
        show_choices=False,
    )
    try:
        # Convert the 1-based answer into a list index.
        profile = profiles[int(answer) - 1]
        # Bail out quietly when the user declines the confirmation.
        if not Confirm.ask(f"[red]Are you sure you want to delete profile '{profile['name']}'?[/red]"):
            return
        if profiler.delete_profile(profile["path"]):
            console.print(f"[green]Profile '{profile['name']}' deleted successfully.[/green]")
        else:
            console.print(f"[red]Failed to delete profile '{profile['name']}'.[/red]")
    except (ValueError, IndexError):
        console.print("[red]Invalid selection.[/red]")
 async def manage_profiles():
    """Run the interactive profile manager loop.

    Repeatedly shows a four-entry menu (list, create, delete, exit), performs
    the selected action with a single ``BrowserProfiler`` instance, and
    returns once the user chooses to exit.
    """
    profiler = BrowserProfiler()
    menu = {
        "1": "List profiles",
        "2": "Create new profile",
        "3": "Delete profile",
        "4": "Exit",
    }
    # Per-entry highlight colour; anything not listed falls back to cyan.
    highlight = {"1": "green", "2": "yellow", "3": "red"}
    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
        for number, label in menu.items():
            tint = highlight.get(number, "cyan")
            console.print(f"[{tint}]{number}[/{tint}]. {label}")
        picked = Prompt.ask("Enter choice", choices=list(menu), default="1")
        if picked == "4":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break
        if picked == "1":
            display_profiles_table(profiler.list_profiles())
        elif picked == "2":
            await create_profile_interactive(profiler)
        elif picked == "3":
            delete_profile_interactive(profiler)
        # Add a separator between operations
        console.print("\n")
class _DefaultCommandGroup(click.Group):
    """Click group that routes unknown first tokens to the ``default`` command.

    A plain ``click.Group`` rejects ``crwl https://example.com`` with
    "No such command", because the URL is looked up as a sub-command name.
    This subclass keeps the documented short form working by handing such
    invocations to the sub-command registered under the name ``default``.
    """

    def resolve_command(self, ctx, args):
        """Resolve the sub-command for ``args``, falling back to ``default``.

        Tokens that name a registered command, and tokens that look like
        options, are resolved by the stock implementation. Any other first
        token is left in the argument list and passed to ``default`` so it
        can be consumed as that command's ``url`` argument.
        """
        if args:
            first_token = str(args[0])
            looks_like_option = first_token.startswith("-")
            if not looks_like_option and self.get_command(ctx, first_token) is None:
                fallback = self.get_command(ctx, "default")
                if fallback is not None:
                    # Do not drop the first token: it is the URL, not a command name.
                    return "default", fallback, list(args)
        return super().resolve_command(ctx, args)


@click.group(cls=_DefaultCommandGroup, context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    # NOTE: the docstring above is shown by Click as the group's help text;
    # keep its wording in sync with the user-facing documentation.
@cli.command("crawl")
@click.argument("url", required=True)
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@@ -291,26 +487,44 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
-@click.option("--bypass-cache", is_flag=True, default = True,  help="Bypass cache when crawling")
+@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
-def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, 
+@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
-        extraction_config: str, schema: str, browser: Dict, crawler: Dict,
+def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, 
-        output: str, bypass_cache: bool, question: str, verbose: bool):
+           extraction_config: str, schema: str, browser: Dict, crawler: Dict,
-    """Crawl4AI CLI - Web content extraction tool
+           output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
-
+    """Crawl a website and extract content
    Simple Usage:
-        crwl https://example.com
+        crwl crawl https://example.com
    """
-    Run with --example to see detailed usage examples."""
+    # Handle profile option
-
+    if profile:
-    if example:
+        profiler = BrowserProfiler()
-        show_examples()
+        profile_path = profiler.get_profile_path(profile)
        return
-    if not url:
+        if not profile_path:
-        raise click.UsageError("URL argument is required unless using --example")
+            profiles = profiler.list_profiles()
-    
+            
            if profiles:
                console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
                display_profiles_table(profiles)
            else:
                console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")
            return
        # Include the profile in browser config
        if not browser:
            browser = {}
        browser["user_data_dir"] = profile_path
        browser["use_managed_browser"] = True
        if verbose:
            console.print(f"[green]Using browser profile:[/green] {profile}")
    try:
        # Load base configurations
        browser_cfg = BrowserConfig.load(load_config_file(browser_config))
@@ -401,5 +615,82 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
    except Exception as e:
        raise click.ClickException(str(e))
@cli.command("examples")
 def examples_cmd():
    """Show usage examples"""
    # Thin wrapper that exposes show_examples() as the `examples` sub-command.
    show_examples()
@cli.command("profiles")
 def profiles_cmd():
    """Manage browser profiles interactively
    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    """
    # Run the interactive profile manager. manage_profiles is an async
    # function, so this synchronous command drives it with anyio.run and
    # blocks until the menu loop finishes.
    anyio.run(manage_profiles)
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(
    url: str,
    example: bool,
    browser_config: str,
    crawler_config: str,
    filter_config: str,
    extraction_config: str,
    schema: str,
    browser: Dict,
    crawler: Dict,
    output: str,
    bypass_cache: bool,
    question: str,
    verbose: bool,
    profile: str,
):
    """Crawl4AI CLI - Web content extraction tool
    Simple Usage:
        crwl https://example.com
    Run with --example to see detailed usage examples.
    Other commands:
        crwl profiles   - Manage browser profiles for identity-based crawling
        crwl crawl      - Crawl a website with advanced options
        crwl examples   - Show more usage examples
    """
    # --example short-circuits everything else, including a supplied URL.
    if example:
        show_examples()
        return
    ctx = click.get_current_context()
    # Without a URL there is nothing to crawl: print this command's help
    # text as normal output instead of raising a usage error.
    if not url:
        click.echo(ctx.get_help())
        return
    # Everything except `example` is passed through unchanged to the crawl command.
    forwarded = {
        "url": url,
        "browser_config": browser_config,
        "crawler_config": crawler_config,
        "filter_config": filter_config,
        "extraction_config": extraction_config,
        "schema": schema,
        "browser": browser,
        "crawler": crawler,
        "output": output,
        "bypass_cache": bypass_cache,
        "question": question,
        "verbose": verbose,
        "profile": profile,
    }
    ctx.invoke(crawl_cmd, **forwarded)
 # Script entry point: run the `cli` Click group when this module is executed directly.
 if __name__ == "__main__":
    cli()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,8 @@ dependencies = [
    "click>=8.1.7",
    "pyperclip>=1.8.2",
    "faust-cchardet>=2.1.19",
-    "aiohttp>=3.11.11"
+    "aiohttp>=3.11.11",
    "humanize>=4.10.0"
 ]
 classifiers = [
    "Development Status :: 4 - Beta",