feat(cli): add browser profile management functionality

Adds new interactive browser profile management system that allows users to:
- Create and manage browser profiles for authenticated crawling
- List existing profiles with detailed information
- Delete unused profiles
- Use profiles during crawling with the new -p/--profile flag

Also restructures CLI to use Click groups and adds humanize dependency for better size formatting.
This commit is contained in:
UncleCode
2025-03-02 20:54:45 +08:00
parent cba4a466e5
commit 95175cb394
2 changed files with 316 additions and 24 deletions

View File

@@ -1,9 +1,20 @@
import click
import os
from typing import Dict, Any, Optional
import time
import datetime
import sys
import shutil
import humanize
from typing import Dict, Any, Optional, List
import json
import yaml
import anyio
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich.style import Style
from crawl4ai import (
CacheMode,
AsyncWebCrawler,
@@ -14,13 +25,17 @@ from crawl4ai import (
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
BM25ContentFilter,
PruningContentFilter
PruningContentFilter,
BrowserProfiler
)
from litellm import completion
from pathlib import Path
from crawl4ai.async_configs import LlmConfig
# Initialize rich console
console = Console()
def get_global_config() -> dict:
config_dir = Path.home() / ".crawl4ai"
config_file = config_dir / "global.yml"
@@ -172,7 +187,21 @@ def show_examples():
# Crawler settings
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
4Sample Config Files:
4Profile Management for Identity-Based Crawling:
# Launch interactive profile manager
crwl profiles
# Create, list, and delete browser profiles for identity-based crawling
# Use a profile for crawling (keeps you logged in)
crwl https://example.com -p my-profile-name
# Example: Crawl a site that requires login
# 1. First create a profile and log in:
crwl profiles
# 2. Then use that profile to crawl the authenticated site:
crwl https://site-requiring-login.com/dashboard -p my-profile-name
5⃣ Sample Config Files:
browser.yml:
headless: true
@@ -230,7 +259,7 @@ llm_schema.json:
}
}
5️⃣ Advanced Usage:
6️⃣ Advanced Usage:
# Combine configs with direct parameters
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
@@ -248,9 +277,15 @@ llm_schema.json:
-f filter_bm25.yml \\
-o markdown-fit
# Authenticated crawling with profile
crwl https://login-required-site.com \\
-p my-authenticated-profile \\
-c "css_selector=.dashboard-content" \\
-o markdown
For more documentation visit: https://github.com/unclecode/crawl4ai
6️⃣ Q&A with LLM:
7️⃣ Q&A with LLM:
# Ask a question about the content
crwl https://example.com -q "What is the main topic discussed?"
@@ -277,12 +312,173 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
- google/gemini-pro
See full list of providers: https://docs.litellm.ai/docs/providers
8⃣ Profile Management:
# Launch interactive profile manager
crwl profiles
# Create a profile and use it for crawling
crwl profiles # Create and set up your profile interactively
crwl https://example.com -p my-profile-name # Use profile for crawling
# Example workflow for authenticated site
# 1. First create a profile and log in to the site:
crwl profiles # Select "Create new profile" option
# 2. Then use that profile to crawl authenticated content:
crwl https://site-requiring-login.com/dashboard -p my-profile-name
"""
click.echo(examples)
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
def get_directory_size(path: str) -> int:
    """Calculate the total size of a directory tree in bytes.

    Symlinks are skipped so linked targets are not double-counted and
    broken links do not raise.

    Args:
        path: Root directory to measure.

    Returns:
        Sum of the sizes of all regular files under ``path`` (0 for an
        empty or non-existent directory; ``os.walk`` yields nothing for
        a missing path).
    """
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                try:
                    total_size += os.path.getsize(fp)
                except OSError:
                    # Profile directories are mutated by live browser
                    # processes; a file can vanish between walk() and
                    # getsize(). Skip it rather than crash the listing.
                    continue
    return total_size
def display_profiles_table(profiles: List[Dict[str, Any]]):
    """Render the known browser profiles as a rich table on the console.

    Each profile dict is expected to carry ``name``, ``path``, ``created``
    (a datetime) and ``type`` keys, as produced by BrowserProfiler.list_profiles.
    """
    if not profiles:
        # Nothing to show — point the user at profile creation instead.
        console.print(Panel("[yellow]No profiles found. Create one with the 'create' command.[/yellow]",
                            title="Browser Profiles", border_style="blue"))
        return

    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")

    # (header, style, extra keyword args) for each column, in display order.
    column_specs = [
        ("#", "dim", {"width": 4}),
        ("Name", "cyan", {"no_wrap": True}),
        ("Path", "green", {}),
        ("Created", "yellow", {}),
        ("Browser", "magenta", {}),
        ("Size", "blue", {"justify": "right"}),
    ]
    for header, style, extra in column_specs:
        table.add_column(header, style=style, **extra)

    for row_number, entry in enumerate(profiles, start=1):
        # On-disk footprint of the profile directory, shown human-readable.
        folder_bytes = get_directory_size(entry["path"])
        # Creation timestamp formatted for display.
        created_at = entry["created"].strftime("%Y-%m-%d %H:%M")
        table.add_row(
            str(row_number),
            entry["name"],
            entry["path"],
            created_at,
            entry["type"].capitalize(),
            humanize.naturalsize(folder_bytes),
        )

    console.print(table)
async def create_profile_interactive(profiler: BrowserProfiler):
    """Walk the user through creating a new browser profile.

    Opens a managed browser window via the profiler; the user logs in to
    sites there and presses 'q' in the terminal to persist the profile.
    """
    intro = ("[bold cyan]Create Browser Profile[/bold cyan]\n"
             "This will open a browser window for you to set up your identity.\n"
             "Log in to sites, adjust settings, then press 'q' to save.")
    console.print(Panel(intro, border_style="cyan"))

    # Timestamped default so repeated runs never suggest a colliding name.
    default_name = f"profile_{int(time.time())}"
    chosen_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=default_name)

    console.print("[cyan]Creating profile...[/cyan]")
    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")

    try:
        saved_path = await profiler.create_profile(chosen_name)
        if not saved_path:
            console.print("[red]Failed to create profile.[/red]")
        else:
            console.print(f"[green]Profile successfully created at:[/green] {saved_path}")
    except Exception as e:
        # Surface the failure but keep the interactive session alive.
        console.print(f"[red]Error creating profile: {str(e)}[/red]")
def delete_profile_interactive(profiler: BrowserProfiler):
    """Let the user pick a profile from a table and confirm its deletion."""
    available = profiler.list_profiles()
    if not available:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return

    # Show the numbered table so the user can refer to a row by index.
    display_profiles_table(available)

    valid_numbers = [str(n) for n in range(1, len(available) + 1)]
    selection = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=valid_numbers,
        show_choices=False
    )

    try:
        target = available[int(selection) - 1]
        # Require explicit confirmation before destroying profile data.
        if Confirm.ask(f"[red]Are you sure you want to delete profile '{target['name']}'?[/red]"):
            if profiler.delete_profile(target["path"]):
                console.print(f"[green]Profile '{target['name']}' deleted successfully.[/green]")
            else:
                console.print(f"[red]Failed to delete profile '{target['name']}'.[/red]")
    except (ValueError, IndexError):
        console.print("[red]Invalid selection.[/red]")
async def manage_profiles():
    """Run the interactive browser-profile management menu until the user exits."""
    profiler = BrowserProfiler()
    menu = {
        "1": "List profiles",
        "2": "Create new profile",
        "3": "Delete profile",
        "4": "Exit",
    }
    # Per-option colors used when rendering the menu.
    colors = {"1": "green", "2": "yellow", "3": "red", "4": "cyan"}

    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
        for key, label in menu.items():
            color = colors[key]
            console.print(f"[{color}]{key}[/{color}]. {label}")

        choice = Prompt.ask("Enter choice", choices=list(menu.keys()), default="1")

        # Exit is handled first so the separator below is not printed on the way out.
        if choice == "4":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break
        if choice == "1":
            display_profiles_table(profiler.list_profiles())
        elif choice == "2":
            await create_profile_interactive(profiler)
        elif choice == "3":
            delete_profile_interactive(profiler)

        # Blank line separates successive operations visually.
        console.print("\n")
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    # Root Click group: subcommands register themselves via @cli.command(...),
    # so there is nothing to do at group level.
    pass
@cli.command("crawl")
@click.argument("url", required=True)
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@@ -291,26 +487,44 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool):
"""Crawl4AI CLI - Web content extraction tool
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
"""Crawl a website and extract content
Simple Usage:
crwl https://example.com
crwl crawl https://example.com
"""
Run with --example to see detailed usage examples."""
if example:
show_examples()
return
# Handle profile option
if profile:
profiler = BrowserProfiler()
profile_path = profiler.get_profile_path(profile)
if not url:
raise click.UsageError("URL argument is required unless using --example")
if not profile_path:
profiles = profiler.list_profiles()
if profiles:
console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
display_profiles_table(profiles)
else:
console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")
return
# Include the profile in browser config
if not browser:
browser = {}
browser["user_data_dir"] = profile_path
browser["use_managed_browser"] = True
if verbose:
console.print(f"[green]Using browser profile:[/green] {profile}")
try:
# Load base configurations
browser_cfg = BrowserConfig.load(load_config_file(browser_config))
@@ -401,5 +615,82 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
except Exception as e:
raise click.ClickException(str(e))
@cli.command("examples")
def examples_cmd():
    """Show usage examples"""
    # Thin wrapper: reuses the same examples text shown by the --example flag.
    show_examples()
@cli.command("profiles")
def profiles_cmd():
    """Manage browser profiles interactively

    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    """
    # manage_profiles is async; anyio.run drives it on a fresh event loop
    # so this synchronous Click command can block until the menu exits.
    anyio.run(manage_profiles)
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
          extraction_config: str, schema: str, browser: Dict, crawler: Dict,
          output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
        crwl https://example.com

    Run with --example to see detailed usage examples.

    Other commands:
        crwl profiles - Manage browser profiles for identity-based crawling
        crwl crawl - Crawl a website with advanced options
        crwl examples - Show more usage examples

    """
    # Legacy --example flag: print the examples text and exit.
    if example:
        show_examples()
        return

    if not url:
        # Show help without error message
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        return

    # Forward to crawl command
    # This command mirrors crawl_cmd's options (minus --example) so that the
    # historical `crwl <url>` invocation keeps working; all real work happens
    # in crawl_cmd, invoked with every option passed through unchanged.
    ctx = click.get_current_context()
    ctx.invoke(
        crawl_cmd,
        url=url,
        browser_config=browser_config,
        crawler_config=crawler_config,
        filter_config=filter_config,
        extraction_config=extraction_config,
        schema=schema,
        browser=browser,
        crawler=crawler,
        output=output,
        bypass_cache=bypass_cache,
        question=question,
        verbose=verbose,
        profile=profile
    )
if __name__ == "__main__":
    # Entry point when the module is executed directly (rather than through
    # the installed `crwl` console script).
    cli()

View File

@@ -41,7 +41,8 @@ dependencies = [
"click>=8.1.7",
"pyperclip>=1.8.2",
"faust-cchardet>=2.1.19",
"aiohttp>=3.11.11"
"aiohttp>=3.11.11",
"humanize>=4.10.0"
]
classifiers = [
"Development Status :: 4 - Beta",