Add: Cloud CLI module for profile management

New cloud module (crawl4ai/cloud/):
- crwl cloud auth - Authenticate with API key
- crwl cloud profiles upload - Upload local profile to cloud
- crwl cloud profiles list - List cloud profiles
- crwl cloud profiles delete - Delete cloud profile

Features:
- Stores credentials in ~/.crawl4ai/global.yml
- Auto-shrinks profiles before upload (configurable)
- Validates API key on auth
- Rich formatted output with tables and panels
2026-01-25 09:35:48 +00:00
parent 94e19a4c72
commit ef226f5787
3 changed files with 493 additions and 0 deletions
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -34,6 +34,7 @@ from crawl4ai import (
 )
 from crawl4ai.browser_profiler import ShrinkLevel, _format_size
 from crawl4ai.config import USER_SETTINGS
+from crawl4ai.cloud import cloud_cmd
 from litellm import completion
 from pathlib import Path

@@ -625,6 +626,9 @@ def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    pass

+# Add cloud command group
+cli.add_command(cloud_cmd)
+

@cli.group("browser")
 def browser_cmd():
--- a/crawl4ai/cloud/__init__.py
+++ b/crawl4ai/cloud/__init__.py
@@ -0,0 +1,16 @@
+"""
+Crawl4AI Cloud Module - Integration with Crawl4AI Cloud service.
+
+This module provides:
+- CLI commands for cloud profile management
+- API client for cloud operations (future)
+- Cloud configuration utilities
+"""
+
+# Re-export the CLI group and the config helpers from the sibling ``cli``
+# module so they can be imported directly from this package.
+from .cli import cloud_cmd, get_cloud_config, require_auth
+
+# Names that make up the package's public API.
+__all__ = [
+    "cloud_cmd",
+    "get_cloud_config",
+    "require_auth",
+]
--- a/crawl4ai/cloud/cli.py
+++ b/crawl4ai/cloud/cli.py
@@ -0,0 +1,473 @@
+"""
+Crawl4AI Cloud CLI - Commands for interacting with Crawl4AI Cloud service.
+
+Commands:
+  crwl cloud auth          - Authenticate with API key
+  crwl cloud profiles upload - Upload a profile to cloud
+  crwl cloud profiles list   - List cloud profiles
+  crwl cloud profiles delete - Delete a cloud profile
+"""
+
+import click
+import httpx
+import os
+import shutil
+import sys
+import tarfile
+import tempfile
+from pathlib import Path
+
+import yaml
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+from crawl4ai import BrowserProfiler
+from crawl4ai.browser_profiler import ShrinkLevel, _format_size
+
+# Shared Rich console used by the commands in this module for all output.
+console = Console()
+
+# Default cloud API URL; used whenever the global config has no
+# CLOUD_API_URL entry.
+DEFAULT_CLOUD_API_URL = "https://api.crawl4ai.com"
+
+
+def get_global_config() -> dict:
+    """Read the global settings file at ~/.crawl4ai/global.yml.
+
+    Returns:
+        The parsed YAML content, or an empty dict when the file does not
+        exist or parses to a falsy value (for example an empty file).
+    """
+    settings_path = Path.home().joinpath(".crawl4ai", "global.yml")
+    if settings_path.exists():
+        with open(settings_path) as handle:
+            parsed = yaml.safe_load(handle)
+        return parsed or {}
+    return {}
+
+
+def save_global_config(config: dict):
+    """Save global config to ~/.crawl4ai/global.yml.
+
+    The file may contain the cloud API key, so it is created with owner-only
+    permissions (0600) instead of whatever the process umask would allow.
+
+    Args:
+        config: Mapping to serialize as YAML. It replaces the previous
+            content of the file entirely.
+    """
+    config_dir = Path.home() / ".crawl4ai"
+    config_dir.mkdir(parents=True, exist_ok=True)
+    config_file = config_dir / "global.yml"
+
+    # The mode given to os.open only takes effect when the file is created.
+    fd = os.open(config_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as f:
+        # Tighten a pre-existing file before the secret is written to it.
+        # Best effort: some platforms/filesystems do not support POSIX modes,
+        # and failing to chmod must not prevent saving the config.
+        try:
+            os.chmod(config_file, 0o600)
+        except OSError:
+            pass
+        yaml.dump(config, f)
+
+
+def get_cloud_config() -> tuple[str, str]:
+    """Return the ``(api_key, api_url)`` pair stored in the global config.
+
+    The key is ``None`` when the config has no ``CLOUD_API_KEY`` entry; the
+    URL falls back to ``DEFAULT_CLOUD_API_URL`` when ``CLOUD_API_URL`` is
+    absent.
+    """
+    settings = get_global_config()
+    return (
+        settings.get("CLOUD_API_KEY"),
+        settings.get("CLOUD_API_URL", DEFAULT_CLOUD_API_URL),
+    )
+
+
+def require_auth() -> tuple[str, str]:
+    """Return the stored ``(api_key, api_url)`` or terminate the process.
+
+    When no API key is configured, a hint pointing at ``crwl cloud auth`` is
+    printed and the process exits with status 1.
+    """
+    api_key, api_url = get_cloud_config()
+    if api_key:
+        return api_key, api_url
+
+    # No key on record: tell the user how to fix it, then bail out.
+    console.print("[red]Not authenticated with Crawl4AI Cloud.[/red]")
+    console.print("\nRun [cyan]crwl cloud auth[/cyan] to authenticate.")
+    sys.exit(1)
+
+
+# ==================== Cloud Command Group ====================
+
+# Root ``cloud`` command group; crawl4ai/cli.py attaches it to the main CLI
+# with ``cli.add_command(cloud_cmd)``. The docstring doubles as the text click
+# shows for ``--help``, so treat it as user-facing copy.
+@click.group("cloud")
+def cloud_cmd():
+    """Crawl4AI Cloud commands - manage cloud profiles and authentication.
+
+    Use browser profiles for authenticated crawling in the cloud.
+
+    Getting started:
+      1. Get an API key at https://api.crawl4ai.com/dashboard
+      2. Run: crwl cloud auth
+      3. Create a local profile: crwl profiles
+      4. Upload to cloud: crwl cloud profiles upload my_profile
+    """
+    # The group itself does nothing; subcommands carry the behavior.
+    pass
+
+
+# ==================== Auth Commands ====================
+
+# ``crwl cloud auth``: show status, log out, or validate and store an API key.
+#
+# Flow: --status and --logout are handled first and return early. Otherwise
+# the key (from --api-key or an interactive hidden prompt) is validated with a
+# GET to "/v1/profiles" and, on HTTP 200, written to the global config
+# together with --api-url when one was given. Any failure exits with status 1.
+#
+# The docstring below is rendered by click as the --help text, so it is kept
+# as user-facing copy and implementation notes live in comments.
+@cloud_cmd.command("auth")
+@click.option("--api-key", "-k", help="API key (will prompt if not provided)")
+@click.option("--api-url", "-u", help=f"API URL (default: {DEFAULT_CLOUD_API_URL})")
+@click.option("--logout", is_flag=True, help="Remove saved credentials")
+@click.option("--status", is_flag=True, help="Show current auth status")
+def auth_cmd(api_key: str, api_url: str, logout: bool, status: bool):
+    """Authenticate with Crawl4AI Cloud.
+
+    Your API key is saved locally in ~/.crawl4ai/global.yml
+
+    To get an API key:
+      1. Go to https://api.crawl4ai.com/dashboard
+      2. Sign in or create an account
+      3. Navigate to API Keys section
+      4. Create a new key and copy it
+
+    Examples:
+      crwl cloud auth                    # Interactive authentication
+      crwl cloud auth --api-key sk_...   # Provide key directly
+      crwl cloud auth --status           # Check current status
+      crwl cloud auth --logout           # Remove saved credentials
+    """
+    config = get_global_config()
+
+    if status:
+        current_key = config.get("CLOUD_API_KEY")
+        current_url = config.get("CLOUD_API_URL", DEFAULT_CLOUD_API_URL)
+
+        if current_key:
+            # Mask the key for display; short keys are hidden entirely so the
+            # prefix/suffix slices cannot reveal most of the secret.
+            masked = current_key[:8] + "..." + current_key[-4:] if len(current_key) > 12 else "***"
+            console.print(Panel(
+                f"[green]Authenticated[/green]\n\n"
+                f"API Key: [cyan]{masked}[/cyan]\n"
+                f"API URL: [blue]{current_url}[/blue]",
+                title="Cloud Auth Status",
+                border_style="green"
+            ))
+        else:
+            console.print(Panel(
+                "[yellow]Not authenticated[/yellow]\n\n"
+                "Run [cyan]crwl cloud auth[/cyan] to authenticate.\n\n"
+                "Get your API key at:\n"
+                "[blue]https://api.crawl4ai.com/dashboard[/blue]",
+                title="Cloud Auth Status",
+                border_style="yellow"
+            ))
+        return
+
+    if logout:
+        # Only the key is removed; a saved CLOUD_API_URL is left in place.
+        if "CLOUD_API_KEY" in config:
+            del config["CLOUD_API_KEY"]
+            save_global_config(config)
+            console.print("[green]Logged out successfully.[/green]")
+        else:
+            console.print("[yellow]Not currently authenticated.[/yellow]")
+        return
+
+    # Interactive auth
+    if not api_key:
+        console.print(Panel(
+            "[cyan]Crawl4AI Cloud Authentication[/cyan]\n\n"
+            "To get your API key:\n"
+            "  1. Go to [blue]https://api.crawl4ai.com/dashboard[/blue]\n"
+            "  2. Sign in or create an account\n"
+            "  3. Navigate to API Keys section\n"
+            "  4. Create a new key and paste it below",
+            title="Setup",
+            border_style="cyan"
+        ))
+        api_key = click.prompt("\nEnter your API key", hide_input=True)
+
+    # Pasted keys often carry stray whitespace or a newline, which would make
+    # an otherwise valid key fail validation and then be stored verbatim.
+    api_key = (api_key or "").strip()
+
+    if not api_key:
+        console.print("[red]API key is required.[/red]")
+        sys.exit(1)
+
+    # Drop trailing slashes so the request path never contains "//", both for
+    # the validation request below and for later commands that read the saved
+    # URL from the config.
+    if api_url:
+        api_url = api_url.rstrip("/")
+
+    # Validate the key by making a test request
+    test_url = api_url or config.get("CLOUD_API_URL", DEFAULT_CLOUD_API_URL)
+
+    console.print("\n[dim]Validating API key...[/dim]")
+
+    try:
+        response = httpx.get(
+            f"{test_url}/v1/profiles",
+            headers={"X-API-Key": api_key},
+            timeout=10.0
+        )
+
+        if response.status_code == 401:
+            console.print("[red]Invalid API key.[/red]")
+            sys.exit(1)
+        elif response.status_code != 200:
+            console.print(f"[red]Error validating key: {response.status_code}[/red]")
+            sys.exit(1)
+
+    except httpx.RequestError as e:
+        console.print(f"[red]Connection error: {e}[/red]")
+        sys.exit(1)
+
+    # Save to config; the URL is only persisted when explicitly provided so an
+    # existing CLOUD_API_URL entry is not overwritten by the default.
+    config["CLOUD_API_KEY"] = api_key
+    if api_url:
+        config["CLOUD_API_URL"] = api_url
+    save_global_config(config)
+
+    console.print("[green]Authentication successful![/green]")
+    console.print("Credentials saved to [cyan]~/.crawl4ai/global.yml[/cyan]")
+
+# ==================== Profiles Command Group ====================
+
+# ``profiles`` subgroup nested under the ``cloud`` group; it only hosts the
+# upload/list/delete commands registered on it below. The docstring doubles as
+# the text click shows for ``--help``, so treat it as user-facing copy.
+@cloud_cmd.group("profiles")
+def profiles_cmd():
+    """Manage cloud browser profiles.
+
+    Upload local browser profiles to Crawl4AI Cloud for authenticated crawling.
+
+    Workflow:
+      1. Create a local profile: crwl profiles
+      2. Shrink it (optional): crwl shrink my_profile
+      3. Upload to cloud: crwl cloud profiles upload my_profile
+      4. Use in API: {"browser_config": {"profile_id": "..."}}
+    """
+    # The group itself does nothing; subcommands carry the behavior.
+    pass
+
+
+# ``crwl cloud profiles upload``: shrink, package and upload a local profile.
+#
+# Steps: (1) optionally shrink the local profile in place, (2) pack the
+# profile directory's contents into a tar.gz inside a fresh temp directory,
+# (3) POST it as multipart form data to "/v1/profiles", (4) print the new
+# profile ID and any scan warnings. The temp directory is always removed.
+# Every failure path prints a message and exits with status 1.
+#
+# The docstring below is rendered by click as the --help text, so it is kept
+# as user-facing copy and implementation notes live in comments.
+@profiles_cmd.command("upload")
+@click.argument("profile_name")
+@click.option("--name", "-n", help="Cloud profile name (defaults to local name)")
+@click.option("--level", "-l",
+              type=click.Choice(["light", "medium", "aggressive", "minimal"]),
+              default="aggressive",
+              help="Shrink level before upload (default: aggressive)")
+@click.option("--no-shrink", is_flag=True, help="Skip shrinking (upload as-is)")
+def upload_cmd(profile_name: str, name: str, level: str, no_shrink: bool):
+    """Upload a browser profile to Crawl4AI Cloud.
+
+    The profile will be shrunk to remove caches before uploading.
+    Use --no-shrink to upload the profile as-is.
+
+    Examples:
+      crwl cloud profiles upload my_profile
+      crwl cloud profiles upload my_profile --name github-auth
+      crwl cloud profiles upload my_profile --level minimal
+      crwl cloud profiles upload my_profile --no-shrink
+    """
+    api_key, api_url = require_auth()
+
+    # Find the profile
+    profiler = BrowserProfiler()
+    profile_path = profiler.get_profile_path(profile_name)
+
+    if not profile_path:
+        console.print(f"[red]Profile not found: {profile_name}[/red]")
+        console.print("\nAvailable profiles:")
+        for p in profiler.list_profiles():
+            console.print(f"  - {p['name']}")
+        sys.exit(1)
+
+    cloud_name = name or profile_name
+
+    console.print(f"\n[cyan]Uploading profile:[/cyan] {profile_name}")
+    console.print(f"[cyan]Cloud name:[/cyan] {cloud_name}")
+
+    # Step 1: Shrink (unless --no-shrink)
+    if not no_shrink:
+        console.print(f"\n[dim][1/4] Shrinking profile ({level})...[/dim]")
+        try:
+            result = profiler.shrink(profile_name, ShrinkLevel(level), dry_run=False)
+            console.print(f"      Freed: {_format_size(result['bytes_freed'])}")
+            if result.get("size_after"):
+                console.print(f"      Size: {_format_size(result['size_after'])}")
+        except Exception as e:
+            # Shrinking is an optimization only; a failure here is reported
+            # and the upload continues with the profile as it is.
+            console.print(f"[yellow]Warning: Could not shrink profile: {e}[/yellow]")
+    else:
+        console.print("\n[dim][1/4] Skipping shrink...[/dim]")
+
+    # Step 2: Package as tar.gz
+    console.print("[dim][2/4] Packaging profile...[/dim]")
+
+    temp_dir = Path(tempfile.mkdtemp(prefix="crawl4ai_upload_"))
+    # Name reported to the user and sent to the server.
+    archive_name = f"{cloud_name}.tar.gz"
+    # The local file uses a fixed name: cloud_name is user input, and a value
+    # containing a path separator (or an absolute path) would otherwise place
+    # the archive outside temp_dir or in a directory that does not exist.
+    tar_path = temp_dir / "profile.tar.gz"
+
+    try:
+        with tarfile.open(tar_path, "w:gz") as tar:
+            # Add profile contents (not the directory itself)
+            for item in Path(profile_path).iterdir():
+                tar.add(item, arcname=item.name)
+
+        size_bytes = tar_path.stat().st_size
+        console.print(f"      Created: {archive_name} ({_format_size(size_bytes)})")
+
+        # Step 3: Upload
+        console.print("[dim][3/4] Uploading to cloud...[/dim]")
+
+        with open(tar_path, "rb") as f:
+            response = httpx.post(
+                f"{api_url}/v1/profiles",
+                headers={"X-API-Key": api_key},
+                files={"file": (archive_name, f, "application/gzip")},
+                data={"name": cloud_name},
+                timeout=120.0
+            )
+
+        if response.status_code == 409:
+            console.print(f"[red]Profile '{cloud_name}' already exists in cloud.[/red]")
+            console.print("Use --name to specify a different name, or delete the existing profile first.")
+            sys.exit(1)
+        elif response.status_code == 400:
+            # The error body is expected to be JSON with a "detail" field, but
+            # fall back to the raw text rather than crash if it is not.
+            try:
+                error = response.json().get("detail", "Unknown error")
+            except (ValueError, AttributeError):
+                error = response.text or "Unknown error"
+            console.print(f"[red]Upload rejected: {error}[/red]")
+            sys.exit(1)
+        elif response.status_code != 200:
+            console.print(f"[red]Upload failed: {response.status_code}[/red]")
+            console.print(response.text)
+            sys.exit(1)
+
+        result = response.json()
+        profile_id = result["id"]
+
+        console.print("[dim][4/4] Done![/dim]")
+
+        # Success output
+        console.print(Panel(
+            f"[green]Profile uploaded successfully![/green]\n\n"
+            f"Profile ID: [cyan]{profile_id}[/cyan]\n"
+            f"Name: [blue]{cloud_name}[/blue]\n"
+            f"Size: {_format_size(size_bytes)}\n\n"
+            f"[dim]Use in API:[/dim]\n"
+            f'  {{"browser_config": {{"profile_id": "{profile_id}"}}}}',
+            title="Upload Complete",
+            border_style="green"
+        ))
+
+        if result.get("scan_warnings"):
+            console.print("\n[yellow]Scan warnings:[/yellow]")
+            for warning in result["scan_warnings"]:
+                console.print(f"  - {warning}")
+
+    except httpx.RequestError as e:
+        # Same handling as the list/delete commands: a network failure gets a
+        # one-line message instead of a traceback.
+        console.print(f"[red]Connection error: {e}[/red]")
+        sys.exit(1)
+
+    finally:
+        # Cleanup temp directory (also runs when sys.exit is raised above)
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+# ``crwl cloud profiles list``: fetch the account's profiles from
+# "/v1/profiles" and render them as a Rich table, or show a hint panel when
+# there are none. A non-200 response or a connection error exits with
+# status 1. The docstring is the click --help text.
+@profiles_cmd.command("list")
+def list_cmd():
+    """List all cloud profiles.
+
+    Shows all profiles uploaded to your Crawl4AI Cloud account.
+    """
+    api_key, api_url = require_auth()
+
+    console.print("\n[dim]Fetching profiles...[/dim]")
+
+    try:
+        resp = httpx.get(
+            f"{api_url}/v1/profiles",
+            headers={"X-API-Key": api_key},
+            timeout=30.0
+        )
+
+        if resp.status_code != 200:
+            console.print(f"[red]Error: {resp.status_code}[/red]")
+            console.print(resp.text)
+            sys.exit(1)
+
+        entries = resp.json().get("profiles", [])
+
+        # Nothing uploaded yet: point the user at the upload command.
+        if not entries:
+            empty_notice = Panel(
+                "[yellow]No cloud profiles found.[/yellow]\n\n"
+                "Upload a profile with:\n"
+                "  [cyan]crwl cloud profiles upload <profile_name>[/cyan]",
+                title="Cloud Profiles",
+                border_style="yellow"
+            )
+            console.print(empty_notice)
+            return
+
+        # Column layout: heading plus the keyword options for add_column.
+        listing = Table(title="Cloud Profiles")
+        for heading, options in (
+            ("Name", {"style": "cyan"}),
+            ("Profile ID", {"style": "dim"}),
+            ("Size", {"justify": "right"}),
+            ("Created", {"style": "green"}),
+            ("Last Used", {"style": "blue"}),
+        ):
+            listing.add_column(heading, **options)
+
+        for entry in entries:
+            raw_size = entry.get("size_bytes")
+            created_at = entry.get("created_at")
+            last_used_at = entry.get("last_used_at")
+
+            # Missing or empty fields get a placeholder; timestamps are cut to
+            # their first 10 characters.
+            size_cell = _format_size(raw_size) if raw_size else "-"
+            created_cell = created_at[:10] if created_at else "-"
+            last_used_cell = last_used_at[:10] if last_used_at else "Never"
+
+            listing.add_row(
+                entry["name"],
+                entry["id"][:8] + "...",
+                size_cell,
+                created_cell,
+                last_used_cell
+            )
+
+        console.print(listing)
+        console.print(f"\nTotal: {len(entries)} profile(s)")
+
+    except httpx.RequestError as err:
+        console.print(f"[red]Connection error: {err}[/red]")
+        sys.exit(1)
+
+
+def _match_cloud_profiles(profiles: list[dict], profile_name_or_id: str) -> list[dict]:
+    """Return the cloud profiles selected by a name, ID, or ID prefix.
+
+    Exact name/ID matches take precedence. ID-prefix matching is only used
+    when nothing matches exactly, and never for an empty query (every ID
+    starts with the empty string, which would select arbitrary profiles).
+
+    Args:
+        profiles: Profile dicts as returned by the API; each must have
+            ``"name"`` and ``"id"`` keys.
+        profile_name_or_id: The user's query.
+
+    Returns:
+        All matching profiles; the caller decides how to treat zero or
+        multiple matches.
+    """
+    exact = [p for p in profiles if profile_name_or_id in (p["name"], p["id"])]
+    if exact or not profile_name_or_id:
+        return exact
+    return [p for p in profiles if p["id"].startswith(profile_name_or_id)]
+
+
+# ``crwl cloud profiles delete``: resolve the argument against the account's
+# profile list, confirm (unless --yes), then DELETE "/v1/profiles/ID".
+# Deletion is refused when the argument does not identify exactly one profile.
+# Every failure path prints a message and exits with status 1.
+# The docstring is the click --help text.
+@profiles_cmd.command("delete")
+@click.argument("profile_name_or_id")
+@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
+def delete_cmd(profile_name_or_id: str, yes: bool):
+    """Delete a cloud profile.
+
+    You can specify either the profile name or ID.
+
+    Examples:
+      crwl cloud profiles delete my_profile
+      crwl cloud profiles delete abc123...
+      crwl cloud profiles delete my_profile --yes
+    """
+    api_key, api_url = require_auth()
+
+    # First, try to find the profile
+    console.print("\n[dim]Finding profile...[/dim]")
+
+    try:
+        # List profiles to find by name
+        response = httpx.get(
+            f"{api_url}/v1/profiles",
+            headers={"X-API-Key": api_key},
+            timeout=30.0
+        )
+
+        if response.status_code != 200:
+            console.print(f"[red]Error: {response.status_code}[/red]")
+            sys.exit(1)
+
+        profiles = response.json().get("profiles", [])
+
+        # Find matching profile
+        matches = _match_cloud_profiles(profiles, profile_name_or_id)
+
+        if not matches:
+            console.print(f"[red]Profile not found: {profile_name_or_id}[/red]")
+            console.print("\nAvailable profiles:")
+            for p in profiles:
+                console.print(f"  - {p['name']} ({p['id'][:8]}...)")
+            sys.exit(1)
+
+        if len(matches) > 1:
+            # Deleting is destructive, so never guess between candidates.
+            console.print(f"[red]'{profile_name_or_id}' matches more than one profile:[/red]")
+            for p in matches:
+                console.print(f"  - {p['name']} ({p['id']})")
+            console.print("\nSpecify the full profile ID to delete one of them.")
+            sys.exit(1)
+
+        profile = matches[0]
+
+        # Confirm deletion
+        console.print(f"\nProfile: [cyan]{profile['name']}[/cyan]")
+        console.print(f"ID: [dim]{profile['id']}[/dim]")
+
+        if not yes:
+            if not click.confirm("\nAre you sure you want to delete this profile?"):
+                console.print("[yellow]Cancelled.[/yellow]")
+                return
+
+        # Delete
+        console.print("\n[dim]Deleting...[/dim]")
+
+        response = httpx.delete(
+            f"{api_url}/v1/profiles/{profile['id']}",
+            headers={"X-API-Key": api_key},
+            timeout=30.0
+        )
+
+        if response.status_code == 404:
+            console.print("[red]Profile not found (may have been already deleted).[/red]")
+            sys.exit(1)
+        elif response.status_code != 200:
+            console.print(f"[red]Error: {response.status_code}[/red]")
+            console.print(response.text)
+            sys.exit(1)
+
+        console.print(f"[green]Profile '{profile['name']}' deleted successfully.[/green]")
+
+    except httpx.RequestError as e:
+        console.print(f"[red]Connection error: {e}[/red]")
+        sys.exit(1)