From 95175cb394c0814797b3806aaf41fe9c68ece1c8 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 2 Mar 2025 20:54:45 +0800 Subject: [PATCH] feat(cli): add browser profile management functionality Adds new interactive browser profile management system that allows users to: - Create and manage browser profiles for authenticated crawling - List existing profiles with detailed information - Delete unused profiles - Use profiles during crawling with the new -p/--profile flag Also restructures CLI to use Click groups and adds humanize dependency for better size formatting. --- crawl4ai/cli.py | 337 ++++++++++++++++++++++++++++++++++++++++++++---- pyproject.toml | 3 +- 2 files changed, 316 insertions(+), 24 deletions(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index aabf6c0c..32fb67f7 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1,9 +1,20 @@ import click import os -from typing import Dict, Any, Optional +import time +import datetime +import sys +import shutil +import humanize +from typing import Dict, Any, Optional, List import json import yaml import anyio +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.prompt import Prompt, Confirm +from rich.style import Style + from crawl4ai import ( CacheMode, AsyncWebCrawler, @@ -14,13 +25,17 @@ from crawl4ai import ( JsonCssExtractionStrategy, JsonXPathExtractionStrategy, BM25ContentFilter, - PruningContentFilter + PruningContentFilter, + BrowserProfiler ) from litellm import completion from pathlib import Path from crawl4ai.async_configs import LlmConfig +# Initialize rich console +console = Console() + def get_global_config() -> dict: config_dir = Path.home() / ".crawl4ai" config_file = config_dir / "global.yml" @@ -172,7 +187,21 @@ def show_examples(): # Crawler settings crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true" -4️⃣ Sample Config Files: +4️⃣ Profile Management for Identity-Based Crawling: + # Launch 
interactive profile manager + crwl profiles + + # Create, list, and delete browser profiles for identity-based crawling + # Use a profile for crawling (keeps you logged in) + crwl https://example.com -p my-profile-name + + # Example: Crawl a site that requires login + # 1. First create a profile and log in: + crwl profiles + # 2. Then use that profile to crawl the authenticated site: + crwl https://site-requiring-login.com/dashboard -p my-profile-name + +5️⃣ Sample Config Files: browser.yml: headless: true @@ -230,7 +259,7 @@ llm_schema.json: } } -5️⃣ Advanced Usage: +6️⃣ Advanced Usage: # Combine configs with direct parameters crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920" @@ -248,9 +277,15 @@ llm_schema.json: -f filter_bm25.yml \\ -o markdown-fit + # Authenticated crawling with profile + crwl https://login-required-site.com \\ + -p my-authenticated-profile \\ + -c "css_selector=.dashboard-content" \\ + -o markdown + For more documentation visit: https://github.com/unclecode/crawl4ai -6️⃣ Q&A with LLM: +7️⃣ Q&A with LLM: # Ask a question about the content crwl https://example.com -q "What is the main topic discussed?" @@ -277,12 +312,173 @@ For more documentation visit: https://github.com/unclecode/crawl4ai - google/gemini-pro See full list of providers: https://docs.litellm.ai/docs/providers + +8️⃣ Profile Management: + # Launch interactive profile manager + crwl profiles + + # Create a profile and use it for crawling + crwl profiles # Create and set up your profile interactively + crwl https://example.com -p my-profile-name # Use profile for crawling + + # Example workflow for authenticated site + # 1. First create a profile and log in to the site: + crwl profiles # Select "Create new profile" option + # 2. 
def get_directory_size(path: str) -> int:
    """Return the total size in bytes of the regular files under ``path``.

    The tree is traversed with ``os.walk``. Symbolic links are skipped so
    that link targets are not counted. Files that cannot be inspected are
    ignored instead of aborting the whole scan. A missing ``path`` yields
    ``0`` because ``os.walk`` ignores directory listing errors by default.

    Args:
        path: Directory to measure.

    Returns:
        Sum of the sizes of all non-symlink files found, in bytes.
    """
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                if not os.path.islink(file_path):
                    total_size += os.path.getsize(file_path)
            except OSError:
                # The file may vanish or be unreadable between the directory
                # listing and the size lookup; one such file must not make
                # the whole size calculation fail.
                continue
    return total_size


def display_profiles_table(profiles: List[Dict[str, Any]]) -> None:
    """Print the given browser profiles as a rich table.

    When ``profiles`` is empty a hint panel is printed instead of a table.

    Args:
        profiles: Profile records. Each record is read through the keys
            ``name``, ``path``, ``type`` (a string) and ``created`` (an
            object providing ``strftime``).
    """
    if not profiles:
        # Profiles are created from the interactive manager started by
        # `crwl profiles`; there is no separate 'create' command.
        console.print(Panel("[yellow]No profiles found. Create one with 'crwl profiles'.[/yellow]",
                            title="Browser Profiles", border_style="blue"))
        return

    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")
    table.add_column("#", style="dim", width=4)
    table.add_column("Name", style="cyan", no_wrap=True)
    table.add_column("Path", style="green")
    table.add_column("Created", style="yellow")
    table.add_column("Browser", style="magenta")
    table.add_column("Size", style="blue", justify="right")

    # Rows are numbered from 1 so they match the numbers the user types
    # when selecting a profile.
    for number, profile in enumerate(profiles, start=1):
        size_in_bytes = get_directory_size(profile["path"])
        table.add_row(
            str(number),
            profile["name"],
            profile["path"],
            profile["created"].strftime("%Y-%m-%d %H:%M"),
            profile["type"].capitalize(),
            humanize.naturalsize(size_in_bytes),
        )

    console.print(table)
async def create_profile_interactive(profiler: BrowserProfiler):
    """Interactively create a new browser profile.

    Prompts for a profile name (defaulting to ``profile_<unix timestamp>``),
    awaits ``profiler.create_profile`` and prints the outcome. Exceptions
    raised while creating the profile are reported on the console rather
    than propagated.

    Args:
        profiler: Profile manager whose ``create_profile`` coroutine is used.
    """
    # Local import: escape() stops externally supplied text (paths, error
    # messages) from being interpreted as rich console markup.
    from rich.markup import escape

    console.print(Panel("[bold cyan]Create Browser Profile[/bold cyan]\n"
                        "This will open a browser window for you to set up your identity.\n"
                        "Log in to sites, adjust settings, then press 'q' to save.",
                        border_style="cyan"))

    profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=f"profile_{int(time.time())}")

    console.print("[cyan]Creating profile...[/cyan]")
    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")

    try:
        profile_path = await profiler.create_profile(profile_name)
    except Exception as exc:
        console.print(f"[red]Error creating profile: {escape(str(exc))}[/red]")
        return

    if profile_path:
        console.print(f"[green]Profile successfully created at:[/green] {escape(str(profile_path))}")
    else:
        console.print("[red]Failed to create profile.[/red]")


def delete_profile_interactive(profiler: BrowserProfiler):
    """Interactively pick one profile and delete it after confirmation.

    Lists the profiles returned by ``profiler.list_profiles()``, asks for
    the number of the profile to delete, asks for confirmation and then
    calls ``profiler.delete_profile`` with the profile's path.

    Args:
        profiler: Profile manager providing ``list_profiles`` and
            ``delete_profile``.
    """
    # Local import: escape() stops profile names containing square brackets
    # from being interpreted as rich console markup.
    from rich.markup import escape

    profiles = profiler.list_profiles()
    if not profiles:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return

    display_profiles_table(profiles)

    selection = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=[str(number) for number in range(1, len(profiles) + 1)],
        show_choices=False,
    )

    # Only the parsing of the selection is guarded, so that errors raised by
    # the deletion itself are never misreported as an invalid selection.
    try:
        index = int(selection) - 1
    except ValueError:
        index = -1

    # Explicit bounds check: a negative index would otherwise silently
    # select a profile counted from the end of the list.
    if not 0 <= index < len(profiles):
        console.print("[red]Invalid selection.[/red]")
        return

    profile = profiles[index]
    shown_name = escape(str(profile["name"]))

    if not Confirm.ask(f"[red]Are you sure you want to delete profile '{shown_name}'?[/red]"):
        return

    if profiler.delete_profile(profile["path"]):
        console.print(f"[green]Profile '{shown_name}' deleted successfully.[/green]")
    else:
        console.print(f"[red]Failed to delete profile '{shown_name}'.[/red]")
async def manage_profiles():
    """Run the interactive browser profile menu until the user exits.

    Each pass prints the menu, reads a choice and dispatches to listing,
    creating or deleting profiles. Choosing "4" leaves the loop.
    """
    profiler = BrowserProfiler()

    # Menu key -> (label, colour of the key). Insertion order is the
    # display order.
    menu = {
        "1": ("List profiles", "green"),
        "2": ("Create new profile", "yellow"),
        "3": ("Delete profile", "red"),
        "4": ("Exit", "cyan"),
    }

    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))

        for key, (label, color) in menu.items():
            console.print(f"[{color}]{key}[/{color}]. {label}")

        choice = Prompt.ask("Enter choice", choices=list(menu), default="1")

        if choice == "1":
            display_profiles_table(profiler.list_profiles())
        elif choice == "2":
            await create_profile_interactive(profiler)
        elif choice == "3":
            delete_profile_interactive(profiler)
        elif choice == "4":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break

        # Add a separator between operations
        console.print("\n")


class _DefaultCommandGroup(click.Group):
    """Click group that falls back to the ``default`` subcommand.

    A plain ``click.Group`` treats the first argument as a subcommand name,
    so ``crwl https://example.com`` or ``crwl --example`` would be rejected
    instead of reaching the ``default`` command. This group prepends the
    default command name whenever the first argument is neither a
    registered subcommand nor one of the group's help options.
    """

    default_command_name = "default"

    def parse_args(self, ctx: click.Context, args: List[str]) -> List[str]:
        """Insert the default command name when no subcommand was given."""
        if args and self._needs_default(ctx, args[0]):
            args = [self.default_command_name, *args]
        return super().parse_args(ctx, args)

    def _needs_default(self, ctx: click.Context, first_arg: str) -> bool:
        """Tell whether ``first_arg`` should be handed to the default command."""
        if first_arg in ctx.help_option_names:
            # Keep `crwl -h` / `crwl --help` showing the group help.
            return False
        if self.get_command(ctx, first_arg) is not None:
            return False
        # Only fall back when a default command is actually registered.
        return self.get_command(ctx, self.default_command_name) is not None


@click.group(cls=_DefaultCommandGroup, context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    pass
"-q", help="Ask a question about the crawled content") @click.option("--verbose", "-v", is_flag=True) -def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, - extraction_config: str, schema: str, browser: Dict, crawler: Dict, - output: str, bypass_cache: bool, question: str, verbose: bool): - """Crawl4AI CLI - Web content extraction tool - +@click.option("--profile", "-p", help="Use a specific browser profile (by name)") +def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, + extraction_config: str, schema: str, browser: Dict, crawler: Dict, + output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): + """Crawl a website and extract content + Simple Usage: - crwl https://example.com + crwl crawl https://example.com + """ - Run with --example to see detailed usage examples.""" - - if example: - show_examples() - return + # Handle profile option + if profile: + profiler = BrowserProfiler() + profile_path = profiler.get_profile_path(profile) - if not url: - raise click.UsageError("URL argument is required unless using --example") - + if not profile_path: + profiles = profiler.list_profiles() + + if profiles: + console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]") + display_profiles_table(profiles) + else: + console.print("[red]No profiles found. 
@cli.command("examples")
def examples_cmd():
    """Show usage examples"""
    # Thin subcommand wrapper around the shared examples printer.
    show_examples()


@cli.command("profiles")
def profiles_cmd():
    """Manage browser profiles interactively

    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    """
    # manage_profiles is a coroutine function; anyio.run drives it to completion.
    anyio.run(manage_profiles)


# NOTE(review): `--bypass-cache` is declared with is_flag=True and default=True,
# so it looks like the flag cannot switch cache bypassing off - confirm intent.
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
            extraction_config: str, schema: str, browser: Dict, crawler: Dict,
            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
        crwl https://example.com

    Run with --example to see detailed usage examples.

    Other commands:
        crwl profiles - Manage browser profiles for identity-based crawling
        crwl crawl - Crawl a website with advanced options
        crwl examples - Show more usage examples
    """
    # The docstring above is the command's help text; keep it user-facing.
    current_ctx = click.get_current_context()

    if example:
        show_examples()
    elif not url:
        # Show help without error message
        click.echo(current_ctx.get_help())
    else:
        # Every option except `example` is handed on unchanged to the crawl
        # command, which does the actual work.
        forwarded_params = {
            "url": url,
            "browser_config": browser_config,
            "crawler_config": crawler_config,
            "filter_config": filter_config,
            "extraction_config": extraction_config,
            "schema": schema,
            "browser": browser,
            "crawler": crawler,
            "output": output,
            "bypass_cache": bypass_cache,
            "question": question,
            "verbose": verbose,
            "profile": profile,
        }
        current_ctx.invoke(crawl_cmd, **forwarded_params)


if __name__ == "__main__":
    cli()