feat(cli): add browser profile management functionality

Adds new interactive browser profile management system that allows users to:
- Create and manage browser profiles for authenticated crawling
- List existing profiles with detailed information
- Delete unused profiles
- Use profiles during crawling with the new -p/--profile flag

Also restructures CLI to use Click groups and adds humanize dependency for better size formatting.
This commit is contained in:
UncleCode
2025-03-02 20:54:45 +08:00
parent cba4a466e5
commit 95175cb394
2 changed files with 316 additions and 24 deletions

View File

@@ -1,9 +1,20 @@
import click
import os
from typing import Dict, Any, Optional
import time
import datetime
import sys
import shutil
import humanize
from typing import Dict, Any, Optional, List
import json
import yaml
import anyio
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich.style import Style
from crawl4ai import (
CacheMode,
AsyncWebCrawler,
@@ -14,13 +25,17 @@ from crawl4ai import (
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
BM25ContentFilter,
PruningContentFilter
PruningContentFilter,
BrowserProfiler
)
from litellm import completion
from pathlib import Path
from crawl4ai.async_configs import LlmConfig
# Initialize rich console
console = Console()
def get_global_config() -> dict:
config_dir = Path.home() / ".crawl4ai"
config_file = config_dir / "global.yml"
@@ -172,7 +187,21 @@ def show_examples():
# Crawler settings
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
4Sample Config Files:
4Profile Management for Identity-Based Crawling:
# Launch interactive profile manager
crwl profiles
# Create, list, and delete browser profiles for identity-based crawling
# Use a profile for crawling (keeps you logged in)
crwl https://example.com -p my-profile-name
# Example: Crawl a site that requires login
# 1. First create a profile and log in:
crwl profiles
# 2. Then use that profile to crawl the authenticated site:
crwl https://site-requiring-login.com/dashboard -p my-profile-name
5⃣ Sample Config Files:
browser.yml:
headless: true
@@ -230,7 +259,7 @@ llm_schema.json:
}
}
5️⃣ Advanced Usage:
6️⃣ Advanced Usage:
# Combine configs with direct parameters
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
@@ -248,9 +277,15 @@ llm_schema.json:
-f filter_bm25.yml \\
-o markdown-fit
# Authenticated crawling with profile
crwl https://login-required-site.com \\
-p my-authenticated-profile \\
-c "css_selector=.dashboard-content" \\
-o markdown
For more documentation visit: https://github.com/unclecode/crawl4ai
6️⃣ Q&A with LLM:
7️⃣ Q&A with LLM:
# Ask a question about the content
crwl https://example.com -q "What is the main topic discussed?"
@@ -277,12 +312,173 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
- google/gemini-pro
See full list of providers: https://docs.litellm.ai/docs/providers
8⃣ Profile Management:
# Launch interactive profile manager
crwl profiles
# Create a profile and use it for crawling
crwl profiles # Create and set up your profile interactively
crwl https://example.com -p my-profile-name # Use profile for crawling
# Example workflow for authenticated site
# 1. First create a profile and log in to the site:
crwl profiles # Select "Create new profile" option
# 2. Then use that profile to crawl authenticated content:
crwl https://site-requiring-login.com/dashboard -p my-profile-name
"""
click.echo(examples)
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
def get_directory_size(path: str) -> int:
    """Calculate the total size of a directory tree in bytes.

    Symlinks are skipped so linked targets are not double-counted and
    broken links do not raise.

    Args:
        path: Root directory to measure.

    Returns:
        Sum of the sizes of all regular files under ``path`` (0 for an
        empty or non-existent directory; ``os.walk`` yields nothing for
        a missing path).
    """
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                try:
                    total_size += os.path.getsize(fp)
                except OSError:
                    # Profile directories are mutated by live browser
                    # processes; a file can vanish between walk() and
                    # getsize(). Skip it rather than crash the listing.
                    continue
    return total_size
def display_profiles_table(profiles: List[Dict[str, Any]]):
    """Render the known browser profiles as a rich table on the console.

    Each profile dict is expected to carry ``name``, ``path``, ``created``
    (a datetime) and ``type`` keys, as produced by BrowserProfiler.list_profiles.
    """
    if not profiles:
        # Nothing to show — point the user at profile creation instead.
        console.print(Panel("[yellow]No profiles found. Create one with the 'create' command.[/yellow]",
                            title="Browser Profiles", border_style="blue"))
        return

    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")

    # (header, style, extra keyword args) for each column, in display order.
    column_specs = [
        ("#", "dim", {"width": 4}),
        ("Name", "cyan", {"no_wrap": True}),
        ("Path", "green", {}),
        ("Created", "yellow", {}),
        ("Browser", "magenta", {}),
        ("Size", "blue", {"justify": "right"}),
    ]
    for header, style, extra in column_specs:
        table.add_column(header, style=style, **extra)

    for row_number, entry in enumerate(profiles, start=1):
        # On-disk footprint of the profile directory, shown human-readable.
        folder_bytes = get_directory_size(entry["path"])
        # Creation timestamp formatted for display.
        created_at = entry["created"].strftime("%Y-%m-%d %H:%M")
        table.add_row(
            str(row_number),
            entry["name"],
            entry["path"],
            created_at,
            entry["type"].capitalize(),
            humanize.naturalsize(folder_bytes),
        )

    console.print(table)
async def create_profile_interactive(profiler: BrowserProfiler):
    """Walk the user through creating a new browser profile.

    Opens a managed browser window via the profiler; the user logs in to
    sites there and presses 'q' in the terminal to persist the profile.
    """
    intro = ("[bold cyan]Create Browser Profile[/bold cyan]\n"
             "This will open a browser window for you to set up your identity.\n"
             "Log in to sites, adjust settings, then press 'q' to save.")
    console.print(Panel(intro, border_style="cyan"))

    # Timestamped default so repeated runs never suggest a colliding name.
    default_name = f"profile_{int(time.time())}"
    chosen_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=default_name)

    console.print("[cyan]Creating profile...[/cyan]")
    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")

    try:
        saved_path = await profiler.create_profile(chosen_name)
        if not saved_path:
            console.print("[red]Failed to create profile.[/red]")
        else:
            console.print(f"[green]Profile successfully created at:[/green] {saved_path}")
    except Exception as e:
        # Surface the failure but keep the interactive session alive.
        console.print(f"[red]Error creating profile: {str(e)}[/red]")
def delete_profile_interactive(profiler: BrowserProfiler):
    """Let the user pick a profile from a table and confirm its deletion."""
    available = profiler.list_profiles()
    if not available:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return

    # Show the numbered table so the user can refer to a row by index.
    display_profiles_table(available)

    valid_numbers = [str(n) for n in range(1, len(available) + 1)]
    selection = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=valid_numbers,
        show_choices=False
    )

    try:
        target = available[int(selection) - 1]
        # Require explicit confirmation before destroying profile data.
        if Confirm.ask(f"[red]Are you sure you want to delete profile '{target['name']}'?[/red]"):
            if profiler.delete_profile(target["path"]):
                console.print(f"[green]Profile '{target['name']}' deleted successfully.[/green]")
            else:
                console.print(f"[red]Failed to delete profile '{target['name']}'.[/red]")
    except (ValueError, IndexError):
        console.print("[red]Invalid selection.[/red]")
async def manage_profiles():
    """Run the interactive browser-profile management menu until the user exits."""
    profiler = BrowserProfiler()
    menu = {
        "1": "List profiles",
        "2": "Create new profile",
        "3": "Delete profile",
        "4": "Exit",
    }
    # Per-option colors used when rendering the menu.
    colors = {"1": "green", "2": "yellow", "3": "red", "4": "cyan"}

    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
        for key, label in menu.items():
            color = colors[key]
            console.print(f"[{color}]{key}[/{color}]. {label}")

        choice = Prompt.ask("Enter choice", choices=list(menu.keys()), default="1")

        # Exit is handled first so the separator below is not printed on the way out.
        if choice == "4":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break
        if choice == "1":
            display_profiles_table(profiler.list_profiles())
        elif choice == "2":
            await create_profile_interactive(profiler)
        elif choice == "3":
            delete_profile_interactive(profiler)

        # Blank line separates successive operations visually.
        console.print("\n")
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    # Root Click group: subcommands register themselves via @cli.command(...),
    # so there is nothing to do at group level.
    pass
@cli.command("crawl")
@click.argument("url", required=True)
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@@ -291,26 +487,44 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool):
"""Crawl4AI CLI - Web content extraction tool
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
"""Crawl a website and extract content
Simple Usage:
crwl https://example.com
crwl crawl https://example.com
"""
Run with --example to see detailed usage examples."""
if example:
show_examples()
return
# Handle profile option
if profile:
profiler = BrowserProfiler()
profile_path = profiler.get_profile_path(profile)
if not url:
raise click.UsageError("URL argument is required unless using --example")
if not profile_path:
profiles = profiler.list_profiles()
if profiles:
console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
display_profiles_table(profiles)
else:
console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")
return
# Include the profile in browser config
if not browser:
browser = {}
browser["user_data_dir"] = profile_path
browser["use_managed_browser"] = True
if verbose:
console.print(f"[green]Using browser profile:[/green] {profile}")
try:
# Load base configurations
browser_cfg = BrowserConfig.load(load_config_file(browser_config))
@@ -401,5 +615,82 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
except Exception as e:
raise click.ClickException(str(e))
@cli.command("examples")
def examples_cmd():
    """Show usage examples"""
    # Thin wrapper: reuses the same examples text shown by the --example flag.
    show_examples()
@cli.command("profiles")
def profiles_cmd():
    """Manage browser profiles interactively

    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    """
    # manage_profiles is async; anyio.run drives it on a fresh event loop
    # so this synchronous Click command can block until the menu exits.
    anyio.run(manage_profiles)
@cli.command()
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
          extraction_config: str, schema: str, browser: Dict, crawler: Dict,
          output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
        crwl https://example.com

    Run with --example to see detailed usage examples.

    Other commands:
        crwl profiles - Manage browser profiles for identity-based crawling
        crwl crawl - Crawl a website with advanced options
        crwl examples - Show more usage examples

    """
    # Legacy --example flag: print the examples text and exit.
    if example:
        show_examples()
        return

    if not url:
        # Show help without error message
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        return

    # Forward to crawl command
    # This command mirrors crawl_cmd's options (minus --example) so that the
    # historical `crwl <url>` invocation keeps working; all real work happens
    # in crawl_cmd, invoked with every option passed through unchanged.
    ctx = click.get_current_context()
    ctx.invoke(
        crawl_cmd,
        url=url,
        browser_config=browser_config,
        crawler_config=crawler_config,
        filter_config=filter_config,
        extraction_config=extraction_config,
        schema=schema,
        browser=browser,
        crawler=crawler,
        output=output,
        bypass_cache=bypass_cache,
        question=question,
        verbose=verbose,
        profile=profile
    )
if __name__ == "__main__":
    # Entry point when the module is executed directly (rather than through
    # the installed `crwl` console script).
    cli()

View File

@@ -41,7 +41,8 @@ dependencies = [
"click>=8.1.7",
"pyperclip>=1.8.2",
"faust-cchardet>=2.1.19",
"aiohttp>=3.11.11"
"aiohttp>=3.11.11",
"humanize>=4.10.0"
]
classifiers = [
"Development Status :: 4 - Beta",