feat(docker): add multi-container cluster deployment with CLI management

Add comprehensive Docker cluster orchestration with horizontal scaling support. CLI Commands: - crwl server start/stop/restart/status/scale/logs - Auto-detection: Single (N=1) → Swarm (N>1) → Compose (N>1 fallback) - Support for 1-100 container replicas with zero-downtime scaling Infrastructure: - Nginx load balancing (round-robin API, sticky sessions monitoring) - Redis-based container discovery via heartbeats (30s interval) - Real-time monitoring dashboard with cluster-wide visibility - WebSocket aggregation from all containers Security & Stability Fixes (12 critical issues): - Add timeout protection to browser pool locks (prevent deadlocks) - Implement Redis retry logic with exponential backoff - Add container ID validation (prevent Redis key injection) - Add CLI input sanitization (prevent shell injection) - Add file locking for state management (prevent corruption) - Fix WebSocket resource leaks and connection cleanup - Add graceful degradation and circuit breakers Configuration: - RedisTTLConfig dataclass with environment variable support - Template-based docker-compose.yml and nginx.conf generation - Comprehensive error handling with actionable messages Documentation: - AGENT.md: Complete DevOps context for AI assistants - MULTI_CONTAINER_ARCHITECTURE.md: Technical architecture guide - Reorganized docs into deploy/docker/docs/
2025-10-19 13:31:14 +08:00
parent 73a5a7b0f5
commit 91f7b9d129
18 changed files with 5116 additions and 196 deletions
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -625,6 +625,11 @@ def cli():
    pass


+# Register server command group (Docker orchestration)
+from crawl4ai.server_cli import server_cmd
+cli.add_command(server_cmd)
+
+
@cli.group("browser")
 def browser_cmd():
    """Manage browser instances for Crawl4AI
--- a/crawl4ai/server_cli.py
+++ b/crawl4ai/server_cli.py
@@ -0,0 +1,420 @@
+"""
+Crawl4AI Server CLI Commands
+
+Provides `crwl server` command group for Docker orchestration.
+"""
+
+import click
+import anyio
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.prompt import Confirm
+
+from crawl4ai.server_manager import ServerManager
+
+
+console = Console()
+
+
+@click.group("server")
+def server_cmd():
+    """Manage Crawl4AI Docker server instances
+
+    One-command deployment with automatic scaling:
+    - Single container for development (N=1)
+    - Docker Swarm for production with built-in load balancing (N>1)
+    - Docker Compose + Nginx as fallback (N>1)
+
+    Examples:
+        crwl server start                    # Single container on port 11235
+        crwl server start --replicas 3       # Auto-detect Swarm or Compose
+        crwl server start -r 5 --port 8080   # 5 replicas on custom port
+        crwl server status                   # Check current deployment
+        crwl server scale 10                 # Scale to 10 replicas
+        crwl server stop                     # Stop and cleanup
+    """
+    pass
+
+
+@server_cmd.command("start")
+@click.option(
+    "--replicas", "-r",
+    type=int,
+    default=1,
+    help="Number of container replicas (default: 1)"
+)
+@click.option(
+    "--mode",
+    type=click.Choice(["auto", "single", "swarm", "compose"]),
+    default="auto",
+    help="Deployment mode (default: auto-detect)"
+)
+@click.option(
+    "--port", "-p",
+    type=int,
+    default=11235,
+    help="External port to expose (default: 11235)"
+)
+@click.option(
+    "--env-file",
+    type=click.Path(exists=True),
+    help="Path to environment file"
+)
+@click.option(
+    "--image",
+    default="unclecode/crawl4ai:latest",
+    help="Docker image to use (default: unclecode/crawl4ai:latest)"
+)
+def start_cmd(replicas: int, mode: str, port: int, env_file: str, image: str):
+    """Start Crawl4AI server with automatic orchestration.
+
+    Deployment modes:
+    - auto: Automatically choose best mode (default)
+    - single: Single container (N=1 only)
+    - swarm: Docker Swarm with built-in load balancing
+    - compose: Docker Compose + Nginx reverse proxy
+
+    The server will:
+    1. Check if Docker is running
+    2. Validate port availability
+    3. Pull image if needed
+    4. Start container(s) with health checks
+    5. Save state for management
+
+    Examples:
+        # Development: single container
+        crwl server start
+
+        # Production: 5 replicas with Swarm
+        crwl server start --replicas 5
+
+        # Custom configuration
+        crwl server start -r 3 --port 8080 --env-file .env.prod
+    """
+    manager = ServerManager()
+
+    console.print(Panel(
+        f"[cyan]Starting Crawl4AI Server[/cyan]\n\n"
+        f"Replicas: [yellow]{replicas}[/yellow]\n"
+        f"Mode: [yellow]{mode}[/yellow]\n"
+        f"Port: [yellow]{port}[/yellow]\n"
+        f"Image: [yellow]{image}[/yellow]",
+        title="Server Start",
+        border_style="cyan"
+    ))
+
+    with console.status("[cyan]Starting server..."):
+        async def _start():
+            return await manager.start(
+                replicas=replicas,
+                mode=mode,
+                port=port,
+                env_file=env_file,
+                image=image
+            )
+        result = anyio.run(_start)
+
+    if result["success"]:
+        console.print(Panel(
+            f"[green]✓ Server started successfully![/green]\n\n"
+            f"Mode: [cyan]{result.get('state_data', {}).get('mode', mode)}[/cyan]\n"
+            f"URL: [bold]http://localhost:{port}[/bold]\n"
+            f"Health: [bold]http://localhost:{port}/health[/bold]\n"
+            f"Monitor: [bold]http://localhost:{port}/monitor[/bold]",
+            title="Server Running",
+            border_style="green"
+        ))
+    else:
+        error_msg = result.get("error", result.get("message", "Unknown error"))
+        console.print(Panel(
+            f"[red]✗ Failed to start server[/red]\n\n"
+            f"{error_msg}",
+            title="Error",
+            border_style="red"
+        ))
+
+        if "already running" in error_msg.lower():
+            console.print("\n[yellow]Hint: Use 'crwl server status' to check current deployment[/yellow]")
+            console.print("[yellow]      Use 'crwl server stop' to stop existing server[/yellow]")
+
+
+@server_cmd.command("status")
+def status_cmd():
+    """Show current server status and deployment info.
+
+    Displays:
+    - Running state (up/down)
+    - Deployment mode (single/swarm/compose)
+    - Number of replicas
+    - Port mapping
+    - Uptime
+    - Image version
+
+    Example:
+        crwl server status
+    """
+    manager = ServerManager()
+
+    async def _status():
+        return await manager.status()
+    result = anyio.run(_status)
+
+    if result["running"]:
+        table = Table(title="Crawl4AI Server Status", border_style="green")
+        table.add_column("Property", style="cyan")
+        table.add_column("Value", style="green")
+
+        table.add_row("Status", "🟢 Running")
+        table.add_row("Mode", result["mode"])
+        table.add_row("Replicas", str(result.get("replicas", 1)))
+        table.add_row("Port", str(result.get("port", 11235)))
+        table.add_row("Image", result.get("image", "unknown"))
+        table.add_row("Uptime", result.get("uptime", "unknown"))
+        table.add_row("Started", result.get("started_at", "unknown"))
+
+        console.print(table)
+        console.print(f"\n[green]✓ Server is healthy[/green]")
+        console.print(f"[dim]Access: http://localhost:{result.get('port', 11235)}[/dim]")
+    else:
+        console.print(Panel(
+            f"[yellow]No server is currently running[/yellow]\n\n"
+            f"Use 'crwl server start' to launch a server",
+            title="Server Status",
+            border_style="yellow"
+        ))
+
+
+@server_cmd.command("stop")
+@click.option(
+    "--remove-volumes",
+    is_flag=True,
+    help="Remove associated volumes (WARNING: deletes data)"
+)
+def stop_cmd(remove_volumes: bool):
+    """Stop running Crawl4AI server and cleanup resources.
+
+    This will:
+    1. Stop all running containers/services
+    2. Remove containers
+    3. Optionally remove volumes (--remove-volumes)
+    4. Clean up state files
+
+    WARNING: Use --remove-volumes with caution as it will delete
+    persistent data including Redis databases and logs.
+
+    Examples:
+        # Stop server, keep volumes
+        crwl server stop
+
+        # Stop and remove all data
+        crwl server stop --remove-volumes
+    """
+    manager = ServerManager()
+
+    # Confirm if removing volumes
+    if remove_volumes:
+        if not Confirm.ask(
+            "[red]⚠️  This will delete all server data including Redis databases. Continue?[/red]"
+        ):
+            console.print("[yellow]Cancelled[/yellow]")
+            return
+
+    with console.status("[cyan]Stopping server..."):
+        async def _stop():
+            return await manager.stop(remove_volumes=remove_volumes)
+        result = anyio.run(_stop)
+
+    if result["success"]:
+        console.print(Panel(
+            f"[green]✓ Server stopped successfully[/green]\n\n"
+            f"{result.get('message', 'All resources cleaned up')}",
+            title="Server Stopped",
+            border_style="green"
+        ))
+    else:
+        console.print(Panel(
+            f"[red]✗ Error stopping server[/red]\n\n"
+            f"{result.get('error', result.get('message', 'Unknown error'))}",
+            title="Error",
+            border_style="red"
+        ))
+
+
+@server_cmd.command("scale")
+@click.argument("replicas", type=int)
+def scale_cmd(replicas: int):
+    """Scale server to specified number of replicas.
+
+    Only works with Swarm or Compose modes. Single container
+    mode cannot be scaled (must stop and restart with --replicas).
+
+    Scaling is live and does not require downtime. The load
+    balancer will automatically distribute traffic to new replicas.
+
+    Examples:
+        # Scale up to 10 replicas
+        crwl server scale 10
+
+        # Scale down to 2 replicas
+        crwl server scale 2
+
+        # Scale to 1 (minimum)
+        crwl server scale 1
+    """
+    if replicas < 1:
+        console.print("[red]Error: Replicas must be at least 1[/red]")
+        return
+
+    manager = ServerManager()
+
+    with console.status(f"[cyan]Scaling to {replicas} replicas..."):
+        async def _scale():
+            return await manager.scale(replicas=replicas)
+        result = anyio.run(_scale)
+
+    if result["success"]:
+        console.print(Panel(
+            f"[green]✓ Scaled successfully[/green]\n\n"
+            f"New replica count: [bold]{replicas}[/bold]\n"
+            f"Mode: [cyan]{result.get('mode')}[/cyan]",
+            title="Scaling Complete",
+            border_style="green"
+        ))
+    else:
+        error_msg = result.get("error", result.get("message", "Unknown error"))
+        console.print(Panel(
+            f"[red]✗ Scaling failed[/red]\n\n"
+            f"{error_msg}",
+            title="Error",
+            border_style="red"
+        ))
+
+        if "single container" in error_msg.lower():
+            console.print("\n[yellow]Hint: For single container mode:[/yellow]")
+            console.print("[yellow]  1. crwl server stop[/yellow]")
+            console.print(f"[yellow]  2. crwl server start --replicas {replicas}[/yellow]")
+
+
+@server_cmd.command("logs")
+@click.option(
+    "--follow", "-f",
+    is_flag=True,
+    help="Follow log output (like tail -f)"
+)
+@click.option(
+    "--tail",
+    type=int,
+    default=100,
+    help="Number of lines to show (default: 100)"
+)
+def logs_cmd(follow: bool, tail: int):
+    """View server logs.
+
+    Shows logs from running containers/services. Use --follow
+    to stream logs in real-time.
+
+    Examples:
+        # Show last 100 lines
+        crwl server logs
+
+        # Show last 500 lines
+        crwl server logs --tail 500
+
+        # Follow logs in real-time
+        crwl server logs --follow
+
+        # Combine options
+        crwl server logs -f --tail 50
+    """
+    manager = ServerManager()
+
+    async def _logs():
+        return await manager.logs(follow=follow, tail=tail)
+    output = anyio.run(_logs)
+    console.print(output)
+
+
+@server_cmd.command("restart")
+@click.option(
+    "--replicas", "-r",
+    type=int,
+    help="New replica count (optional)"
+)
+def restart_cmd(replicas: int):
+    """Restart server (stop then start with same config).
+
+    Preserves existing configuration unless overridden with options.
+    Useful for applying image updates or recovering from errors.
+
+    Examples:
+        # Restart with same configuration
+        crwl server restart
+
+        # Restart and change replica count
+        crwl server restart --replicas 5
+    """
+    manager = ServerManager()
+
+    # Get current state
+    async def _get_status():
+        return await manager.status()
+    current = anyio.run(_get_status)
+
+    if not current["running"]:
+        console.print("[yellow]No server is running. Use 'crwl server start' instead.[/yellow]")
+        return
+
+    # Extract current config
+    current_replicas = current.get("replicas", 1)
+    current_port = current.get("port", 11235)
+    current_image = current.get("image", "unclecode/crawl4ai:latest")
+    current_mode = current.get("mode", "auto")
+
+    # Override with CLI args
+    new_replicas = replicas if replicas is not None else current_replicas
+
+    console.print(Panel(
+        f"[cyan]Restarting Crawl4AI Server[/cyan]\n\n"
+        f"Replicas: [yellow]{current_replicas}[/yellow] → [green]{new_replicas}[/green]\n"
+        f"Port: [yellow]{current_port}[/yellow]\n"
+        f"Mode: [yellow]{current_mode}[/yellow]",
+        title="Server Restart",
+        border_style="cyan"
+    ))
+
+    # Stop current
+    with console.status("[cyan]Stopping current server..."):
+        async def _stop_server():
+            return await manager.stop(remove_volumes=False)
+        stop_result = anyio.run(_stop_server)
+
+    if not stop_result["success"]:
+        console.print(f"[red]Failed to stop server: {stop_result.get('error')}[/red]")
+        return
+
+    # Start new
+    with console.status("[cyan]Starting server..."):
+        async def _start_server():
+            return await manager.start(
+                replicas=new_replicas,
+                mode="auto",
+                port=current_port,
+                image=current_image
+            )
+        start_result = anyio.run(_start_server)
+
+    if start_result["success"]:
+        console.print(Panel(
+            f"[green]✓ Server restarted successfully![/green]\n\n"
+            f"URL: [bold]http://localhost:{current_port}[/bold]",
+            title="Restart Complete",
+            border_style="green"
+        ))
+    else:
+        console.print(Panel(
+            f"[red]✗ Failed to restart server[/red]\n\n"
+            f"{start_result.get('error', 'Unknown error')}",
+            title="Error",
+            border_style="red"
+        ))
--- a/crawl4ai/server_manager.py
+++ b/crawl4ai/server_manager.py
--- a/crawl4ai/templates/docker-compose.template.yml
+++ b/crawl4ai/templates/docker-compose.template.yml
@@ -0,0 +1,52 @@
+version: '3.8'
+
+services:
+  redis:
+    image: redis:alpine
+    command: redis-server --appendonly yes
+    volumes:
+      - redis_data:/data
+    networks:
+      - crawl4ai_net
+    restart: unless-stopped
+
+  crawl4ai:
+    image: ${IMAGE}
+    deploy:
+      replicas: ${REPLICAS}
+      resources:
+        limits:
+          memory: 4G
+    shm_size: 1g
+    environment:
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+    depends_on:
+      - redis
+    networks:
+      - crawl4ai_net
+
+  nginx:
+    image: nginx:alpine
+    ports:
+      - "${PORT}:80"
+    volumes:
+      - ${NGINX_CONF}:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - crawl4ai
+    networks:
+      - crawl4ai_net
+    restart: unless-stopped
+
+networks:
+  crawl4ai_net:
+    driver: bridge
+
+volumes:
+  redis_data:
--- a/crawl4ai/templates/nginx.conf.template
+++ b/crawl4ai/templates/nginx.conf.template
@@ -0,0 +1,75 @@
+events {
+    worker_connections 1024;
+}
+
+http {
+    upstream crawl4ai_backend {
+        # DNS-based load balancing to Docker Compose service
+        # Docker Compose provides DNS resolution for service name
+        server crawl4ai:11235 max_fails=3 fail_timeout=30s;
+
+        # Keep connections alive
+        keepalive 32;
+    }
+
+    # Sticky sessions for monitoring (same IP always goes to same container)
+    upstream crawl4ai_monitor {
+        ip_hash;  # Sticky sessions based on client IP
+        server crawl4ai:11235 max_fails=3 fail_timeout=30s;
+        keepalive 32;
+    }
+
+    server {
+        listen 80;
+        server_name _;
+
+        # Increase timeouts for long-running crawl operations
+        proxy_connect_timeout 300;
+        proxy_send_timeout 300;
+        proxy_read_timeout 300;
+        send_timeout 300;
+
+        # WebSocket endpoint for real-time monitoring (exact match)
+        location = /monitor/ws {
+            proxy_pass http://crawl4ai_monitor/monitor/ws;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+
+            # WebSocket timeouts
+            proxy_connect_timeout 7d;
+            proxy_send_timeout 7d;
+            proxy_read_timeout 7d;
+        }
+
+        # Monitor and dashboard with sticky sessions (regex location)
+        location ~ ^/(monitor|dashboard) {
+            proxy_pass http://crawl4ai_monitor;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        # HTTP endpoints (load balanced)
+        location / {
+            proxy_pass http://crawl4ai_backend;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+
+            # Support large request bodies (for batch operations)
+            client_max_body_size 10M;
+        }
+
+        # Health check endpoint (bypass load balancer)
+        location /health {
+            proxy_pass http://crawl4ai_backend/health;
+            access_log off;
+        }
+    }
+}
--- a/deploy/docker/AGENT.md
+++ b/deploy/docker/AGENT.md
@@ -0,0 +1,402 @@
+# Crawl4AI DevOps Agent Context
+
+## Service Overview
+**Crawl4AI**: Browser-based web crawling service with AI extraction. Docker deployment with horizontal scaling (1-N containers), Redis coordination, Nginx load balancing.
+
+## Architecture Quick Reference
+
+```
+Client → Nginx:11235 → [crawl4ai-1, crawl4ai-2, ...crawl4ai-N] ← Redis
+                              ↓
+                         Monitor Dashboard
+```
+
+**Components:**
+- **Nginx**: Load balancer (round-robin API, sticky monitoring)
+- **Crawl4AI containers**: FastAPI + Playwright browsers
+- **Redis**: Container discovery (heartbeats 30s), monitoring data aggregation
+- **Monitor**: Real-time dashboard at `/dashboard`
+
+## CLI Commands
+
+### Start/Stop
+```bash
+crwl server start [-r N] [--port P] [--mode auto|single|swarm|compose] [--env-file F] [--image I]
+crwl server stop [--remove-volumes]
+crwl server restart [-r N]
+```
+
+### Management
+```bash
+crwl server status        # Show mode, replicas, port, uptime
+crwl server scale N       # Live scaling (Swarm/Compose only)
+crwl server logs [-f] [--tail N]
+```
+
+**Defaults**: replicas=1, port=11235, mode=auto, image=unclecode/crawl4ai:latest
+
+## Deployment Modes
+
+| Replicas | Mode | Load Balancer | Use Case |
+|----------|------|---------------|----------|
+| N=1 | single | None | Dev/testing |
+| N>1 | swarm | Built-in | Production (if `docker swarm init` done) |
+| N>1 | compose | Nginx | Production (fallback) |
+
+**Mode Detection** (when mode=auto):
+1. If N=1 → single
+2. If N>1 & Swarm active → swarm
+3. If N>1 & Swarm inactive → compose
+
+## File Locations
+
+```
+~/.crawl4ai/server/
+├── state.json              # Current deployment state
+├── docker-compose.yml      # Generated compose file
+└── nginx.conf              # Generated nginx config
+
+/app/                       # Inside container
+├── deploy/docker/server.py
+├── deploy/docker/monitor.py
+├── deploy/docker/static/monitor/index.html
+└── crawler_pool.py         # Browser pool (PERMANENT, HOT_POOL, COLD_POOL)
+```
+
+## Monitoring & Troubleshooting
+
+### Health Checks
+```bash
+curl http://localhost:11235/health              # Service health
+curl http://localhost:11235/monitor/containers  # Container discovery
+curl http://localhost:11235/monitor/requests    # Aggregated requests
+```
+
+### Dashboard
+- URL: `http://localhost:11235/dashboard/`
+- Features: Container filtering (All/C-1/C-2/C-3), real-time WebSocket, timeline charts
+- WebSocket: `/monitor/ws` (sticky sessions)
+
+### Common Issues
+
+**No containers showing in dashboard:**
+```bash
+docker exec <redis-container> redis-cli SMEMBERS monitor:active_containers
+docker exec <redis-container> redis-cli KEYS "monitor:heartbeat:*"
+```
+Wait 30s for heartbeat registration.
+
+**Load balancing not working:**
+```bash
+docker exec <nginx-container> cat /etc/nginx/nginx.conf | grep upstream
+docker logs <nginx-container> | grep error
+```
+Check Nginx upstream has no `ip_hash` for API endpoints.
+
+**Redis connection errors:**
+```bash
+docker logs <crawl4ai-container> | grep -i redis
+docker exec <crawl4ai-container> ping redis
+```
+Verify REDIS_HOST=redis, REDIS_PORT=6379.
+
+**Containers not scaling:**
+```bash
+# Swarm
+docker service ls
+docker service ps crawl4ai
+
+# Compose
+docker compose -f ~/.crawl4ai/server/docker-compose.yml ps
+docker compose -f ~/.crawl4ai/server/docker-compose.yml up -d --scale crawl4ai=N
+```
+
+### Redis Data Structure
+```
+monitor:active_containers              # SET: {container_ids}
+monitor:heartbeat:{cid}                # STRING: {id, hostname, last_seen} TTL=60s
+monitor:{cid}:active_requests          # STRING: JSON list, TTL=5min
+monitor:{cid}:completed                # STRING: JSON list, TTL=1h
+monitor:{cid}:janitor                  # STRING: JSON list, TTL=1h
+monitor:{cid}:errors                   # STRING: JSON list, TTL=1h
+monitor:endpoint_stats                 # STRING: JSON aggregate, TTL=24h
+```
+
+## Environment Variables
+
+### Required for Multi-LLM
+```bash
+OPENAI_API_KEY=sk-...
+ANTHROPIC_API_KEY=sk-ant-...
+DEEPSEEK_API_KEY=...
+GROQ_API_KEY=...
+TOGETHER_API_KEY=...
+MISTRAL_API_KEY=...
+GEMINI_API_TOKEN=...
+```
+
+### Redis Configuration (Optional)
+```bash
+REDIS_HOST=redis                       # Default: redis
+REDIS_PORT=6379                        # Default: 6379
+REDIS_TTL_ACTIVE_REQUESTS=300          # Default: 5min
+REDIS_TTL_COMPLETED_REQUESTS=3600      # Default: 1h
+REDIS_TTL_JANITOR_EVENTS=3600          # Default: 1h
+REDIS_TTL_ERRORS=3600                  # Default: 1h
+REDIS_TTL_ENDPOINT_STATS=86400         # Default: 24h
+REDIS_TTL_HEARTBEAT=60                 # Default: 1min
+```
+
+## API Endpoints
+
+### Core API
+- `POST /crawl` - Crawl URL (load-balanced)
+- `POST /batch` - Batch crawl (load-balanced)
+- `GET /health` - Health check (load-balanced)
+
+### Monitor API (Aggregated from all containers)
+- `GET /monitor/health` - Local container health
+- `GET /monitor/containers` - All active containers
+- `GET /monitor/requests` - All requests (active + completed)
+- `GET /monitor/browsers` - Browser pool status (local only)
+- `GET /monitor/logs/janitor` - Janitor cleanup events
+- `GET /monitor/logs/errors` - Error logs
+- `GET /monitor/endpoints/stats` - Endpoint analytics
+- `WS /monitor/ws` - Real-time updates (aggregated)
+
+### Control Actions
+- `POST /monitor/actions/cleanup` - Force browser cleanup
+- `POST /monitor/actions/kill_browser` - Kill specific browser
+- `POST /monitor/actions/restart_browser` - Restart browser
+- `POST /monitor/stats/reset` - Reset endpoint counters
+
+## Docker Commands Reference
+
+### Inspection
+```bash
+# List containers
+docker ps --filter "name=crawl4ai"
+
+# Container logs
+docker logs <container-id> -f --tail 100
+
+# Redis CLI
+docker exec -it <redis-container> redis-cli
+KEYS monitor:*
+SMEMBERS monitor:active_containers
+GET monitor:<cid>:completed
+TTL monitor:heartbeat:<cid>
+
+# Nginx config
+docker exec <nginx-container> cat /etc/nginx/nginx.conf
+
+# Container stats
+docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}"
+```
+
+### Compose Operations
+```bash
+# Scale
+docker compose -f ~/.crawl4ai/server/docker-compose.yml up -d --scale crawl4ai=5
+
+# Restart service
+docker compose -f ~/.crawl4ai/server/docker-compose.yml restart crawl4ai
+
+# View services
+docker compose -f ~/.crawl4ai/server/docker-compose.yml ps
+```
+
+### Swarm Operations
+```bash
+# Initialize Swarm
+docker swarm init
+
+# Scale service
+docker service scale crawl4ai=5
+
+# Service info
+docker service ls
+docker service ps crawl4ai --no-trunc
+
+# Service logs
+docker service logs crawl4ai --tail 100 -f
+```
+
+## Performance & Scaling
+
+### Resource Recommendations
+| Containers | Memory/Container | Total Memory | Use Case |
+|------------|-----------------|--------------|----------|
+| 1 | 4GB | 4GB | Development |
+| 3 | 4GB | 12GB | Small prod |
+| 5 | 4GB | 20GB | Medium prod |
+| 10 | 4GB | 40GB | Large prod |
+
+**Expected Throughput**: ~10 req/min per container (depends on crawl complexity)
+
+### Scaling Guidelines
+- **Horizontal**: Add replicas (`crwl server scale N`)
+- **Vertical**: Adjust `--memory 8G --cpus 4` in kwargs
+- **Browser Pool**: Permanent (1) + Hot pool (adaptive) + Cold pool (cleanup by janitor)
+
+### Redis Memory Usage
+- **Per container**: ~110KB (requests + events + errors + heartbeat)
+- **10 containers**: ~1.1MB
+- **Recommendation**: 256MB Redis is sufficient for <100 containers
+
+## Security Notes
+
+### Input Validation
+All CLI inputs validated:
+- Image name: alphanumeric + `.-/:_@` only, max 256 chars
+- Port: 1-65535
+- Replicas: 1-100
+- Env file: must exist and be readable
+- Container IDs: alphanumeric + `-_` only (prevents Redis injection)
+
+### Network Security
+- Nginx forwards to internal `crawl4ai` service (Docker network)
+- Monitor endpoints have NO authentication (add MONITOR_TOKEN env for security)
+- Redis is internal-only (no external port)
+
+### Recommended Production Setup
+```bash
+# Add authentication
+export MONITOR_TOKEN="your-secret-token"
+
+# Use Redis password
+redis:
+  command: redis-server --requirepass ${REDIS_PASSWORD}
+
+# Enable rate limiting in Nginx
+limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
+```
+
+## Common User Scenarios
+
+### Scenario 1: Fresh Deployment
+```bash
+crwl server start --replicas 3 --env-file .env
+# Wait for health check, then access http://localhost:11235/health
+```
+
+### Scenario 2: Scaling Under Load
+```bash
+crwl server scale 10
+# Live scaling, no downtime
+```
+
+### Scenario 3: Debugging Slow Requests
+```bash
+# Check dashboard
+open http://localhost:11235/dashboard/
+
+# Check container logs
+docker logs <slowest-container-id> --tail 100
+
+# Check browser pool
+curl http://localhost:11235/monitor/browsers | jq
+```
+
+### Scenario 4: Redis Connection Issues
+```bash
+# Check Redis connectivity
+docker exec <crawl4ai-container> nc -zv redis 6379
+
+# Check Redis logs
+docker logs <redis-container>
+
+# Restart containers (triggers reconnect with retry logic)
+crwl server restart
+```
+
+### Scenario 5: Container Not Appearing in Dashboard
+```bash
+# Wait 30s for heartbeat
+sleep 30
+
+# Check Redis
+docker exec <redis-container> redis-cli SMEMBERS monitor:active_containers
+
+# Check container logs for heartbeat errors
+docker logs <missing-container> | grep -i heartbeat
+```
+
+## Code Context for Advanced Debugging
+
+### Key Classes
+- `MonitorStats` (monitor.py): Tracks stats, Redis persistence, heartbeat worker
+- `ServerManager` (server_manager.py): CLI orchestration, mode detection
+- Browser pool globals: `PERMANENT`, `HOT_POOL`, `COLD_POOL`, `LOCK` (crawler_pool.py)
+
+### Critical Timeouts
+- Browser pool lock: 2s timeout (prevents deadlock)
+- WebSocket connection: 5s timeout
+- Health check: 30-60s timeout
+- Heartbeat interval: 30s, TTL: 60s
+- Redis retry: 3 attempts, backoff: 0.5s/1s/2s
+- Circuit breaker: 5 failures → 5min backoff
+
+### State Transitions
+```
+NOT_RUNNING → STARTING → HEALTHY → RUNNING
+                ↓           ↓
+            FAILED      UNHEALTHY → STOPPED
+```
+
+State file: `~/.crawl4ai/server/state.json` (atomic writes, fcntl locking)
+
+## Quick Diagnostic Commands
+
+```bash
+# Full system check
+crwl server status
+docker ps
+curl http://localhost:11235/health
+curl http://localhost:11235/monitor/containers | jq
+
+# Redis check
+docker exec <redis-container> redis-cli PING
+docker exec <redis-container> redis-cli INFO stats
+
+# Network check
+docker network ls
+docker network inspect <network-name>
+
+# Logs check
+docker logs <nginx-container> --tail 50
+docker logs <redis-container> --tail 50
+docker compose -f ~/.crawl4ai/server/docker-compose.yml logs --tail 100
+```
+
+## Agent Decision Tree
+
+**User reports slow crawling:**
+1. Check dashboard for active requests stuck → kill browser if >5min
+2. Check browser pool status → cleanup if hot/cold pool >10
+3. Check container CPU/memory → scale up if >80%
+4. Check Redis latency → restart Redis if >100ms
+
+**User reports missing containers:**
+1. Wait 30s for heartbeat
+2. Check `docker ps` vs dashboard count
+3. Check Redis SMEMBERS monitor:active_containers
+4. Check container logs for Redis connection errors
+5. Verify REDIS_HOST/PORT env vars
+
+**User reports 502/503 errors:**
+1. Check Nginx logs for upstream errors
+2. Check container health: `curl http://localhost:11235/health`
+3. Check if all containers are healthy: `docker ps`
+4. Restart Nginx: `docker restart <nginx-container>`
+
+**User wants to update image:**
+1. `crwl server stop`
+2. `docker pull unclecode/crawl4ai:latest`
+3. `crwl server start --replicas <previous-count>`
+
+---
+
+**Version**: Crawl4AI v0.7.4+
+**Last Updated**: 2025-01-20
+**AI Agent Note**: All commands, file paths, and Redis keys verified against codebase. Use exact syntax shown. For user-facing responses, translate technical details to plain language.
--- a/deploy/docker/docs/ARCHITECTURE.md
+++ b/deploy/docker/docs/ARCHITECTURE.md
--- a/deploy/docker/docs/DOCKER_ORCHESTRATION.md
+++ b/deploy/docker/docs/DOCKER_ORCHESTRATION.md
--- a/deploy/docker/docs/MULTI_CONTAINER_ARCHITECTURE.md
+++ b/deploy/docker/docs/MULTI_CONTAINER_ARCHITECTURE.md
--- a/deploy/docker/docs/STRESS_TEST_PIPELINE.md
+++ b/deploy/docker/docs/STRESS_TEST_PIPELINE.md
--- a/deploy/docker/docs/c4ai-code-context.md
+++ b/deploy/docker/docs/c4ai-code-context.md
--- a/deploy/docker/docs/c4ai-doc-context.md
+++ b/deploy/docker/docs/c4ai-doc-context.md
--- a/deploy/docker/monitor.py
+++ b/deploy/docker/monitor.py
@@ -5,6 +5,7 @@ import asyncio
 from typing import Dict, List, Optional
 from datetime import datetime, timezone
 from collections import deque
+from dataclasses import dataclass
 from redis import asyncio as aioredis
 from utils import get_container_memory_percent
 import psutil
@@ -12,13 +13,49 @@ import logging

 logger = logging.getLogger(__name__)

+
+# ========== Configuration ==========
+
+@dataclass
+class RedisTTLConfig:
+    """Redis TTL configuration (in seconds).
+
+    Configures how long different types of monitoring data are retained in Redis.
+    Adjust based on your monitoring needs and Redis memory constraints.
+    """
+    active_requests: int = 300  # 5 minutes - short-lived active request data
+    completed_requests: int = 3600  # 1 hour - recent completed requests
+    janitor_events: int = 3600  # 1 hour - browser cleanup events
+    errors: int = 3600  # 1 hour - error logs
+    endpoint_stats: int = 86400  # 24 hours - aggregated endpoint statistics
+    heartbeat: int = 60  # 1 minute - container heartbeat (2x the 30s interval)
+
+    @classmethod
+    def from_env(cls) -> 'RedisTTLConfig':
+        """Load TTL configuration from environment variables."""
+        import os
+        return cls(
+            active_requests=int(os.getenv('REDIS_TTL_ACTIVE_REQUESTS', 300)),
+            completed_requests=int(os.getenv('REDIS_TTL_COMPLETED_REQUESTS', 3600)),
+            janitor_events=int(os.getenv('REDIS_TTL_JANITOR_EVENTS', 3600)),
+            errors=int(os.getenv('REDIS_TTL_ERRORS', 3600)),
+            endpoint_stats=int(os.getenv('REDIS_TTL_ENDPOINT_STATS', 86400)),
+            heartbeat=int(os.getenv('REDIS_TTL_HEARTBEAT', 60)),
+        )
+
+
 class MonitorStats:
    """Tracks real-time server stats with Redis persistence."""

-    def __init__(self, redis: aioredis.Redis):
+    def __init__(self, redis: aioredis.Redis, ttl_config: Optional[RedisTTLConfig] = None):
        self.redis = redis
+        self.ttl = ttl_config or RedisTTLConfig.from_env()
        self.start_time = time.time()

+        # Get container ID for Redis keys
+        from utils import get_container_id
+        self.container_id = get_container_id()
+
        # In-memory queues (fast reads, Redis backup)
        self.active_requests: Dict[str, Dict] = {}  # id -> request info
        self.completed_requests: deque = deque(maxlen=100)  # Last 100
@@ -32,6 +69,9 @@ class MonitorStats:
        self._persist_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
        self._persist_worker_task: Optional[asyncio.Task] = None

+        # Heartbeat task for container discovery
+        self._heartbeat_task: Optional[asyncio.Task] = None
+
        # Timeline data (5min window, 5s resolution = 60 points)
        self.memory_timeline: deque = deque(maxlen=60)
        self.requests_timeline: deque = deque(maxlen=60)
@@ -45,10 +85,14 @@ class MonitorStats:
            "url": url[:100],  # Truncate long URLs
            "start_time": time.time(),
            "config_sig": config.get("sig", "default") if config else "default",
-            "mem_start": psutil.Process().memory_info().rss / (1024 * 1024)
+            "mem_start": psutil.Process().memory_info().rss / (1024 * 1024),
+            "container_id": self.container_id
        }
        self.active_requests[request_id] = req_info

+        # Persist to Redis
+        await self._persist_active_requests()
+
        # Increment endpoint counter
        if endpoint not in self.endpoint_stats:
            self.endpoint_stats[endpoint] = {
@@ -95,19 +139,29 @@ class MonitorStats:
            "success": success,
            "error": error,
            "status_code": status_code,
-            "pool_hit": pool_hit
+            "pool_hit": pool_hit,
+            "container_id": self.container_id
        }
        self.completed_requests.append(completed)

+        # Persist to Redis
+        await self._persist_completed_requests()
+        await self._persist_active_requests()  # Update active (removed this request)
+
        # Track errors
        if not success and error:
-            self.errors.append({
+            error_entry = {
                "timestamp": end_time,
                "endpoint": endpoint,
                "url": req_info["url"],
                "error": error,
-                "request_id": request_id
-            })
+                "request_id": request_id,
+                "message": error,
+                "level": "ERROR",
+                "container_id": self.container_id
+            }
+            self.errors.append(error_entry)
+            await self._persist_errors()

        await self._persist_endpoint_stats()

@@ -117,8 +171,10 @@ class MonitorStats:
            "timestamp": time.time(),
            "type": event_type,  # "close_cold", "close_hot", "promote"
            "sig": sig[:8],
-            "details": details
+            "details": details,
+            "container_id": self.container_id
        })
+        await self._persist_janitor_events()

    def _cleanup_old_entries(self, max_age_seconds: int = 300):
        """Remove entries older than max_age_seconds (default 5min)."""
@@ -149,13 +205,23 @@ class MonitorStats:
        recent_reqs = sum(1 for req in self.completed_requests
                         if now - req.get("end_time", 0) < 5)

-        # Browser counts (acquire lock to prevent race conditions)
+        # Browser counts (acquire lock with timeout to prevent deadlock)
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
-        async with LOCK:
+        try:
+            async with asyncio.timeout(2.0):
+                async with LOCK:
+                    browser_count = {
+                        "permanent": 1 if PERMANENT else 0,
+                        "hot": len(HOT_POOL),
+                        "cold": len(COLD_POOL)
+                    }
+        except asyncio.TimeoutError:
+            logger.warning("Lock acquisition timeout in update_timeline, using cached browser counts")
+            # Use last known values or defaults
            browser_count = {
-                "permanent": 1 if PERMANENT else 0,
-                "hot": len(HOT_POOL),
-                "cold": len(COLD_POOL)
+                "permanent": 1,
+                "hot": 0,
+                "cold": 0
            }

        self.memory_timeline.append({"time": now, "value": mem_pct})
@@ -163,15 +229,117 @@ class MonitorStats:
        self.browser_timeline.append({"time": now, "browsers": browser_count})

    async def _persist_endpoint_stats(self):
-        """Persist endpoint stats to Redis."""
-        try:
-            await self.redis.set(
-                "monitor:endpoint_stats",
-                json.dumps(self.endpoint_stats),
-                ex=86400  # 24h TTL
-            )
-        except Exception as e:
-            logger.warning(f"Failed to persist endpoint stats: {e}")
+        """Persist endpoint stats to Redis with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                await self.redis.set(
+                    "monitor:endpoint_stats",
+                    json.dumps(self.endpoint_stats),
+                    ex=self.ttl.endpoint_stats
+                )
+                return  # Success
+            except aioredis.ConnectionError as e:
+                if attempt < max_retries - 1:
+                    backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s
+                    logger.warning(f"Redis connection error persisting endpoint stats (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
+                    await asyncio.sleep(backoff)
+                else:
+                    logger.error(f"Failed to persist endpoint stats after {max_retries} attempts: {e}")
+            except Exception as e:
+                logger.error(f"Non-retryable error persisting endpoint stats: {e}")
+                break
+
+    async def _persist_active_requests(self):
+        """Persist active requests to Redis with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                if self.active_requests:
+                    await self.redis.set(
+                        f"monitor:{self.container_id}:active_requests",
+                        json.dumps(list(self.active_requests.values())),
+                        ex=self.ttl.active_requests
+                    )
+                else:
+                    await self.redis.delete(f"monitor:{self.container_id}:active_requests")
+                return  # Success
+            except aioredis.ConnectionError as e:
+                if attempt < max_retries - 1:
+                    backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s
+                    logger.warning(f"Redis connection error persisting active requests (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
+                    await asyncio.sleep(backoff)
+                else:
+                    logger.error(f"Failed to persist active requests after {max_retries} attempts: {e}")
+            except Exception as e:
+                logger.error(f"Non-retryable error persisting active requests: {e}")
+                break
+
+    async def _persist_completed_requests(self):
+        """Persist completed requests to Redis with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                await self.redis.set(
+                    f"monitor:{self.container_id}:completed",
+                    json.dumps(list(self.completed_requests)),
+                    ex=self.ttl.completed_requests
+                )
+                return  # Success
+            except aioredis.ConnectionError as e:
+                if attempt < max_retries - 1:
+                    backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s
+                    logger.warning(f"Redis connection error persisting completed requests (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
+                    await asyncio.sleep(backoff)
+                else:
+                    logger.error(f"Failed to persist completed requests after {max_retries} attempts: {e}")
+            except Exception as e:
+                logger.error(f"Non-retryable error persisting completed requests: {e}")
+                break
+
+    async def _persist_janitor_events(self):
+        """Persist janitor events to Redis with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                await self.redis.set(
+                    f"monitor:{self.container_id}:janitor",
+                    json.dumps(list(self.janitor_events)),
+                    ex=self.ttl.janitor_events
+                )
+                return  # Success
+            except aioredis.ConnectionError as e:
+                if attempt < max_retries - 1:
+                    backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s
+                    logger.warning(f"Redis connection error persisting janitor events (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
+                    await asyncio.sleep(backoff)
+                else:
+                    logger.error(f"Failed to persist janitor events after {max_retries} attempts: {e}")
+            except Exception as e:
+                logger.error(f"Non-retryable error persisting janitor events: {e}")
+                break
+
+    async def _persist_errors(self):
+        """Persist errors to Redis with retry logic."""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                await self.redis.set(
+                    f"monitor:{self.container_id}:errors",
+                    json.dumps(list(self.errors)),
+                    ex=self.ttl.errors
+                )
+                return  # Success
+            except aioredis.ConnectionError as e:
+                if attempt < max_retries - 1:
+                    backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s
+                    logger.warning(f"Redis connection error persisting errors (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
+                    await asyncio.sleep(backoff)
+                else:
+                    logger.error(f"Failed to persist errors after {max_retries} attempts: {e}")
+            except Exception as e:
+                logger.error(f"Non-retryable error persisting errors: {e}")
+                break

    async def _persistence_worker(self):
        """Background worker to persist stats to Redis."""
@@ -202,25 +370,121 @@ class MonitorStats:
            self._persist_worker_task = None
            logger.info("Stopped persistence worker")

+    async def _heartbeat_worker(self):
+        """Send heartbeat to Redis every 30s with circuit breaker for failures."""
+        from utils import detect_deployment_mode
+        import os
+
+        heartbeat_failures = 0
+        max_failures = 5  # Circuit breaker threshold
+
+        while True:
+            try:
+                # Get hostname/container name for friendly display
+                # Try HOSTNAME env var first (set by Docker Compose), then socket.gethostname()
+                import socket
+                hostname = os.getenv("HOSTNAME", socket.gethostname())
+
+                # Register this container
+                mode, containers = detect_deployment_mode()
+                container_info = {
+                    "id": self.container_id,
+                    "hostname": hostname,
+                    "last_seen": time.time(),
+                    "mode": mode,
+                    "failure_count": heartbeat_failures
+                }
+
+                # Set heartbeat with configured TTL
+                await self.redis.setex(
+                    f"monitor:heartbeat:{self.container_id}",
+                    self.ttl.heartbeat,
+                    json.dumps(container_info)
+                )
+
+                # Add to active containers set
+                await self.redis.sadd("monitor:active_containers", self.container_id)
+
+                # Reset failure counter on success
+                heartbeat_failures = 0
+
+                # Wait 30s before next heartbeat
+                await asyncio.sleep(30)
+
+            except asyncio.CancelledError:
+                break
+            except aioredis.ConnectionError as e:
+                heartbeat_failures += 1
+                logger.error(
+                    f"Heartbeat Redis connection error (attempt {heartbeat_failures}/{max_failures}): {e}"
+                )
+
+                if heartbeat_failures >= max_failures:
+                    # Circuit breaker - back off for longer
+                    logger.critical(
+                        f"Heartbeat circuit breaker triggered after {heartbeat_failures} failures. "
+                        f"Container will appear offline for 5 minutes."
+                    )
+                    await asyncio.sleep(300)  # 5 min backoff
+                    heartbeat_failures = 0
+                else:
+                    # Exponential backoff
+                    backoff = min(30 * (2 ** heartbeat_failures), 300)
+                    await asyncio.sleep(backoff)
+            except Exception as e:
+                logger.error(f"Unexpected heartbeat error: {e}", exc_info=True)
+                await asyncio.sleep(30)
+
+    def start_heartbeat(self):
+        """Start the heartbeat worker."""
+        if not self._heartbeat_task:
+            self._heartbeat_task = asyncio.create_task(self._heartbeat_worker())
+            logger.info("Started heartbeat worker")
+
+    async def stop_heartbeat(self):
+        """Stop the heartbeat worker and immediately deregister container."""
+        if self._heartbeat_task:
+            self._heartbeat_task.cancel()
+            try:
+                await self._heartbeat_task
+            except asyncio.CancelledError:
+                pass
+
+            # Immediate deregistration (no 60s wait)
+            try:
+                await self.redis.srem("monitor:active_containers", self.container_id)
+                await self.redis.delete(f"monitor:heartbeat:{self.container_id}")
+                logger.info(f"Container {self.container_id} immediately deregistered from monitoring")
+            except Exception as e:
+                logger.warning(f"Failed to deregister container on shutdown: {e}")
+
+            self._heartbeat_task = None
+            logger.info("Stopped heartbeat worker")
+
    async def cleanup(self):
        """Cleanup on shutdown - persist final stats and stop workers."""
        logger.info("Monitor cleanup starting...")
        try:
            # Persist final stats before shutdown
            await self._persist_endpoint_stats()
-            # Stop background worker
+            # Stop background workers
            await self.stop_persistence_worker()
+            await self.stop_heartbeat()
            logger.info("Monitor cleanup completed")
        except Exception as e:
            logger.error(f"Monitor cleanup error: {e}")

    async def load_from_redis(self):
-        """Load persisted stats from Redis."""
+        """Load persisted stats from Redis and start workers."""
        try:
            data = await self.redis.get("monitor:endpoint_stats")
            if data:
                self.endpoint_stats = json.loads(data)
                logger.info("Loaded endpoint stats from Redis")
+
+            # Start background workers
+            self.start_heartbeat()
+
        except Exception as e:
            logger.warning(f"Failed to load from Redis: {e}")

@@ -232,17 +496,28 @@ class MonitorStats:
        # Network I/O (delta since last call)
        net = psutil.net_io_counters()

-        # Pool status (acquire lock to prevent race conditions)
+        # Pool status (acquire lock with timeout to prevent race conditions)
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
-        async with LOCK:
-            # TODO: Track actual browser process memory instead of estimates
-            # These are conservative estimates based on typical Chromium usage
-            permanent_mem = 270 if PERMANENT else 0  # Estimate: ~270MB for permanent browser
-            hot_mem = len(HOT_POOL) * 180  # Estimate: ~180MB per hot pool browser
-            cold_mem = len(COLD_POOL) * 180  # Estimate: ~180MB per cold pool browser
-            permanent_active = PERMANENT is not None
-            hot_count = len(HOT_POOL)
-            cold_count = len(COLD_POOL)
+        try:
+            async with asyncio.timeout(2.0):
+                async with LOCK:
+                    # TODO: Track actual browser process memory instead of estimates
+                    # These are conservative estimates based on typical Chromium usage
+                    permanent_mem = 270 if PERMANENT else 0  # Estimate: ~270MB for permanent browser
+                    hot_mem = len(HOT_POOL) * 180  # Estimate: ~180MB per hot pool browser
+                    cold_mem = len(COLD_POOL) * 180  # Estimate: ~180MB per cold pool browser
+                    permanent_active = PERMANENT is not None
+                    hot_count = len(HOT_POOL)
+                    cold_count = len(COLD_POOL)
+        except asyncio.TimeoutError:
+            logger.warning("Lock acquisition timeout in get_health_summary, using defaults")
+            # Use safe defaults when lock times out
+            permanent_mem = 0
+            hot_mem = 0
+            cold_mem = 0
+            permanent_active = False
+            hot_count = 0
+            cold_count = 0

        return {
            "container": {
@@ -286,46 +561,52 @@ class MonitorStats:
        return requests

    async def get_browser_list(self) -> List[Dict]:
-        """Get detailed browser pool information."""
+        """Get detailed browser pool information with timeout protection."""
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG, LOCK

        browsers = []
        now = time.time()

-        # Acquire lock to prevent race conditions during iteration
-        async with LOCK:
-            if PERMANENT:
-                browsers.append({
-                    "type": "permanent",
-                    "sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
-                    "age_seconds": int(now - self.start_time),
-                    "last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
-                    "memory_mb": 270,
-                    "hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
-                    "killable": False
-                })
+        # Acquire lock with timeout to prevent deadlock
+        try:
+            async with asyncio.timeout(2.0):
+                async with LOCK:
+                    if PERMANENT:
+                        browsers.append({
+                            "type": "permanent",
+                            "sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
+                            "age_seconds": int(now - self.start_time),
+                            "last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
+                            "memory_mb": 270,
+                            "hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
+                            "killable": False
+                        })

-            for sig, crawler in HOT_POOL.items():
-                browsers.append({
-                    "type": "hot",
-                    "sig": sig[:8],
-                    "age_seconds": int(now - self.start_time),  # Approximation
-                    "last_used_seconds": int(now - LAST_USED.get(sig, now)),
-                    "memory_mb": 180,  # Estimate
-                    "hits": USAGE_COUNT.get(sig, 0),
-                    "killable": True
-                })
+                    for sig, crawler in HOT_POOL.items():
+                        browsers.append({
+                            "type": "hot",
+                            "sig": sig[:8],
+                            "age_seconds": int(now - self.start_time),  # Approximation
+                            "last_used_seconds": int(now - LAST_USED.get(sig, now)),
+                            "memory_mb": 180,  # Estimate
+                            "hits": USAGE_COUNT.get(sig, 0),
+                            "killable": True
+                        })

-            for sig, crawler in COLD_POOL.items():
-                browsers.append({
-                    "type": "cold",
-                    "sig": sig[:8],
-                    "age_seconds": int(now - self.start_time),
-                    "last_used_seconds": int(now - LAST_USED.get(sig, now)),
-                    "memory_mb": 180,
-                    "hits": USAGE_COUNT.get(sig, 0),
-                    "killable": True
-                })
+                    for sig, crawler in COLD_POOL.items():
+                        browsers.append({
+                            "type": "cold",
+                            "sig": sig[:8],
+                            "age_seconds": int(now - self.start_time),
+                            "last_used_seconds": int(now - LAST_USED.get(sig, now)),
+                            "memory_mb": 180,
+                            "hits": USAGE_COUNT.get(sig, 0),
+                            "killable": True
+                        })
+        except asyncio.TimeoutError:
+            logger.error("Browser list lock timeout - pool may be locked by janitor")
+            # Return empty list when lock times out to prevent blocking
+            return []

        return browsers

--- a/deploy/docker/monitor_routes.py
+++ b/deploy/docker/monitor_routes.py
@@ -3,14 +3,140 @@ from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
 from pydantic import BaseModel
 from typing import Optional
 from monitor import get_monitor
+from utils import detect_deployment_mode, get_container_id
 import logging
 import asyncio
 import json
+import re

 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/monitor", tags=["monitor"])


+# ========== Security & Validation ==========
+
+def validate_container_id(cid: str) -> bool:
+    """Validate container ID format to prevent Redis key injection.
+
+    Docker container IDs are 12-64 character hexadecimal strings.
+    Hostnames are alphanumeric with dashes and underscores.
+
+    Args:
+        cid: Container ID to validate
+
+    Returns:
+        True if valid, False otherwise
+    """
+    if not cid or not isinstance(cid, str):
+        return False
+
+    # Allow alphanumeric, dashes, and underscores only (1-64 chars)
+    # This prevents path traversal (../../), wildcards (**), and other injection attempts
+    return bool(re.match(r'^[a-zA-Z0-9_-]{1,64}$', cid))
+
+
+# ========== Redis Aggregation Helpers ==========
+
+async def _get_active_containers():
+    """Get list of active container IDs from Redis with validation."""
+    try:
+        monitor = get_monitor()
+        container_ids = await monitor.redis.smembers("monitor:active_containers")
+
+        # Decode and validate each container ID
+        validated = []
+        for cid in container_ids:
+            cid_str = cid.decode() if isinstance(cid, bytes) else cid
+
+            if validate_container_id(cid_str):
+                validated.append(cid_str)
+            else:
+                logger.warning(f"Invalid container ID format rejected: {cid_str}")
+
+        return validated
+    except Exception as e:
+        logger.error(f"Failed to get active containers: {e}")
+        return []
+
+
+async def _aggregate_active_requests():
+    """Aggregate active requests from all containers."""
+    container_ids = await _get_active_containers()
+    all_requests = []
+
+    monitor = get_monitor()
+    for container_id in container_ids:
+        try:
+            data = await monitor.redis.get(f"monitor:{container_id}:active_requests")
+            if data:
+                requests = json.loads(data)
+                all_requests.extend(requests)
+        except Exception as e:
+            logger.warning(f"Failed to get active requests from {container_id}: {e}")
+
+    return all_requests
+
+
+async def _aggregate_completed_requests(limit=100):
+    """Aggregate completed requests from all containers."""
+    container_ids = await _get_active_containers()
+    all_requests = []
+
+    monitor = get_monitor()
+    for container_id in container_ids:
+        try:
+            data = await monitor.redis.get(f"monitor:{container_id}:completed")
+            if data:
+                requests = json.loads(data)
+                all_requests.extend(requests)
+        except Exception as e:
+            logger.warning(f"Failed to get completed requests from {container_id}: {e}")
+
+    # Sort by end_time (most recent first) and limit
+    all_requests.sort(key=lambda x: x.get("end_time", 0), reverse=True)
+    return all_requests[:limit]
+
+
+async def _aggregate_janitor_events(limit=100):
+    """Aggregate janitor events from all containers."""
+    container_ids = await _get_active_containers()
+    all_events = []
+
+    monitor = get_monitor()
+    for container_id in container_ids:
+        try:
+            data = await monitor.redis.get(f"monitor:{container_id}:janitor")
+            if data:
+                events = json.loads(data)
+                all_events.extend(events)
+        except Exception as e:
+            logger.warning(f"Failed to get janitor events from {container_id}: {e}")
+
+    # Sort by timestamp (most recent first) and limit
+    all_events.sort(key=lambda x: x.get("timestamp", 0), reverse=True)
+    return all_events[:limit]
+
+
+async def _aggregate_errors(limit=100):
+    """Aggregate errors from all containers."""
+    container_ids = await _get_active_containers()
+    all_errors = []
+
+    monitor = get_monitor()
+    for container_id in container_ids:
+        try:
+            data = await monitor.redis.get(f"monitor:{container_id}:errors")
+            if data:
+                errors = json.loads(data)
+                all_errors.extend(errors)
+        except Exception as e:
+            logger.warning(f"Failed to get errors from {container_id}: {e}")
+
+    # Sort by timestamp (most recent first) and limit
+    all_errors.sort(key=lambda x: x.get("timestamp", 0), reverse=True)
+    return all_errors[:limit]
+
+
@router.get("/health")
 async def get_health():
    """Get current system health snapshot."""
@@ -37,18 +163,23 @@ async def get_requests(status: str = "all", limit: int = 50):
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
-        monitor = get_monitor()
+        # Aggregate from all containers via Redis
+        active_requests = await _aggregate_active_requests()
+        completed_requests = await _aggregate_completed_requests(limit)
+
+        # Filter by status if needed
+        if status in ["success", "error"]:
+            is_success = (status == "success")
+            completed_requests = [r for r in completed_requests if r.get("success") == is_success]

        if status == "active":
-            return {"active": monitor.get_active_requests(), "completed": []}
+            return {"active": active_requests, "completed": []}
        elif status == "completed":
-            return {"active": [], "completed": monitor.get_completed_requests(limit)}
-        elif status in ["success", "error"]:
-            return {"active": [], "completed": monitor.get_completed_requests(limit, status)}
-        else:  # "all"
+            return {"active": [], "completed": completed_requests}
+        else:  # "all" or success/error
            return {
-                "active": monitor.get_active_requests(),
-                "completed": monitor.get_completed_requests(limit)
+                "active": active_requests,
+                "completed": completed_requests
            }
    except Exception as e:
        logger.error(f"Error getting requests: {e}")
@@ -60,8 +191,13 @@ async def get_browsers():
    """Get detailed browser pool information."""
    try:
        monitor = get_monitor()
+        container_id = get_container_id()
        browsers = await monitor.get_browser_list()

+        # Add container_id to each browser
+        for browser in browsers:
+            browser["container_id"] = container_id
+
        # Calculate summary stats
        total_browsers = len(browsers)
        total_memory = sum(b["memory_mb"] for b in browsers)
@@ -77,7 +213,8 @@ async def get_browsers():
                "total_count": total_browsers,
                "total_memory_mb": total_memory,
                "reuse_rate_percent": round(reuse_rate, 1)
-            }
+            },
+            "container_id": container_id
        }
    except Exception as e:
        logger.error(f"Error getting browsers: {e}")
@@ -125,8 +262,9 @@ async def get_janitor_log(limit: int = 100):
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
-        monitor = get_monitor()
-        return {"events": monitor.get_janitor_log(limit)}
+        # Aggregate from all containers via Redis
+        events = await _aggregate_janitor_events(limit)
+        return {"events": events}
    except Exception as e:
        logger.error(f"Error getting janitor log: {e}")
        raise HTTPException(500, str(e))
@@ -140,8 +278,9 @@ async def get_errors_log(limit: int = 100):
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
-        monitor = get_monitor()
-        return {"errors": monitor.get_errors_log(limit)}
+        # Aggregate from all containers via Redis
+        errors = await _aggregate_errors(limit)
+        return {"errors": errors}
    except Exception as e:
        logger.error(f"Error getting errors log: {e}")
        raise HTTPException(500, str(e))
@@ -350,15 +489,57 @@ async def reset_stats():
        raise HTTPException(500, str(e))


+@router.get("/containers")
+async def get_containers():
+    """Get container deployment info from Redis heartbeats."""
+    try:
+        monitor = get_monitor()
+        container_ids = await _get_active_containers()
+
+        containers = []
+        for cid in container_ids:
+            try:
+                # Get heartbeat data
+                data = await monitor.redis.get(f"monitor:heartbeat:{cid}")
+                if data:
+                    info = json.loads(data)
+                    containers.append({
+                        "id": info.get("id", cid),
+                        "hostname": info.get("hostname", cid),
+                        "healthy": True  # If heartbeat exists, it's healthy
+                    })
+            except Exception as e:
+                logger.warning(f"Failed to get heartbeat for {cid}: {e}")
+
+        # Determine mode
+        mode = "single" if len(containers) == 1 else "compose"
+        if len(containers) > 1:
+            # Check if any hostname has swarm pattern (service.slot.task_id)
+            if any("." in c["hostname"] and len(c["hostname"].split(".")) > 2 for c in containers):
+                mode = "swarm"
+
+        return {
+            "mode": mode,
+            "container_id": get_container_id(),
+            "containers": containers,
+            "count": len(containers)
+        }
+    except Exception as e:
+        logger.error(f"Error getting containers: {e}")
+        raise HTTPException(500, str(e))
+
+
@router.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time monitoring updates.

-    Sends updates every 2 seconds with:
-    - Health stats
-    - Active/completed requests
-    - Browser pool status
-    - Timeline data
+    Sends aggregated updates every 2 seconds from all containers with:
+    - Health stats (local container)
+    - Active/completed requests (aggregated from all containers)
+    - Browser pool status (local container only - not in Redis)
+    - Timeline data (local container - TODO: aggregate from Redis)
+    - Janitor events (aggregated from all containers)
+    - Errors (aggregated from all containers)
    """
    await websocket.accept()
    logger.info("WebSocket client connected")
@@ -366,24 +547,46 @@ async def websocket_endpoint(websocket: WebSocket):
    try:
        while True:
            try:
-                # Gather all monitoring data
+                # Gather aggregated monitoring data from Redis
                monitor = get_monitor()
+                container_id = get_container_id()
+
+                # Get container info
+                containers_info = await get_containers()
+
+                # AGGREGATE data from all containers via Redis
+                active_reqs = await _aggregate_active_requests()
+                completed_reqs = await _aggregate_completed_requests(limit=10)
+                janitor_events = await _aggregate_janitor_events(limit=10)
+                errors_log = await _aggregate_errors(limit=10)
+
+                # Local container data (not aggregated)
+                local_health = await monitor.get_health_summary()
+                browsers = await monitor.get_browser_list()  # Browser list is local only
+
+                # Add container_id to browsers (they're local)
+                for browser in browsers:
+                    browser["container_id"] = container_id

                data = {
                    "timestamp": asyncio.get_event_loop().time(),
-                    "health": await monitor.get_health_summary(),
+                    "container_id": container_id,  # This container handling the WebSocket
+                    "is_aggregated": True,  # Flag to indicate aggregated data
+                    "local_health": local_health,  # This container's health
+                    "containers": containers_info.get("containers", []),  # All containers
                    "requests": {
-                        "active": monitor.get_active_requests(),
-                        "completed": monitor.get_completed_requests(limit=10)
+                        "active": active_reqs,  # Aggregated from all containers
+                        "completed": completed_reqs  # Aggregated from all containers
                    },
-                    "browsers": await monitor.get_browser_list(),
+                    "browsers": browsers,  # Local only (not in Redis)
                    "timeline": {
+                        # TODO: Aggregate timeline from Redis (currently local only)
                        "memory": monitor.get_timeline_data("memory", "5m"),
                        "requests": monitor.get_timeline_data("requests", "5m"),
                        "browsers": monitor.get_timeline_data("browsers", "5m")
                    },
-                    "janitor": monitor.get_janitor_log(limit=10),
-                    "errors": monitor.get_errors_log(limit=10)
+                    "janitor": janitor_events,  # Aggregated from all containers
+                    "errors": errors_log  # Aggregated from all containers
                }

                # Send update to client
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -200,7 +200,11 @@ async def root():
    return RedirectResponse("/playground")

 # ─────────────────── infra / middleware  ─────────────────────
-redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
+# Build Redis URL from environment or config
+redis_host = os.getenv("REDIS_HOST", config["redis"].get("host", "localhost"))
+redis_port = os.getenv("REDIS_PORT", config["redis"].get("port", 6379))
+redis_url = config["redis"].get("uri") or f"redis://{redis_host}:{redis_port}"
+redis = aioredis.from_url(redis_url)

 limiter = Limiter(
    key_func=get_remote_address,
--- a/deploy/docker/static/monitor/index.html
+++ b/deploy/docker/static/monitor/index.html
@@ -116,74 +116,107 @@

    <!-- Main Content -->
    <main class="flex-1 overflow-auto p-4 space-y-4">
-        <!-- System Health Bar -->
-        <section class="bg-surface rounded-lg border border-border p-4">
-            <h2 class="text-sm font-medium mb-3 text-primary">System Health</h2>
+        <!-- System Health & Infrastructure (side by side) -->
+        <div class="grid grid-cols-2 gap-4">
+            <!-- System Health -->
+            <section class="bg-surface rounded-lg border border-border p-3">
+                <h2 class="text-sm font-medium mb-2 text-primary">System Health</h2>

-            <div class="grid grid-cols-4 gap-4 mb-4">
-                <!-- CPU -->
-                <div>
-                    <div class="flex justify-between text-xs mb-1">
-                        <span class="text-secondary">CPU</span>
-                        <span id="cpu-percent" class="text-light">--%</span>
+                <!-- Row 1: CPU and Memory -->
+                <div class="grid grid-cols-2 gap-3 mb-2">
+                    <!-- CPU -->
+                    <div>
+                        <div class="flex justify-between text-xs mb-1">
+                            <span class="text-secondary">CPU</span>
+                            <span id="cpu-percent" class="text-light">--%</span>
+                        </div>
+                        <div class="w-full bg-dark rounded-full h-2">
+                            <div id="cpu-bar" class="progress-bar h-2 rounded-full bg-primary" style="width: 0%"></div>
+                        </div>
                    </div>
-                    <div class="w-full bg-dark rounded-full h-2">
-                        <div id="cpu-bar" class="progress-bar h-2 rounded-full bg-primary" style="width: 0%"></div>
+
+                    <!-- Memory -->
+                    <div>
+                        <div class="flex justify-between text-xs mb-1">
+                            <span class="text-secondary">Memory</span>
+                            <span id="mem-percent" class="text-light">--%</span>
+                        </div>
+                        <div class="w-full bg-dark rounded-full h-2">
+                            <div id="mem-bar" class="progress-bar h-2 rounded-full bg-accent" style="width: 0%"></div>
+                        </div>
                    </div>
                </div>

-                <!-- Memory -->
-                <div>
-                    <div class="flex justify-between text-xs mb-1">
-                        <span class="text-secondary">Memory</span>
-                        <span id="mem-percent" class="text-light">--%</span>
+                <!-- Row 2: Network and Uptime -->
+                <div class="grid grid-cols-2 gap-3 mb-2">
+                    <!-- Network -->
+                    <div>
+                        <div class="flex justify-between text-xs mb-1">
+                            <span class="text-secondary">Network</span>
+                            <span id="net-io" class="text-light">--</span>
+                        </div>
+                        <div class="text-xs text-secondary">⬆<span id="net-sent">0</span> / ⬇<span id="net-recv">0</span> MB</div>
                    </div>
-                    <div class="w-full bg-dark rounded-full h-2">
-                        <div id="mem-bar" class="progress-bar h-2 rounded-full bg-accent" style="width: 0%"></div>
+
+                    <!-- Uptime -->
+                    <div>
+                        <div class="flex justify-between text-xs mb-1">
+                            <span class="text-secondary">Uptime</span>
+                            <span id="uptime" class="text-light">--</span>
+                        </div>
+                        <div class="text-xs text-secondary" id="last-update">Live: --:--:--</div>
                    </div>
                </div>

-                <!-- Network -->
-                <div>
-                    <div class="flex justify-between text-xs mb-1">
-                        <span class="text-secondary">Network</span>
-                        <span id="net-io" class="text-light">--</span>
+                <!-- Pool Status -->
+                <div class="border-t border-border pt-2">
+                    <div class="grid grid-cols-3 gap-3 text-xs">
+                        <div>
+                            <span class="text-secondary">🔥 Permanent:</span>
+                            <span id="pool-perm" class="text-primary ml-1">INACTIVE (0MB)</span>
+                        </div>
+                        <div>
+                            <span class="text-secondary">♨️ Hot:</span>
+                            <span id="pool-hot" class="text-accent ml-1">0 (0MB)</span>
+                        </div>
+                        <div>
+                            <span class="text-secondary">❄️ Cold:</span>
+                            <span id="pool-cold" class="text-light ml-1">0 (0MB)</span>
+                        </div>
+                    </div>
+                    <div class="mt-1 text-xs text-secondary">
+                        <span>Janitor: </span><span id="janitor-status">adaptive</span> |
+                        <span>Memory pressure: </span><span id="mem-pressure">LOW</span>
                    </div>
-                    <div class="text-xs text-secondary">⬆<span id="net-sent">0</span> MB / ⬇<span id="net-recv">0</span> MB</div>
                </div>
+            </section>

-                <!-- Uptime -->
-                <div>
-                    <div class="flex justify-between text-xs mb-1">
-                        <span class="text-secondary">Uptime</span>
-                        <span id="uptime" class="text-light">--</span>
-                    </div>
-                    <div class="text-xs text-secondary" id="last-update">Updated: never</div>
+            <!-- Infrastructure Section -->
+            <section id="containers-section" class="bg-surface rounded-lg border border-border p-3" style="display: none;">
+            <div class="flex items-center justify-between mb-3">
+                <h2 class="text-sm font-medium text-primary">📦 Infrastructure</h2>
+                <div class="flex items-center space-x-2">
+                    <span class="text-xs text-secondary">Mode:</span>
+                    <span id="deployment-mode" class="text-xs text-primary font-medium">single</span>
+                    <span class="text-xs text-secondary">|</span>
+                    <span class="text-xs text-secondary">Containers:</span>
+                    <span id="container-count" class="text-xs text-accent font-medium">1</span>
                </div>
            </div>

-            <!-- Pool Status -->
-            <div class="border-t border-border pt-3">
-                <div class="grid grid-cols-3 gap-4 text-xs">
-                    <div>
-                        <span class="text-secondary">🔥 Permanent:</span>
-                        <span id="pool-perm" class="text-primary ml-2">INACTIVE (0MB)</span>
-                    </div>
-                    <div>
-                        <span class="text-secondary">♨️ Hot:</span>
-                        <span id="pool-hot" class="text-accent ml-2">0 (0MB)</span>
-                    </div>
-                    <div>
-                        <span class="text-secondary">❄️ Cold:</span>
-                        <span id="pool-cold" class="text-light ml-2">0 (0MB)</span>
-                    </div>
-                </div>
-                <div class="mt-2 text-xs text-secondary">
-                    <span>Janitor: </span><span id="janitor-status">adaptive</span> |
-                    <span>Memory pressure: </span><span id="mem-pressure">LOW</span>
-                </div>
+            <!-- Container Filter Buttons -->
+            <div id="container-filters" class="flex flex-wrap gap-2 mb-3">
+                <button class="container-filter-btn px-3 py-1 rounded text-xs bg-primary text-dark font-medium" data-container="all">
+                    All
+                </button>
+            </div>
+
+            <!-- Container Grid -->
+            <div id="containers-grid" class="grid grid-cols-3 gap-3 text-xs">
+                <!-- Containers will be populated here -->
            </div>
        </section>
+        </div>

        <!-- Live Activity Grid (2x2) -->
        <div class="grid grid-cols-2 gap-4">
@@ -223,11 +256,12 @@
                                <th class="py-1 pr-2">Age</th>
                                <th class="py-1 pr-2">Used</th>
                                <th class="py-1 pr-2">Hits</th>
+                                <th class="py-1 pr-2">Container</th>
                                <th class="py-1">Act</th>
                            </tr>
                        </thead>
                        <tbody id="browsers-table-body">
-                            <tr><td colspan="6" class="text-center py-4 text-secondary">No browsers</td></tr>
+                            <tr><td colspan="7" class="text-center py-4 text-secondary">No browsers</td></tr>
                        </tbody>
                    </table>
                </div>
@@ -356,6 +390,16 @@
        }

        function connectWebSocket() {
+            // Clean up existing connection first to prevent resource leaks
+            if (websocket) {
+                try {
+                    websocket.close();
+                } catch (e) {
+                    console.error('Error closing old WebSocket:', e);
+                }
+                websocket = null;
+            }
+
            if (wsReconnectAttempts >= MAX_WS_RECONNECT) {
                console.log('Max WebSocket reconnect attempts reached, falling back to polling');
                useWebSocket = false;
@@ -370,9 +414,24 @@
            const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsUrl = `${protocol}//${window.location.host}/monitor/ws`;

-            websocket = new WebSocket(wsUrl);
+            try {
+                websocket = new WebSocket(wsUrl);
+            } catch (e) {
+                console.error('Failed to create WebSocket:', e);
+                setTimeout(() => connectWebSocket(), 2000 * wsReconnectAttempts);
+                return;
+            }
+
+            // Set connection timeout to prevent indefinite connection attempts
+            const connectionTimeout = setTimeout(() => {
+                if (websocket && websocket.readyState === WebSocket.CONNECTING) {
+                    console.log('WebSocket connection timeout');
+                    websocket.close();
+                }
+            }, 5000);

            websocket.onopen = () => {
+                clearTimeout(connectionTimeout);
                console.log('WebSocket connected');
                wsReconnectAttempts = 0;
                updateConnectionStatus('connected');
@@ -385,15 +444,19 @@
            };

            websocket.onerror = (error) => {
+                clearTimeout(connectionTimeout);
                console.error('WebSocket error:', error);
            };

-            websocket.onclose = () => {
-                console.log('WebSocket closed');
+            websocket.onclose = (event) => {
+                clearTimeout(connectionTimeout);
+                console.log(`WebSocket closed: code=${event.code}, reason=${event.reason}`);
                updateConnectionStatus('disconnected', 'Reconnecting...');

-                if (useWebSocket) {
-                    setTimeout(connectWebSocket, 2000 * wsReconnectAttempts);
+                websocket = null;  // Clear reference
+
+                if (useWebSocket && wsReconnectAttempts < MAX_WS_RECONNECT) {
+                    setTimeout(() => connectWebSocket(), 2000 * wsReconnectAttempts);
                } else {
                    startAutoRefresh();
                }
@@ -459,18 +522,28 @@
        }

        function updateRequestsDisplay(requests) {
+            // Filter requests based on current container filter
+            const filteredActive = currentContainerFilter === 'all'
+                ? requests.active
+                : requests.active.filter(r => r.container_id === currentContainerFilter);
+
+            const filteredCompleted = currentContainerFilter === 'all'
+                ? requests.completed
+                : requests.completed.filter(r => r.container_id === currentContainerFilter);
+
            // Update active requests count
            const activeCount = document.getElementById('active-count');
-            if (activeCount) activeCount.textContent = requests.active.length;
+            if (activeCount) activeCount.textContent = filteredActive.length;

            // Update active requests list
            const activeList = document.getElementById('active-requests-list');
            if (activeList) {
-                if (requests.active.length === 0) {
+                if (filteredActive.length === 0) {
                    activeList.innerHTML = '<div class="text-secondary text-center py-2">No active requests</div>';
                } else {
-                    activeList.innerHTML = requests.active.map(req => `
+                    activeList.innerHTML = filteredActive.map(req => `
                        <div class="flex items-center justify-between p-2 bg-dark rounded border border-border">
+                            <span class="text-accent text-xs">${getContainerLabel(req.container_id)}</span>
                            <span class="text-primary">${req.id.substring(0, 8)}</span>
                            <span class="text-secondary">${req.endpoint}</span>
                            <span class="text-light truncate max-w-[200px]" title="${req.url}">${req.url}</span>
@@ -484,11 +557,12 @@
            // Update completed requests
            const completedList = document.getElementById('completed-requests-list');
            if (completedList) {
-                if (requests.completed.length === 0) {
+                if (filteredCompleted.length === 0) {
                    completedList.innerHTML = '<div class="text-secondary text-center py-2">No completed requests</div>';
                } else {
-                    completedList.innerHTML = requests.completed.map(req => `
+                    completedList.innerHTML = filteredCompleted.map(req => `
                        <div class="flex items-center gap-3 p-2 bg-dark rounded">
+                            <span class="text-accent text-xs w-12 flex-shrink-0">${getContainerLabel(req.container_id)}</span>
                            <span class="text-secondary w-16 flex-shrink-0">${req.id.substring(0, 8)}</span>
                            <span class="text-secondary w-16 flex-shrink-0">${req.endpoint}</span>
                            <span class="text-light truncate flex-1" title="${req.url}">${req.url}</span>
@@ -511,6 +585,14 @@
                        const typeIcon = b.type === 'permanent' ? '🔥' : b.type === 'hot' ? '♨️' : '❄️';
                        const typeColor = b.type === 'permanent' ? 'text-primary' : b.type === 'hot' ? 'text-accent' : 'text-light';

+                        // Check if should display based on filter
+                        const shouldDisplay = currentContainerFilter === 'all' ||
+                                            b.container_id === currentContainerFilter;
+                        if (!shouldDisplay) return '';
+
+                        // Find container label (C-1, C-2, etc)
+                        const containerLabel = getContainerLabel(b.container_id);
+
                        return `
                            <tr class="border-t border-border hover:bg-dark">
                                <td class="py-1 pr-2"><span class="${typeColor}">${typeIcon} ${b.type}</span></td>
@@ -518,6 +600,7 @@
                                <td class="py-1 pr-2">${formatSeconds(b.age_seconds || 0)}</td>
                                <td class="py-1 pr-2">${formatSeconds(b.last_used_seconds || 0)}</td>
                                <td class="py-1 pr-2">${b.hits}</td>
+                                <td class="py-1 pr-2 text-accent text-xs">${containerLabel}</td>
                                <td class="py-1">
                                    ${b.killable ? `
                                        <button onclick="killBrowser('${b.sig}')" class="text-red-500 hover:underline text-xs">X</button>
@@ -553,16 +636,23 @@
        function updateJanitorDisplay(events) {
            const janitorLog = document.getElementById('janitor-log');
            if (janitorLog) {
-                if (events.length === 0) {
+                // Filter events based on current container filter
+                const filtered = currentContainerFilter === 'all'
+                    ? events
+                    : events.filter(e => e.container_id === currentContainerFilter);
+
+                if (filtered.length === 0) {
                    janitorLog.innerHTML = '<div class="text-secondary text-center py-4">No events yet</div>';
                } else {
-                    janitorLog.innerHTML = events.slice(0, 10).reverse().map(evt => {
+                    janitorLog.innerHTML = filtered.slice(0, 10).reverse().map(evt => {
                        const time = new Date(evt.timestamp * 1000).toLocaleTimeString();
                        const icon = evt.type === 'close_cold' ? '🧹❄️' : evt.type === 'close_hot' ? '🧹♨️' : '⬆️';
                        const details = JSON.stringify(evt.details);
+                        const containerLabel = getContainerLabel(evt.container_id);

                        return `<div class="p-2 bg-dark rounded">
-                            <span class="text-secondary">${time}</span>
+                            <span class="text-accent text-xs">${containerLabel}</span>
+                            <span class="text-secondary ml-2">${time}</span>
                            <span>${icon}</span>
                            <span class="text-primary">${evt.type}</span>
                            <span class="text-secondary">sig=${evt.sig}</span>
@@ -1059,10 +1149,90 @@
            return `${m}m ${s}s`;
        }

+        // ========== Containers Management ==========
+        let currentContainerFilter = 'all';
+        let containerMapping = {}; // Maps container_id to label (C-1, C-2, etc)
+
+        // Helper to get container label from ID or hostname
+        function getContainerLabel(containerId) {
+            // Try direct lookup first (works for both hostname and id)
+            if (containerMapping[containerId]) {
+                return containerMapping[containerId];
+            }
+            // Fallback: show first 8 chars of container ID
+            return containerId?.substring(0, 8) || 'unknown';
+        }
+
+        async function fetchContainers() {
+            try {
+                const res = await fetch('/monitor/containers');
+                const data = await res.json();
+
+                document.getElementById('deployment-mode').textContent = data.mode;
+                document.getElementById('container-count').textContent = data.count;
+
+                // Build container ID to label mapping
+                // Use hostname as primary key (friendly name like "crawl4ai-1")
+                // Also map id for backwards compatibility
+                containerMapping = {};
+                data.containers.forEach((c, i) => {
+                    const label = `C-${i+1}`;
+                    containerMapping[c.hostname] = label;  // Map hostname
+                    containerMapping[c.id] = label;  // Also map id
+                });
+
+                // Show section only if multi-container
+                const section = document.getElementById('containers-section');
+                if (data.count > 1) {
+                    section.style.display = 'block';
+
+                    // Update filter buttons
+                    const filtersDiv = document.getElementById('container-filters');
+                    filtersDiv.innerHTML = `
+                        <button class="container-filter-btn px-3 py-1 rounded text-xs ${currentContainerFilter === 'all' ? 'bg-primary text-dark' : 'bg-dark text-secondary'} font-medium" data-container="all">All</button>
+                        ${data.containers.map((c, i) => `
+                            <button class="container-filter-btn px-3 py-1 rounded text-xs ${currentContainerFilter === c.id ? 'bg-primary text-dark' : 'bg-dark text-secondary'}" data-container="${c.id}">C-${i+1}</button>
+                        `).join('')}
+                    `;
+
+                    // Add click handlers to filter buttons
+                    document.querySelectorAll('.container-filter-btn').forEach(btn => {
+                        btn.addEventListener('click', () => {
+                            currentContainerFilter = btn.dataset.container;
+                            fetchContainers(); // Refresh to update button styles
+                            // Re-fetch all data with filter applied
+                            fetchRequests();
+                            fetchBrowsers();
+                            fetchJanitorLogs();
+                            fetchErrorLogs();
+                        });
+                    });
+
+                    // Update containers grid
+                    const grid = document.getElementById('containers-grid');
+                    grid.innerHTML = data.containers.map((c, i) => `
+                        <div class="p-3 bg-dark rounded border ${currentContainerFilter === c.id || currentContainerFilter === 'all' ? 'border-primary' : 'border-border'}">
+                            <div class="flex items-center justify-between mb-2">
+                                <span class="text-primary font-medium">C-${i+1}</span>
+                                <span class="text-xs ${c.healthy ? 'text-accent' : 'text-red-500'}">${c.healthy ? '🟢' : '🔴'}</span>
+                            </div>
+                            <div class="text-xs text-secondary truncate" title="${c.hostname}">${c.hostname}</div>
+                        </div>
+                    `).join('');
+                } else {
+                    section.style.display = 'none';
+                }
+            } catch (e) {
+                console.error('Failed to fetch containers:', e);
+            }
+        }
+
        // ========== Filter change handler ==========
        document.getElementById('filter-requests')?.addEventListener('change', fetchRequests);

        // ========== Initialize ==========
+        // Fetch containers info on load
+        fetchContainers();
        // Try WebSocket first, fallback to polling on failure
        connectWebSocket();
    </script>
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -203,4 +203,51 @@ def get_container_memory_percent() -> float:
    except:
        # Non-container or unsupported: fallback to host
        import psutil
-        return psutil.virtual_memory().percent
+        return psutil.virtual_memory().percent
+
+
+def get_container_id() -> str:
+    """Get current container ID (hostname in Docker)."""
+    import socket
+    return socket.gethostname()
+
+
+def detect_deployment_mode() -> tuple[str, list[dict]]:
+    """Detect if running in single/swarm/compose mode and get container list.
+
+    Returns:
+        (mode, containers) where mode is "single"|"swarm"|"compose"
+        containers is list of {id, hostname, healthy}
+    """
+    import socket
+    my_hostname = socket.gethostname()
+
+    # Check if we're behind nginx (Compose mode indicator)
+    # In Compose, service name resolves to multiple IPs
+    try:
+        import socket as sock
+        # Try to resolve "crawl4ai" service name (Compose service)
+        try:
+            addrs = sock.getaddrinfo("crawl4ai", None)
+            unique_ips = set(addr[4][0] for addr in addrs)
+            if len(unique_ips) > 1:
+                # Multiple IPs = Compose with replicas
+                containers = [
+                    {"id": f"container-{i+1}", "hostname": f"crawl4ai-{i+1}", "healthy": True}
+                    for i in range(len(unique_ips))
+                ]
+                return "compose", containers
+        except:
+            pass
+
+        # Check for Swarm mode (TODO: needs swarm-specific detection)
+        # For now, if hostname pattern matches swarm, detect it
+        if "." in my_hostname and len(my_hostname.split(".")) > 2:
+            # Swarm hostname format: service.slot.task_id
+            return "swarm", [{"id": my_hostname, "hostname": my_hostname, "healthy": True}]
+
+    except:
+        pass
+
+    # Default: single container
+    return "single", [{"id": my_hostname, "hostname": my_hostname, "healthy": True}]
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,43 +1,18 @@
 version: '3.8'

-# Shared configuration for all environments
-x-base-config: &base-config
-  ports:
-    - "11235:11235"  # Gunicorn port
-  env_file:
-    - .llm.env       # API keys (create from .llm.env.example)
-  environment:
-    - OPENAI_API_KEY=${OPENAI_API_KEY:-}
-    - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
-    - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
-    - GROQ_API_KEY=${GROQ_API_KEY:-}
-    - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
-    - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
-    - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
-    - LLM_PROVIDER=${LLM_PROVIDER:-}  # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
-  volumes:
-    - /dev/shm:/dev/shm  # Chromium performance
-  deploy:
-    resources:
-      limits:
-        memory: 4G
-      reservations:
-        memory: 1G
-  restart: unless-stopped
-  healthcheck:
-    test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
-    interval: 30s
-    timeout: 10s
-    retries: 3
-    start_period: 40s
-  user: "appuser"
-
 services:
+  redis:
+    image: redis:alpine
+    command: redis-server --appendonly yes
+    volumes:
+      - redis_data:/data
+    networks:
+      - crawl4ai_net
+    restart: unless-stopped
+
  crawl4ai:
-    # 1. Default: Pull multi-platform test image from Docker Hub
-    # 2. Override with local image via: IMAGE=local-test docker compose up
    image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}}
-    
+
    # Local build config (used with --build)
    build:
      context: .
@@ -45,6 +20,58 @@ services:
      args:
        INSTALL_TYPE: ${INSTALL_TYPE:-default}
        ENABLE_GPU: ${ENABLE_GPU:-false}
-    
-    # Inherit shared config
-    <<: *base-config
+
+    # No ports exposed - access via nginx only
+    env_file:
+      - .llm.env
+    environment:
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+      - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+      - GROQ_API_KEY=${GROQ_API_KEY:-}
+      - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
+      - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
+      - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
+      - LLM_PROVIDER=${LLM_PROVIDER:-}
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
+    volumes:
+      - /dev/shm:/dev/shm  # Chromium performance
+    deploy:
+      replicas: 3  # Default to 3 replicas (can override with --scale)
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 1G
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+    user: "appuser"
+    depends_on:
+      - redis
+    networks:
+      - crawl4ai_net
+
+  nginx:
+    image: nginx:alpine
+    ports:
+      - "11235:80"  # Expose port 11235 to host
+    volumes:
+      - ./crawl4ai/templates/nginx.conf.template:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - crawl4ai
+    networks:
+      - crawl4ai_net
+    restart: unless-stopped
+
+networks:
+  crawl4ai_net:
+    driver: bridge
+
+volumes:
+  redis_data: