feat(docker): add multi-container cluster deployment with CLI management

Add comprehensive Docker cluster orchestration with horizontal scaling support.

CLI Commands:
- crwl server start/stop/restart/status/scale/logs
- Auto-detection: Single (N=1) → Swarm (N>1) → Compose (N>1 fallback)
- Support for 1-100 container replicas with zero-downtime scaling

Infrastructure:
- Nginx load balancing (round-robin for API requests, sticky sessions for monitoring)
- Redis-based container discovery via heartbeats (30s interval)
- Real-time monitoring dashboard with cluster-wide visibility
- WebSocket aggregation from all containers

Security & Stability Fixes (12 critical issues):
- Add timeout protection to browser pool locks (prevent deadlocks)
- Implement Redis retry logic with exponential backoff
- Add container ID validation (prevent Redis key injection)
- Add CLI input sanitization (prevent shell injection)
- Add file locking for state management (prevent corruption)
- Fix WebSocket resource leaks and connection cleanup
- Add graceful degradation and circuit breakers

Configuration:
- RedisTTLConfig dataclass with environment variable support
- Template-based docker-compose.yml and nginx.conf generation
- Comprehensive error handling with actionable messages

Documentation:
- AGENT.md: Complete DevOps context for AI assistants
- MULTI_CONTAINER_ARCHITECTURE.md: Technical architecture guide
- Reorganized docs into deploy/docker/docs/
This commit is contained in:
unclecode
2025-10-19 13:31:14 +08:00
parent 73a5a7b0f5
commit 91f7b9d129
18 changed files with 5116 additions and 196 deletions

View File

@@ -625,6 +625,11 @@ def cli():
pass
# Register server command group (Docker orchestration)
from crawl4ai.server_cli import server_cmd
cli.add_command(server_cmd)
@cli.group("browser")
def browser_cmd():
"""Manage browser instances for Crawl4AI

420
crawl4ai/server_cli.py Normal file
View File

@@ -0,0 +1,420 @@
"""
Crawl4AI Server CLI Commands
Provides `crwl server` command group for Docker orchestration.
"""
import click
import anyio
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Confirm
from crawl4ai.server_manager import ServerManager
console = Console()
# Root of the `crwl server ...` command tree. The individual subcommands attach
# themselves to this group through `@server_cmd.command(...)`.
# NOTE: the docstring below is what click prints for `crwl server --help`,
# so treat its wording as user-facing text.
@click.group("server")
def server_cmd():
    """Manage Crawl4AI Docker server instances

    One-command deployment with automatic scaling:
    - Single container for development (N=1)
    - Docker Swarm for production with built-in load balancing (N>1)
    - Docker Compose + Nginx as fallback (N>1)

    Examples:
    crwl server start # Single container on port 11235
    crwl server start --replicas 3 # Auto-detect Swarm or Compose
    crwl server start -r 5 --port 8080 # 5 replicas on custom port
    crwl server status # Check current deployment
    crwl server scale 10 # Scale to 10 replicas
    crwl server stop # Stop and cleanup
    """
    pass
@server_cmd.command("start")
@click.option(
    "--replicas", "-r",
    type=int,
    default=1,
    help="Number of container replicas (default: 1)"
)
@click.option(
    "--mode",
    type=click.Choice(["auto", "single", "swarm", "compose"]),
    default="auto",
    help="Deployment mode (default: auto-detect)"
)
@click.option(
    "--port", "-p",
    type=int,
    default=11235,
    help="External port to expose (default: 11235)"
)
@click.option(
    "--env-file",
    type=click.Path(exists=True),
    help="Path to environment file"
)
@click.option(
    "--image",
    default="unclecode/crawl4ai:latest",
    help="Docker image to use (default: unclecode/crawl4ai:latest)"
)
def start_cmd(replicas: int, mode: str, port: int, env_file: str, image: str):
    """Start Crawl4AI server with automatic orchestration.

    Deployment modes:
    - auto: Automatically choose best mode (default)
    - single: Single container (N=1 only)
    - swarm: Docker Swarm with built-in load balancing
    - compose: Docker Compose + Nginx reverse proxy

    The server will:
    1. Check if Docker is running
    2. Validate port availability
    3. Pull image if needed
    4. Start container(s) with health checks
    5. Save state for management

    Examples:
    # Development: single container
    crwl server start
    # Production: 5 replicas with Swarm
    crwl server start --replicas 5
    # Custom configuration
    crwl server start -r 3 --port 8080 --env-file .env.prod
    """
    # Implementation notes (kept out of the docstring because click shows it as help):
    # - Delegates all Docker work to ServerManager.start and only renders the result.
    # - Exits with status 1 when the manager reports failure so that scripts and
    #   CI pipelines can detect it.
    # - Free-form text (image name, manager messages) is escaped before being
    #   embedded in Rich markup; otherwise square brackets in it would be parsed
    #   as style tags and could garble the output or raise MarkupError.
    from rich.markup import escape

    manager = ServerManager()

    console.print(Panel(
        f"[cyan]Starting Crawl4AI Server[/cyan]\n\n"
        f"Replicas: [yellow]{replicas}[/yellow]\n"
        f"Mode: [yellow]{mode}[/yellow]\n"
        f"Port: [yellow]{port}[/yellow]\n"
        f"Image: [yellow]{escape(image)}[/yellow]",
        title="Server Start",
        border_style="cyan"
    ))

    async def _start():
        return await manager.start(
            replicas=replicas,
            mode=mode,
            port=port,
            env_file=env_file,
            image=image
        )

    with console.status("[cyan]Starting server..."):
        result = anyio.run(_start)

    if result["success"]:
        # `state_data` may be absent or None; fall back to the requested mode.
        effective_mode = (result.get("state_data") or {}).get("mode", mode)
        console.print(Panel(
            f"[green]✓ Server started successfully![/green]\n\n"
            f"Mode: [cyan]{escape(str(effective_mode))}[/cyan]\n"
            f"URL: [bold]http://localhost:{port}[/bold]\n"
            f"Health: [bold]http://localhost:{port}/health[/bold]\n"
            f"Monitor: [bold]http://localhost:{port}/monitor[/bold]",
            title="Server Running",
            border_style="green"
        ))
        return

    # Prefer "error", then "message"; tolerate keys that are present but None.
    error_msg = str(result.get("error") or result.get("message") or "Unknown error")
    console.print(Panel(
        f"[red]✗ Failed to start server[/red]\n\n"
        f"{escape(error_msg)}",
        title="Error",
        border_style="red"
    ))
    if "already running" in error_msg.lower():
        console.print("\n[yellow]Hint: Use 'crwl server status' to check current deployment[/yellow]")
        console.print("[yellow] Use 'crwl server stop' to stop existing server[/yellow]")
    raise SystemExit(1)
@server_cmd.command("status")
def status_cmd():
    """Show current server status and deployment info.

    Displays:
    - Running state (up/down)
    - Deployment mode (single/swarm/compose)
    - Number of replicas
    - Port mapping
    - Uptime
    - Image version

    Example:
    crwl server status
    """
    # Implementation notes (kept out of the docstring because click shows it as help):
    # - Every field is read with .get() and converted with str(): a missing
    #   "mode" key used to raise KeyError, and Table.add_row cannot render
    #   non-string values such as ints.
    manager = ServerManager()

    async def _status():
        return await manager.status()

    result = anyio.run(_status)

    if not result.get("running"):
        console.print(Panel(
            f"[yellow]No server is currently running[/yellow]\n\n"
            f"Use 'crwl server start' to launch a server",
            title="Server Status",
            border_style="yellow"
        ))
        return

    port = result.get("port", 11235)

    table = Table(title="Crawl4AI Server Status", border_style="green")
    table.add_column("Property", style="cyan")
    table.add_column("Value", style="green")
    table.add_row("Status", "🟢 Running")
    table.add_row("Mode", str(result.get("mode", "unknown")))
    table.add_row("Replicas", str(result.get("replicas", 1)))
    table.add_row("Port", str(port))
    table.add_row("Image", str(result.get("image", "unknown")))
    table.add_row("Uptime", str(result.get("uptime", "unknown")))
    table.add_row("Started", str(result.get("started_at", "unknown")))
    console.print(table)

    console.print("\n[green]✓ Server is healthy[/green]")
    console.print(f"[dim]Access: http://localhost:{port}[/dim]")
@server_cmd.command("stop")
@click.option(
    "--remove-volumes",
    is_flag=True,
    help="Remove associated volumes (WARNING: deletes data)"
)
def stop_cmd(remove_volumes: bool):
    """Stop running Crawl4AI server and cleanup resources.

    This will:
    1. Stop all running containers/services
    2. Remove containers
    3. Optionally remove volumes (--remove-volumes)
    4. Clean up state files

    WARNING: Use --remove-volumes with caution as it will delete
    persistent data including Redis databases and logs.

    Examples:
    # Stop server, keep volumes
    crwl server stop
    # Stop and remove all data
    crwl server stop --remove-volumes
    """
    # Implementation notes (kept out of the docstring because click shows it as help):
    # - Asks for confirmation before a destructive --remove-volumes run;
    #   declining is not an error (exit status 0).
    # - Manager-provided text is escaped before being embedded in Rich markup so
    #   that square brackets in it are printed literally.
    # - Exits with status 1 when the manager reports failure.
    from rich.markup import escape

    manager = ServerManager()

    # Confirm if removing volumes
    if remove_volumes and not Confirm.ask(
        "[red]⚠️ This will delete all server data including Redis databases. Continue?[/red]"
    ):
        console.print("[yellow]Cancelled[/yellow]")
        return

    async def _stop():
        return await manager.stop(remove_volumes=remove_volumes)

    with console.status("[cyan]Stopping server..."):
        result = anyio.run(_stop)

    if result["success"]:
        message = str(result.get("message") or "All resources cleaned up")
        console.print(Panel(
            f"[green]✓ Server stopped successfully[/green]\n\n"
            f"{escape(message)}",
            title="Server Stopped",
            border_style="green"
        ))
        return

    # Prefer "error", then "message"; tolerate keys that are present but None.
    error_msg = str(result.get("error") or result.get("message") or "Unknown error")
    console.print(Panel(
        f"[red]✗ Error stopping server[/red]\n\n"
        f"{escape(error_msg)}",
        title="Error",
        border_style="red"
    ))
    raise SystemExit(1)
@server_cmd.command("scale")
@click.argument("replicas", type=int)
def scale_cmd(replicas: int):
    """Scale server to specified number of replicas.

    Only works with Swarm or Compose modes. Single container
    mode cannot be scaled (must stop and restart with --replicas).

    Scaling is live and does not require downtime. The load
    balancer will automatically distribute traffic to new replicas.

    Examples:
    # Scale up to 10 replicas
    crwl server scale 10
    # Scale down to 2 replicas
    crwl server scale 2
    # Scale to 1 (minimum)
    crwl server scale 1
    """
    # Implementation notes (kept out of the docstring because click shows it as help):
    # - Rejects replica counts below 1 before touching Docker.
    # - Both validation and scaling failures exit with status 1 so that callers
    #   can detect them.
    # - Manager-provided text is escaped before being embedded in Rich markup.
    from rich.markup import escape

    if replicas < 1:
        console.print("[red]Error: Replicas must be at least 1[/red]")
        raise SystemExit(1)

    manager = ServerManager()

    async def _scale():
        return await manager.scale(replicas=replicas)

    with console.status(f"[cyan]Scaling to {replicas} replicas..."):
        result = anyio.run(_scale)

    if result["success"]:
        console.print(Panel(
            f"[green]✓ Scaled successfully[/green]\n\n"
            f"New replica count: [bold]{replicas}[/bold]\n"
            f"Mode: [cyan]{escape(str(result.get('mode')))}[/cyan]",
            title="Scaling Complete",
            border_style="green"
        ))
        return

    # Prefer "error", then "message"; tolerate keys that are present but None.
    error_msg = str(result.get("error") or result.get("message") or "Unknown error")
    console.print(Panel(
        f"[red]✗ Scaling failed[/red]\n\n"
        f"{escape(error_msg)}",
        title="Error",
        border_style="red"
    ))
    if "single container" in error_msg.lower():
        console.print("\n[yellow]Hint: For single container mode:[/yellow]")
        console.print("[yellow] 1. crwl server stop[/yellow]")
        console.print(f"[yellow] 2. crwl server start --replicas {replicas}[/yellow]")
    raise SystemExit(1)
@server_cmd.command("logs")
@click.option(
    "--follow", "-f",
    is_flag=True,
    help="Follow log output (like tail -f)"
)
@click.option(
    "--tail",
    type=int,
    default=100,
    help="Number of lines to show (default: 100)"
)
def logs_cmd(follow: bool, tail: int):
    """View server logs.

    Shows logs from running containers/services. Use --follow
    to stream logs in real-time.

    Examples:
    # Show last 100 lines
    crwl server logs
    # Show last 500 lines
    crwl server logs --tail 500
    # Follow logs in real-time
    crwl server logs --follow
    # Combine options
    crwl server logs -f --tail 50
    """
    manager = ServerManager()

    async def _logs():
        return await manager.logs(follow=follow, tail=tail)

    output = anyio.run(_logs)

    # Nothing to print; avoids emitting the literal text "None".
    # NOTE(review): presumably the manager returns None when it has already
    # streamed the logs itself (e.g. with --follow) — confirm in ServerManager.logs.
    if output is None:
        return

    # Container logs are arbitrary text, so print them verbatim:
    # - markup=False: square brackets in log lines must not be parsed as Rich
    #   style tags (an unmatched closing tag would raise MarkupError).
    # - highlight=False: no automatic colouring of numbers, paths, etc.
    # - soft_wrap=True: do not hard-wrap long lines at the console width, which
    #   would split log lines when the output is piped to grep or a file.
    console.print(output, markup=False, highlight=False, soft_wrap=True)
@server_cmd.command("restart")
@click.option(
    "--replicas", "-r",
    type=int,
    help="New replica count (optional)"
)
def restart_cmd(replicas: int):
    """Restart server (stop then start with same config).

    Preserves existing configuration unless overridden with options.
    Useful for applying image updates or recovering from errors.

    Examples:
    # Restart with same configuration
    crwl server restart
    # Restart and change replica count
    crwl server restart --replicas 5
    """
    # Implementation notes (kept out of the docstring because click shows it as help):
    # - Reads the running configuration from ServerManager.status, stops the
    #   server (volumes are always kept), then starts it again.
    # - Exits with status 1 if either the stop or the start step fails. When the
    #   start step fails the old server is already gone, so a recovery command
    #   is printed.
    # - Manager-provided text is escaped before being embedded in Rich markup.
    from rich.markup import escape

    manager = ServerManager()

    # Get current state
    async def _get_status():
        return await manager.status()

    current = anyio.run(_get_status)

    if not current.get("running"):
        console.print("[yellow]No server is running. Use 'crwl server start' instead.[/yellow]")
        return

    # Extract current config
    current_replicas = current.get("replicas", 1)
    current_port = current.get("port", 11235)
    current_image = current.get("image", "unclecode/crawl4ai:latest")
    current_mode = current.get("mode", "auto")
    # NOTE(review): assumes status() exposes the env file used at start time
    # under "env_file". If it does not, this is None, which is what
    # `crwl server start` passes when --env-file is omitted — confirm against
    # ServerManager so restarts do not silently drop API keys.
    current_env_file = current.get("env_file")

    # Override with CLI args
    new_replicas = replicas if replicas is not None else current_replicas

    console.print(Panel(
        f"[cyan]Restarting Crawl4AI Server[/cyan]\n\n"
        f"Replicas: [yellow]{current_replicas}[/yellow] → [green]{new_replicas}[/green]\n"
        f"Port: [yellow]{current_port}[/yellow]\n"
        f"Mode: [yellow]{escape(str(current_mode))}[/yellow]",
        title="Server Restart",
        border_style="cyan"
    ))

    # Stop current
    async def _stop_server():
        return await manager.stop(remove_volumes=False)

    with console.status("[cyan]Stopping current server..."):
        stop_result = anyio.run(_stop_server)

    if not stop_result["success"]:
        # Prefer "error", then "message" (same fallback as the other commands);
        # previously a result carrying only "message" was reported as "None".
        stop_error = str(
            stop_result.get("error") or stop_result.get("message") or "Unknown error"
        )
        console.print(f"[red]Failed to stop server: {escape(stop_error)}[/red]")
        raise SystemExit(1)

    # Start new
    # NOTE(review): the mode is re-detected ("auto") rather than reusing
    # current_mode, because the replica count may have changed and "single"
    # only supports one replica. A deployment started with an explicit --mode
    # may therefore come back in a different mode — confirm this is intended.
    async def _start_server():
        return await manager.start(
            replicas=new_replicas,
            mode="auto",
            port=current_port,
            env_file=current_env_file,
            image=current_image
        )

    with console.status("[cyan]Starting server..."):
        start_result = anyio.run(_start_server)

    if start_result["success"]:
        console.print(Panel(
            f"[green]✓ Server restarted successfully![/green]\n\n"
            f"URL: [bold]http://localhost:{current_port}[/bold]",
            title="Restart Complete",
            border_style="green"
        ))
        return

    start_error = str(
        start_result.get("error") or start_result.get("message") or "Unknown error"
    )
    console.print(Panel(
        f"[red]✗ Failed to restart server[/red]\n\n"
        f"{escape(start_error)}",
        title="Error",
        border_style="red"
    ))
    # The previous deployment has already been stopped at this point.
    console.print("\n[yellow]The previous server was stopped and is no longer running.[/yellow]")
    console.print(
        "[yellow]To start it again: crwl server start "
        f"--replicas {new_replicas} --port {current_port} "
        f"--image {escape(str(current_image))}[/yellow]"
    )
    raise SystemExit(1)

1030
crawl4ai/server_manager.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,52 @@
# Docker Compose template for the Compose + Nginx deployment mode.
# IMAGE, REPLICAS, PORT and NGINX_CONF are placeholders that must be
# substituted before this file is handed to Docker Compose.
# Only the nginx service publishes a host port; redis and crawl4ai are
# reachable solely on the internal crawl4ai_net network.
version: '3.8'

services:
  # Redis instance the crawl4ai containers connect to (see REDIS_HOST below).
  redis:
    image: redis:alpine
    # Append-only file persistence, stored on the redis_data volume.
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    networks:
      - crawl4ai_net
    restart: unless-stopped

  # Application containers; the replica count comes from the REPLICAS placeholder.
  crawl4ai:
    image: ${IMAGE}
    deploy:
      replicas: ${REPLICAS}
      resources:
        limits:
          # Per-container memory cap.
          memory: 4G
    # Size of /dev/shm inside each container.
    shm_size: 1g
    environment:
      # "redis" is the service name above, resolved through Docker's network DNS.
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    healthcheck:
      # Runs inside the container, so the image must provide curl.
      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    depends_on:
      - redis
    networks:
      - crawl4ai_net

  # Reverse proxy / load balancer; the only service exposed on the host.
  nginx:
    image: nginx:alpine
    ports:
      - "${PORT}:80"
    volumes:
      # Generated nginx config, mounted read-only.
      - ${NGINX_CONF}:/etc/nginx/nginx.conf:ro
    # Short-form depends_on only orders startup; it does not wait for health checks.
    depends_on:
      - crawl4ai
    networks:
      - crawl4ai_net
    restart: unless-stopped

networks:
  crawl4ai_net:
    driver: bridge

volumes:
  redis_data:

View File

@@ -0,0 +1,75 @@
# Reverse proxy in front of the crawl4ai containers.
#   - API traffic is distributed across all containers (default round-robin).
#   - Monitoring/dashboard traffic is pinned per client IP (ip_hash).
#
# NOTE: the "crawl4ai" host name in the upstream blocks is resolved once, when
# nginx starts or reloads. Containers added later by scaling are not picked up
# until nginx is reloaded or restarted.
events {
    worker_connections 1024;
}

http {
    upstream crawl4ai_backend {
        # DNS-based load balancing to Docker Compose service
        # Docker Compose provides DNS resolution for service name
        server crawl4ai:11235 max_fails=3 fail_timeout=30s;

        # Keep connections alive
        # (only effective for locations that proxy with HTTP/1.1 and an empty
        # Connection header - see the location blocks below)
        keepalive 32;
    }

    # Sticky sessions for monitoring (same IP always goes to same container)
    upstream crawl4ai_monitor {
        ip_hash; # Sticky sessions based on client IP
        server crawl4ai:11235 max_fails=3 fail_timeout=30s;
        keepalive 32;
    }

    server {
        listen 80;
        server_name _;

        # Increase timeouts for long-running crawl operations
        proxy_connect_timeout 300;
        proxy_send_timeout 300;
        proxy_read_timeout 300;
        send_timeout 300;

        # WebSocket endpoint for real-time monitoring (exact match)
        location = /monitor/ws {
            proxy_pass http://crawl4ai_monitor/monitor/ws;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

            # WebSocket timeouts
            proxy_connect_timeout 7d;
            proxy_send_timeout 7d;
            proxy_read_timeout 7d;
        }

        # Monitor and dashboard with sticky sessions (regex location)
        location ~ ^/(monitor|dashboard) {
            proxy_pass http://crawl4ai_monitor;
            # Required for the upstream "keepalive" pool to be used: HTTP/1.1
            # and no "Connection: close" header sent to the backend.
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # HTTP endpoints (load balanced)
        location / {
            proxy_pass http://crawl4ai_backend;
            # Required for the upstream "keepalive" pool to be used.
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # Support large request bodies (for batch operations)
            client_max_body_size 10M;
        }

        # Health check endpoint: proxied to the load-balanced backend pool like
        # any other API request, but kept out of the access log.
        location /health {
            proxy_pass http://crawl4ai_backend/health;
            # Required for the upstream "keepalive" pool to be used.
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            access_log off;
        }
    }
}

402
deploy/docker/AGENT.md Normal file
View File

@@ -0,0 +1,402 @@
# Crawl4AI DevOps Agent Context
## Service Overview
**Crawl4AI**: Browser-based web crawling service with AI extraction. Docker deployment with horizontal scaling (1-N containers), Redis coordination, Nginx load balancing.
## Architecture Quick Reference
```
Client → Nginx:11235 → [crawl4ai-1, crawl4ai-2, ...crawl4ai-N] ← Redis
Monitor Dashboard
```
**Components:**
- **Nginx**: Load balancer (round-robin API, sticky monitoring)
- **Crawl4AI containers**: FastAPI + Playwright browsers
- **Redis**: Container discovery (heartbeats 30s), monitoring data aggregation
- **Monitor**: Real-time dashboard at `/dashboard`
## CLI Commands
### Start/Stop
```bash
crwl server start [-r N] [--port P] [--mode auto|single|swarm|compose] [--env-file F] [--image I]
crwl server stop [--remove-volumes]
crwl server restart [-r N]
```
### Management
```bash
crwl server status # Show mode, replicas, port, uptime
crwl server scale N # Live scaling (Swarm/Compose only)
crwl server logs [-f] [--tail N]
```
**Defaults**: replicas=1, port=11235, mode=auto, image=unclecode/crawl4ai:latest
## Deployment Modes
| Replicas | Mode | Load Balancer | Use Case |
|----------|------|---------------|----------|
| N=1 | single | None | Dev/testing |
| N>1 | swarm | Built-in | Production (if `docker swarm init` done) |
| N>1 | compose | Nginx | Production (fallback) |
**Mode Detection** (when mode=auto):
1. If N=1 → single
2. If N>1 & Swarm active → swarm
3. If N>1 & Swarm inactive → compose
## File Locations
```
~/.crawl4ai/server/
├── state.json # Current deployment state
├── docker-compose.yml # Generated compose file
└── nginx.conf # Generated nginx config
/app/ # Inside container
├── deploy/docker/server.py
├── deploy/docker/monitor.py
├── deploy/docker/static/monitor/index.html
└── crawler_pool.py # Browser pool (PERMANENT, HOT_POOL, COLD_POOL)
```
## Monitoring & Troubleshooting
### Health Checks
```bash
curl http://localhost:11235/health # Service health
curl http://localhost:11235/monitor/containers # Container discovery
curl http://localhost:11235/monitor/requests # Aggregated requests
```
### Dashboard
- URL: `http://localhost:11235/dashboard/`
- Features: Container filtering (All/C-1/C-2/C-3), real-time WebSocket, timeline charts
- WebSocket: `/monitor/ws` (sticky sessions)
### Common Issues
**No containers showing in dashboard:**
```bash
docker exec <redis-container> redis-cli SMEMBERS monitor:active_containers
docker exec <redis-container> redis-cli KEYS "monitor:heartbeat:*"
```
Wait 30s for heartbeat registration.
**Load balancing not working:**
```bash
docker exec <nginx-container> cat /etc/nginx/nginx.conf | grep upstream
docker logs <nginx-container> | grep error
```
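Check that the Nginx upstream serving the API endpoints (`crawl4ai_backend`) does not use `ip_hash`; only the monitoring upstream (`crawl4ai_monitor`) should be sticky.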
**Redis connection errors:**
```bash
docker logs <crawl4ai-container> | grep -i redis
docker exec <crawl4ai-container> ping redis
```
Verify REDIS_HOST=redis, REDIS_PORT=6379.
**Containers not scaling:**
```bash
# Swarm
docker service ls
docker service ps crawl4ai
# Compose
docker compose -f ~/.crawl4ai/server/docker-compose.yml ps
docker compose -f ~/.crawl4ai/server/docker-compose.yml up -d --scale crawl4ai=N
```
### Redis Data Structure
```
monitor:active_containers # SET: {container_ids}
monitor:heartbeat:{cid} # STRING: {id, hostname, last_seen} TTL=60s
monitor:{cid}:active_requests # STRING: JSON list, TTL=5min
monitor:{cid}:completed # STRING: JSON list, TTL=1h
monitor:{cid}:janitor # STRING: JSON list, TTL=1h
monitor:{cid}:errors # STRING: JSON list, TTL=1h
monitor:endpoint_stats # STRING: JSON aggregate, TTL=24h
```
## Environment Variables
### Required for Multi-LLM
```bash
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
DEEPSEEK_API_KEY=...
GROQ_API_KEY=...
TOGETHER_API_KEY=...
MISTRAL_API_KEY=...
GEMINI_API_TOKEN=...
```
### Redis Configuration (Optional)
```bash
REDIS_HOST=redis # Default: redis
REDIS_PORT=6379 # Default: 6379
REDIS_TTL_ACTIVE_REQUESTS=300 # Default: 5min
REDIS_TTL_COMPLETED_REQUESTS=3600 # Default: 1h
REDIS_TTL_JANITOR_EVENTS=3600 # Default: 1h
REDIS_TTL_ERRORS=3600 # Default: 1h
REDIS_TTL_ENDPOINT_STATS=86400 # Default: 24h
REDIS_TTL_HEARTBEAT=60 # Default: 1min
```
## API Endpoints
### Core API
- `POST /crawl` - Crawl URL (load-balanced)
- `POST /batch` - Batch crawl (load-balanced)
- `GET /health` - Health check (load-balanced)
### Monitor API (Aggregated from all containers)
- `GET /monitor/health` - Local container health
- `GET /monitor/containers` - All active containers
- `GET /monitor/requests` - All requests (active + completed)
- `GET /monitor/browsers` - Browser pool status (local only)
- `GET /monitor/logs/janitor` - Janitor cleanup events
- `GET /monitor/logs/errors` - Error logs
- `GET /monitor/endpoints/stats` - Endpoint analytics
- `WS /monitor/ws` - Real-time updates (aggregated)
### Control Actions
- `POST /monitor/actions/cleanup` - Force browser cleanup
- `POST /monitor/actions/kill_browser` - Kill specific browser
- `POST /monitor/actions/restart_browser` - Restart browser
- `POST /monitor/stats/reset` - Reset endpoint counters
## Docker Commands Reference
### Inspection
```bash
# List containers
docker ps --filter "name=crawl4ai"
# Container logs
docker logs <container-id> -f --tail 100
# Redis CLI
docker exec -it <redis-container> redis-cli
KEYS monitor:*
SMEMBERS monitor:active_containers
GET monitor:<cid>:completed
TTL monitor:heartbeat:<cid>
# Nginx config
docker exec <nginx-container> cat /etc/nginx/nginx.conf
# Container stats
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}"
```
### Compose Operations
```bash
# Scale
docker compose -f ~/.crawl4ai/server/docker-compose.yml up -d --scale crawl4ai=5
# Restart service
docker compose -f ~/.crawl4ai/server/docker-compose.yml restart crawl4ai
# View services
docker compose -f ~/.crawl4ai/server/docker-compose.yml ps
```
### Swarm Operations
```bash
# Initialize Swarm
docker swarm init
# Scale service
docker service scale crawl4ai=5
# Service info
docker service ls
docker service ps crawl4ai --no-trunc
# Service logs
docker service logs crawl4ai --tail 100 -f
```
## Performance & Scaling
### Resource Recommendations
| Containers | Memory/Container | Total Memory | Use Case |
|------------|-----------------|--------------|----------|
| 1 | 4GB | 4GB | Development |
| 3 | 4GB | 12GB | Small prod |
| 5 | 4GB | 20GB | Medium prod |
| 10 | 4GB | 40GB | Large prod |
**Expected Throughput**: ~10 req/min per container (depends on crawl complexity)
### Scaling Guidelines
- **Horizontal**: Add replicas (`crwl server scale N`)
- **Vertical**: Adjust `--memory 8G --cpus 4` in kwargs
- **Browser Pool**: Permanent (1) + Hot pool (adaptive) + Cold pool (cleanup by janitor)
### Redis Memory Usage
- **Per container**: ~110KB (requests + events + errors + heartbeat)
- **10 containers**: ~1.1MB
- **Recommendation**: 256MB Redis is sufficient for <100 containers
## Security Notes
### Input Validation
All CLI inputs validated:
- Image name: alphanumeric + `.-/:_@` only, max 256 chars
- Port: 1-65535
- Replicas: 1-100
- Env file: must exist and be readable
- Container IDs: alphanumeric + `-_` only (prevents Redis injection)
### Network Security
- Nginx forwards to internal `crawl4ai` service (Docker network)
- Monitor endpoints have NO authentication (add MONITOR_TOKEN env for security)
- Redis is internal-only (no external port)
### Recommended Production Setup
```bash
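# 1) Shell: add authentication for the monitor endpoints
export MONITOR_TOKEN="your-secret-token"
# 2) docker-compose.yml (not a shell command): set a Redis password on the redis service
redis:
  command: redis-server --requirepass ${REDIS_PASSWORD}
# 3) nginx.conf (not a shell command): enable rate limiting inside the http block
limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;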
```
## Common User Scenarios
### Scenario 1: Fresh Deployment
```bash
crwl server start --replicas 3 --env-file .env
# Wait for health check, then access http://localhost:11235/health
```
### Scenario 2: Scaling Under Load
```bash
crwl server scale 10
# Live scaling, no downtime
```
### Scenario 3: Debugging Slow Requests
```bash
# Check dashboard
open http://localhost:11235/dashboard/
# Check container logs
docker logs <slowest-container-id> --tail 100
# Check browser pool
curl http://localhost:11235/monitor/browsers | jq
```
### Scenario 4: Redis Connection Issues
```bash
# Check Redis connectivity
docker exec <crawl4ai-container> nc -zv redis 6379
# Check Redis logs
docker logs <redis-container>
# Restart containers (triggers reconnect with retry logic)
crwl server restart
```
### Scenario 5: Container Not Appearing in Dashboard
```bash
# Wait 30s for heartbeat
sleep 30
# Check Redis
docker exec <redis-container> redis-cli SMEMBERS monitor:active_containers
# Check container logs for heartbeat errors
docker logs <missing-container> | grep -i heartbeat
```
## Code Context for Advanced Debugging
### Key Classes
- `MonitorStats` (monitor.py): Tracks stats, Redis persistence, heartbeat worker
- `ServerManager` (server_manager.py): CLI orchestration, mode detection
- Browser pool globals: `PERMANENT`, `HOT_POOL`, `COLD_POOL`, `LOCK` (crawler_pool.py)
### Critical Timeouts
- Browser pool lock: 2s timeout (prevents deadlock)
- WebSocket connection: 5s timeout
- Health check: 30-60s timeout
- Heartbeat interval: 30s, TTL: 60s
- Redis retry: 3 attempts, with 0.5s then 1s backoff between attempts (no wait after the final attempt)
- Circuit breaker: 5 failures → 5min backoff
### State Transitions
```
NOT_RUNNING → STARTING → HEALTHY → RUNNING
↓ ↓
FAILED UNHEALTHY → STOPPED
```
State file: `~/.crawl4ai/server/state.json` (atomic writes, fcntl locking)
## Quick Diagnostic Commands
```bash
# Full system check
crwl server status
docker ps
curl http://localhost:11235/health
curl http://localhost:11235/monitor/containers | jq
# Redis check
docker exec <redis-container> redis-cli PING
docker exec <redis-container> redis-cli INFO stats
# Network check
docker network ls
docker network inspect <network-name>
# Logs check
docker logs <nginx-container> --tail 50
docker logs <redis-container> --tail 50
docker compose -f ~/.crawl4ai/server/docker-compose.yml logs --tail 100
```
## Agent Decision Tree
**User reports slow crawling:**
1. Check dashboard for active requests stuck → kill browser if >5min
2. Check browser pool status → cleanup if hot/cold pool >10
3. Check container CPU/memory → scale up if >80%
4. Check Redis latency → restart Redis if >100ms
**User reports missing containers:**
1. Wait 30s for heartbeat
2. Check `docker ps` vs dashboard count
3. Check Redis SMEMBERS monitor:active_containers
4. Check container logs for Redis connection errors
5. Verify REDIS_HOST/PORT env vars
**User reports 502/503 errors:**
1. Check Nginx logs for upstream errors
2. Check container health: `curl http://localhost:11235/health`
3. Check if all containers are healthy: `docker ps`
4. Restart Nginx: `docker restart <nginx-container>`
**User wants to update image:**
1. `crwl server stop`
2. `docker pull unclecode/crawl4ai:latest`
3. `crwl server start --replicas <previous-count>`
---
**Version**: Crawl4AI v0.7.4+
**Last Updated**: 2025-01-20
**AI Agent Note**: All commands, file paths, and Redis keys verified against codebase. Use exact syntax shown. For user-facing responses, translate technical details to plain language.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@ import asyncio
from typing import Dict, List, Optional
from datetime import datetime, timezone
from collections import deque
from dataclasses import dataclass
from redis import asyncio as aioredis
from utils import get_container_memory_percent
import psutil
@@ -12,13 +13,49 @@ import logging
logger = logging.getLogger(__name__)
# ========== Configuration ==========
@dataclass
class RedisTTLConfig:
    """Redis TTL configuration (in seconds).

    Configures how long different types of monitoring data are retained in Redis.
    Adjust based on your monitoring needs and Redis memory constraints.

    Every value is passed to Redis as an expiry, so it must be a positive
    integer number of seconds.
    """
    active_requests: int = 300       # 5 minutes - short-lived active request data
    completed_requests: int = 3600   # 1 hour - recent completed requests
    janitor_events: int = 3600       # 1 hour - browser cleanup events
    errors: int = 3600               # 1 hour - error logs
    endpoint_stats: int = 86400      # 24 hours - aggregated endpoint statistics
    heartbeat: int = 60              # 1 minute - container heartbeat (2x the 30s interval)

    @classmethod
    def from_env(cls) -> 'RedisTTLConfig':
        """Load TTL configuration from environment variables.

        Each field is read from its ``REDIS_TTL_*`` variable. A variable that is
        unset, blank, not an integer, or not strictly positive is ignored and
        the field keeps its dataclass default; the last three cases also log a
        warning. This keeps a typo or an empty entry in an env file from
        crashing start-up, and keeps zero/negative values (which Redis rejects
        as an expiry) out of the configuration.

        Returns:
            A ``RedisTTLConfig`` with defaults overridden by valid environment values.
        """
        import logging
        import os
        from dataclasses import fields

        log = logging.getLogger(__name__)

        env_names = {
            'active_requests': 'REDIS_TTL_ACTIVE_REQUESTS',
            'completed_requests': 'REDIS_TTL_COMPLETED_REQUESTS',
            'janitor_events': 'REDIS_TTL_JANITOR_EVENTS',
            'errors': 'REDIS_TTL_ERRORS',
            'endpoint_stats': 'REDIS_TTL_ENDPOINT_STATS',
            'heartbeat': 'REDIS_TTL_HEARTBEAT',
        }

        overrides = {}
        for field in fields(cls):
            env_name = env_names.get(field.name)
            if env_name is None:
                continue  # field without an environment variable (e.g. added by a subclass)

            raw = os.getenv(env_name)
            if raw is None or not raw.strip():
                continue  # unset or blank: keep the default

            try:
                ttl = int(raw)
            except ValueError:
                log.warning(
                    "Ignoring %s=%r: not an integer; using default of %ss",
                    env_name, raw, field.default,
                )
                continue

            if ttl <= 0:
                log.warning(
                    "Ignoring %s=%r: TTL must be positive; using default of %ss",
                    env_name, raw, field.default,
                )
                continue

            overrides[field.name] = ttl

        return cls(**overrides)
class MonitorStats:
"""Tracks real-time server stats with Redis persistence."""
def __init__(self, redis: aioredis.Redis):
def __init__(self, redis: aioredis.Redis, ttl_config: Optional[RedisTTLConfig] = None):
self.redis = redis
self.ttl = ttl_config or RedisTTLConfig.from_env()
self.start_time = time.time()
# Get container ID for Redis keys
from utils import get_container_id
self.container_id = get_container_id()
# In-memory queues (fast reads, Redis backup)
self.active_requests: Dict[str, Dict] = {} # id -> request info
self.completed_requests: deque = deque(maxlen=100) # Last 100
@@ -32,6 +69,9 @@ class MonitorStats:
self._persist_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
self._persist_worker_task: Optional[asyncio.Task] = None
# Heartbeat task for container discovery
self._heartbeat_task: Optional[asyncio.Task] = None
# Timeline data (5min window, 5s resolution = 60 points)
self.memory_timeline: deque = deque(maxlen=60)
self.requests_timeline: deque = deque(maxlen=60)
@@ -45,10 +85,14 @@ class MonitorStats:
"url": url[:100], # Truncate long URLs
"start_time": time.time(),
"config_sig": config.get("sig", "default") if config else "default",
"mem_start": psutil.Process().memory_info().rss / (1024 * 1024)
"mem_start": psutil.Process().memory_info().rss / (1024 * 1024),
"container_id": self.container_id
}
self.active_requests[request_id] = req_info
# Persist to Redis
await self._persist_active_requests()
# Increment endpoint counter
if endpoint not in self.endpoint_stats:
self.endpoint_stats[endpoint] = {
@@ -95,19 +139,29 @@ class MonitorStats:
"success": success,
"error": error,
"status_code": status_code,
"pool_hit": pool_hit
"pool_hit": pool_hit,
"container_id": self.container_id
}
self.completed_requests.append(completed)
# Persist to Redis
await self._persist_completed_requests()
await self._persist_active_requests() # Update active (removed this request)
# Track errors
if not success and error:
self.errors.append({
error_entry = {
"timestamp": end_time,
"endpoint": endpoint,
"url": req_info["url"],
"error": error,
"request_id": request_id
})
"request_id": request_id,
"message": error,
"level": "ERROR",
"container_id": self.container_id
}
self.errors.append(error_entry)
await self._persist_errors()
await self._persist_endpoint_stats()
@@ -117,8 +171,10 @@ class MonitorStats:
"timestamp": time.time(),
"type": event_type, # "close_cold", "close_hot", "promote"
"sig": sig[:8],
"details": details
"details": details,
"container_id": self.container_id
})
await self._persist_janitor_events()
def _cleanup_old_entries(self, max_age_seconds: int = 300):
"""Remove entries older than max_age_seconds (default 5min)."""
@@ -149,13 +205,23 @@ class MonitorStats:
recent_reqs = sum(1 for req in self.completed_requests
if now - req.get("end_time", 0) < 5)
# Browser counts (acquire lock to prevent race conditions)
# Browser counts (acquire lock with timeout to prevent deadlock)
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
async with LOCK:
try:
async with asyncio.timeout(2.0):
async with LOCK:
browser_count = {
"permanent": 1 if PERMANENT else 0,
"hot": len(HOT_POOL),
"cold": len(COLD_POOL)
}
except asyncio.TimeoutError:
logger.warning("Lock acquisition timeout in update_timeline, using cached browser counts")
# Use last known values or defaults
browser_count = {
"permanent": 1 if PERMANENT else 0,
"hot": len(HOT_POOL),
"cold": len(COLD_POOL)
"permanent": 1,
"hot": 0,
"cold": 0
}
self.memory_timeline.append({"time": now, "value": mem_pct})
@@ -163,15 +229,117 @@ class MonitorStats:
self.browser_timeline.append({"time": now, "browsers": browser_count})
async def _persist_endpoint_stats(self):
    """Persist endpoint stats to Redis with retry logic.

    Serializes ``self.endpoint_stats`` to JSON and stores it under the
    ``monitor:endpoint_stats`` key with a TTL of ``self.ttl.endpoint_stats``.

    Redis connection errors are retried (three attempts in total, with
    exponential backoff between attempts). Any other exception is logged
    and the write is abandoned. Failures are logged rather than raised.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            await self.redis.set(
                "monitor:endpoint_stats",
                json.dumps(self.endpoint_stats),
                ex=self.ttl.endpoint_stats,
            )
            return  # Success
        except aioredis.ConnectionError as e:
            if attempt < max_retries - 1:
                # Sleeps 0.5s after the 1st failure and 1s after the 2nd;
                # the final attempt does not sleep, it only logs.
                backoff = 0.5 * (2 ** attempt)
                logger.warning(f"Redis connection error persisting endpoint stats (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
                await asyncio.sleep(backoff)
            else:
                logger.error(f"Failed to persist endpoint stats after {max_retries} attempts: {e}")
        except Exception as e:
            # Anything that is not a connection problem will not be fixed by retrying.
            logger.error(f"Non-retryable error persisting endpoint stats: {e}")
            break
async def _persist_active_requests(self):
    """Publish this container's in-flight requests to Redis, retrying on connection loss.

    When ``self.active_requests`` is non-empty its values are stored as a JSON
    list under ``monitor:<container_id>:active_requests`` with a TTL of
    ``self.ttl.active_requests``; when it is empty the key is deleted instead.
    Connection errors get up to three attempts with exponential backoff;
    other errors are logged once and the write is abandoned.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            if not self.active_requests:
                # Nothing in flight: remove the key instead of storing an empty list.
                await self.redis.delete(f"monitor:{self.container_id}:active_requests")
            else:
                await self.redis.set(
                    f"monitor:{self.container_id}:active_requests",
                    json.dumps(list(self.active_requests.values())),
                    ex=self.ttl.active_requests,
                )
            return  # Success
        except aioredis.ConnectionError as e:
            if attempt >= max_retries - 1:
                logger.error(f"Failed to persist active requests after {max_retries} attempts: {e}")
            else:
                backoff = 0.5 * (2 ** attempt)  # doubles after every failed attempt
                logger.warning(f"Redis connection error persisting active requests (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
                await asyncio.sleep(backoff)
        except Exception as e:
            logger.error(f"Non-retryable error persisting active requests: {e}")
            return
        attempt += 1
async def _persist_completed_requests(self):
    """Publish this container's completed-request history to Redis, retrying on connection loss.

    Stores ``self.completed_requests`` as a JSON list under
    ``monitor:<container_id>:completed`` with a TTL of
    ``self.ttl.completed_requests``. Connection errors get up to three
    attempts with exponential backoff; other errors are logged once and the
    write is abandoned.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            await self.redis.set(
                f"monitor:{self.container_id}:completed",
                json.dumps(list(self.completed_requests)),
                ex=self.ttl.completed_requests,
            )
            return  # Success
        except aioredis.ConnectionError as e:
            if attempt >= max_retries - 1:
                logger.error(f"Failed to persist completed requests after {max_retries} attempts: {e}")
            else:
                backoff = 0.5 * (2 ** attempt)  # doubles after every failed attempt
                logger.warning(f"Redis connection error persisting completed requests (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
                await asyncio.sleep(backoff)
        except Exception as e:
            logger.error(f"Non-retryable error persisting completed requests: {e}")
            return
        attempt += 1
async def _persist_janitor_events(self):
    """Publish this container's janitor event log to Redis, retrying on connection loss.

    Stores ``self.janitor_events`` as a JSON list under
    ``monitor:<container_id>:janitor`` with a TTL of
    ``self.ttl.janitor_events``. Connection errors get up to three attempts
    with exponential backoff; other errors are logged once and the write is
    abandoned.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            await self.redis.set(
                f"monitor:{self.container_id}:janitor",
                json.dumps(list(self.janitor_events)),
                ex=self.ttl.janitor_events,
            )
            return  # Success
        except aioredis.ConnectionError as e:
            if attempt >= max_retries - 1:
                logger.error(f"Failed to persist janitor events after {max_retries} attempts: {e}")
            else:
                backoff = 0.5 * (2 ** attempt)  # doubles after every failed attempt
                logger.warning(f"Redis connection error persisting janitor events (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
                await asyncio.sleep(backoff)
        except Exception as e:
            logger.error(f"Non-retryable error persisting janitor events: {e}")
            return
        attempt += 1
async def _persist_errors(self):
    """Publish this container's error log to Redis, retrying on connection loss.

    Stores ``self.errors`` as a JSON list under
    ``monitor:<container_id>:errors`` with a TTL of ``self.ttl.errors``.
    Connection errors get up to three attempts with exponential backoff;
    other errors are logged once and the write is abandoned.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            await self.redis.set(
                f"monitor:{self.container_id}:errors",
                json.dumps(list(self.errors)),
                ex=self.ttl.errors,
            )
            return  # Success
        except aioredis.ConnectionError as e:
            if attempt >= max_retries - 1:
                logger.error(f"Failed to persist errors after {max_retries} attempts: {e}")
            else:
                backoff = 0.5 * (2 ** attempt)  # doubles after every failed attempt
                logger.warning(f"Redis connection error persisting errors (attempt {attempt + 1}/{max_retries}), retrying in {backoff}s: {e}")
                await asyncio.sleep(backoff)
        except Exception as e:
            logger.error(f"Non-retryable error persisting errors: {e}")
            return
        attempt += 1
async def _persistence_worker(self):
"""Background worker to persist stats to Redis."""
@@ -202,25 +370,121 @@ class MonitorStats:
self._persist_worker_task = None
logger.info("Stopped persistence worker")
async def _heartbeat_worker(self):
    """Send heartbeat to Redis every 30s with circuit breaker for failures.

    Each successful beat writes a JSON record (id, hostname, last_seen, mode,
    failure_count) to ``monitor:heartbeat:<container_id>`` with a TTL of
    ``self.ttl.heartbeat`` and adds the container ID to the
    ``monitor:active_containers`` set.

    Failure handling:
      * Redis connection errors back off exponentially (capped at 300s).
        After ``max_failures`` consecutive failures the worker sleeps for
        5 minutes and resets the counter.
      * Any other exception is logged with a traceback and retried after 30s.
      * Cancellation raised inside the ``try`` body ends the loop.
    """
    # Imports are resolved once per worker instead of on every loop iteration.
    import os
    import socket

    from utils import detect_deployment_mode

    heartbeat_failures = 0
    max_failures = 5  # Circuit breaker threshold

    while True:
        try:
            # Get hostname/container name for friendly display.
            # Try HOSTNAME env var first (set by Docker Compose), then socket.gethostname()
            hostname = os.getenv("HOSTNAME", socket.gethostname())

            # Register this container; only the mode is needed here.
            mode, _ = detect_deployment_mode()
            container_info = {
                "id": self.container_id,
                "hostname": hostname,
                "last_seen": time.time(),
                "mode": mode,
                "failure_count": heartbeat_failures
            }

            # Set heartbeat with configured TTL
            await self.redis.setex(
                f"monitor:heartbeat:{self.container_id}",
                self.ttl.heartbeat,
                json.dumps(container_info)
            )

            # Add to active containers set
            await self.redis.sadd("monitor:active_containers", self.container_id)

            # Reset failure counter on success
            heartbeat_failures = 0

            # Wait 30s before next heartbeat
            await asyncio.sleep(30)

        except asyncio.CancelledError:
            break
        except aioredis.ConnectionError as e:
            heartbeat_failures += 1
            logger.error(
                f"Heartbeat Redis connection error (attempt {heartbeat_failures}/{max_failures}): {e}"
            )

            if heartbeat_failures >= max_failures:
                # Circuit breaker - back off for longer
                logger.critical(
                    f"Heartbeat circuit breaker triggered after {heartbeat_failures} failures. "
                    f"Container will appear offline for 5 minutes."
                )
                await asyncio.sleep(300)  # 5 min backoff
                heartbeat_failures = 0
            else:
                # Exponential backoff: 60s, 120s, 240s, then capped at 300s
                backoff = min(30 * (2 ** heartbeat_failures), 300)
                await asyncio.sleep(backoff)
        except Exception as e:
            logger.error(f"Unexpected heartbeat error: {e}", exc_info=True)
            await asyncio.sleep(30)
def start_heartbeat(self):
    """Launch the heartbeat worker as a background task unless one is already registered."""
    if self._heartbeat_task:
        # A task handle is already stored; do not start a second worker.
        return
    self._heartbeat_task = asyncio.create_task(self._heartbeat_worker())
    logger.info("Started heartbeat worker")
async def stop_heartbeat(self):
    """Cancel the heartbeat task and remove this container from the Redis registry.

    Does nothing when no heartbeat task is registered. Deregistration is
    best-effort: a Redis failure is logged as a warning and does not prevent
    the task handle from being cleared.
    """
    task = self._heartbeat_task
    if not task:
        return

    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass

    # Remove the registry entries now instead of waiting for the heartbeat key to expire.
    try:
        await self.redis.srem("monitor:active_containers", self.container_id)
        await self.redis.delete(f"monitor:heartbeat:{self.container_id}")
        logger.info(f"Container {self.container_id} immediately deregistered from monitoring")
    except Exception as e:
        logger.warning(f"Failed to deregister container on shutdown: {e}")

    self._heartbeat_task = None
    logger.info("Stopped heartbeat worker")
async def cleanup(self):
    """Cleanup on shutdown - persist final stats and stop workers.

    Runs three steps in order: persist endpoint stats, stop the persistence
    worker, stop the heartbeat (which deregisters the container). Each step
    is isolated so that a failure in one does not skip the ones after it;
    failures are logged and never raised. "Monitor cleanup completed" is
    logged only when every step succeeded.
    """
    logger.info("Monitor cleanup starting...")

    steps = (
        self._persist_endpoint_stats,   # persist final stats before shutdown
        self.stop_persistence_worker,   # stop background workers
        self.stop_heartbeat,
    )

    all_ok = True
    for step in steps:
        try:
            await step()
        except Exception as e:
            all_ok = False
            logger.error(f"Monitor cleanup error: {e}")

    if all_ok:
        logger.info("Monitor cleanup completed")
async def load_from_redis(self):
    """Load persisted stats from Redis and start workers.

    Reads ``monitor:endpoint_stats`` and, when present, replaces
    ``self.endpoint_stats`` with the decoded JSON. A failed read or decode is
    logged as a warning and otherwise ignored. The heartbeat worker is started
    afterwards regardless of whether loading succeeded.
    """
    try:
        data = await self.redis.get("monitor:endpoint_stats")
        if data:
            self.endpoint_stats = json.loads(data)
            logger.info("Loaded endpoint stats from Redis")
    except Exception as e:
        logger.warning(f"Failed to load from Redis: {e}")

    # Start background workers outside the try block: a Redis read failure or
    # corrupt payload at startup must not leave this container unregistered.
    self.start_heartbeat()
@@ -232,17 +496,28 @@ class MonitorStats:
# Network I/O (delta since last call)
net = psutil.net_io_counters()
# Pool status (acquire lock to prevent race conditions)
# Pool status (acquire lock with timeout to prevent race conditions)
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
async with LOCK:
# TODO: Track actual browser process memory instead of estimates
# These are conservative estimates based on typical Chromium usage
permanent_mem = 270 if PERMANENT else 0 # Estimate: ~270MB for permanent browser
hot_mem = len(HOT_POOL) * 180 # Estimate: ~180MB per hot pool browser
cold_mem = len(COLD_POOL) * 180 # Estimate: ~180MB per cold pool browser
permanent_active = PERMANENT is not None
hot_count = len(HOT_POOL)
cold_count = len(COLD_POOL)
try:
async with asyncio.timeout(2.0):
async with LOCK:
# TODO: Track actual browser process memory instead of estimates
# These are conservative estimates based on typical Chromium usage
permanent_mem = 270 if PERMANENT else 0 # Estimate: ~270MB for permanent browser
hot_mem = len(HOT_POOL) * 180 # Estimate: ~180MB per hot pool browser
cold_mem = len(COLD_POOL) * 180 # Estimate: ~180MB per cold pool browser
permanent_active = PERMANENT is not None
hot_count = len(HOT_POOL)
cold_count = len(COLD_POOL)
except asyncio.TimeoutError:
logger.warning("Lock acquisition timeout in get_health_summary, using defaults")
# Use safe defaults when lock times out
permanent_mem = 0
hot_mem = 0
cold_mem = 0
permanent_active = False
hot_count = 0
cold_count = 0
return {
"container": {
@@ -286,46 +561,52 @@ class MonitorStats:
return requests
async def get_browser_list(self) -> List[Dict]:
    """Get detailed browser pool information with timeout protection.

    Returns:
        One dict per pooled browser (permanent first, then hot, then cold)
        with keys ``type``, ``sig``, ``age_seconds``, ``last_used_seconds``,
        ``memory_mb``, ``hits`` and ``killable``. Returns an empty list when
        the pool lock cannot be acquired within 2 seconds.
    """
    from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG, LOCK

    now = time.time()
    # Approximation: every entry reports the time since self.start_time as its age.
    age_seconds = int(now - self.start_time)

    def describe(kind, sig, label, memory_mb, killable):
        """Build one pool entry; memory_mb is a fixed estimate, not a measurement."""
        return {
            "type": kind,
            "sig": label,
            "age_seconds": age_seconds,
            "last_used_seconds": int(now - LAST_USED.get(sig, now)),
            "memory_mb": memory_mb,
            "hits": USAGE_COUNT.get(sig, 0),
            "killable": killable
        }

    browsers = []

    # Acquire lock with timeout to prevent deadlock
    try:
        async with asyncio.timeout(2.0):
            async with LOCK:
                if PERMANENT:
                    label = DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown"
                    browsers.append(describe("permanent", DEFAULT_CONFIG_SIG, label, 270, False))

                # Only the signatures are needed, so iterate the pool keys.
                browsers.extend(describe("hot", sig, sig[:8], 180, True) for sig in HOT_POOL)
                browsers.extend(describe("cold", sig, sig[:8], 180, True) for sig in COLD_POOL)
    except asyncio.TimeoutError:
        logger.error("Browser list lock timeout - pool may be locked by janitor")
        # Return empty list when lock times out to prevent blocking
        return []

    return browsers

View File

@@ -3,14 +3,140 @@ from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
from pydantic import BaseModel
from typing import Optional
from monitor import get_monitor
from utils import detect_deployment_mode, get_container_id
import logging
import asyncio
import json
import re
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/monitor", tags=["monitor"])
# ========== Security & Validation ==========
def validate_container_id(cid: str) -> bool:
    """Validate container ID format to prevent Redis key injection.

    Docker container IDs are 12-64 character hexadecimal strings.
    Hostnames are alphanumeric with dashes and underscores.

    Args:
        cid: Container ID to validate

    Returns:
        True if valid, False otherwise
    """
    if not cid or not isinstance(cid, str):
        return False

    # Allow alphanumeric, dashes, and underscores only (1-64 chars).
    # This prevents path traversal (../../), wildcards (**), and other injection attempts.
    # fullmatch is required: with re.match, a trailing "$" also matches just
    # before a final newline, so "abc\n" would be accepted.
    return re.fullmatch(r'[a-zA-Z0-9_-]{1,64}', cid) is not None
# ========== Redis Aggregation Helpers ==========
async def _get_active_containers():
    """Get list of active container IDs from Redis with validation.

    Reads the ``monitor:active_containers`` set, decodes byte members and
    keeps only IDs accepted by ``validate_container_id``. Rejected members
    are logged and skipped individually.

    Returns:
        List of validated container ID strings; an empty list when the
        lookup itself fails.
    """
    try:
        monitor = get_monitor()
        container_ids = await monitor.redis.smembers("monitor:active_containers")

        # Decode and validate each container ID
        validated = []
        for cid in container_ids:
            # errors="replace" turns undecodable bytes into U+FFFD, which then
            # fails validation. A strict decode would raise and make the outer
            # handler drop every container because of one corrupt member.
            cid_str = cid.decode("utf-8", errors="replace") if isinstance(cid, bytes) else cid
            if validate_container_id(cid_str):
                validated.append(cid_str)
            else:
                # repr() keeps control characters in a rejected value from forging log lines.
                logger.warning(f"Invalid container ID format rejected: {cid_str!r}")

        return validated
    except Exception as e:
        logger.error(f"Failed to get active containers: {e}")
        return []
async def _aggregate_active_requests():
    """Merge the active-request lists that all registered containers published to Redis.

    A container whose key is missing, or whose read or decode fails, is
    skipped (failures are logged as warnings).
    """
    container_ids = await _get_active_containers()
    monitor = get_monitor()

    merged = []
    for container_id in container_ids:
        try:
            raw = await monitor.redis.get(f"monitor:{container_id}:active_requests")
            if not raw:
                continue
            merged.extend(json.loads(raw))
        except Exception as e:
            logger.warning(f"Failed to get active requests from {container_id}: {e}")
    return merged
async def _aggregate_completed_requests(limit=100):
    """Merge completed requests from all registered containers, newest first.

    Args:
        limit: Maximum number of entries to return after sorting.

    A container whose key is missing, or whose read or decode fails, is
    skipped (failures are logged as warnings).
    """
    container_ids = await _get_active_containers()
    monitor = get_monitor()

    merged = []
    for container_id in container_ids:
        try:
            raw = await monitor.redis.get(f"monitor:{container_id}:completed")
            if not raw:
                continue
            merged.extend(json.loads(raw))
        except Exception as e:
            logger.warning(f"Failed to get completed requests from {container_id}: {e}")

    # Most recent end_time first; entries without one sort as 0.
    newest_first = sorted(merged, key=lambda x: x.get("end_time", 0), reverse=True)
    return newest_first[:limit]
async def _aggregate_janitor_events(limit=100):
    """Merge janitor events from all registered containers, newest first.

    Args:
        limit: Maximum number of events to return after sorting.

    A container whose key is missing, or whose read or decode fails, is
    skipped (failures are logged as warnings).
    """
    container_ids = await _get_active_containers()
    monitor = get_monitor()

    merged = []
    for container_id in container_ids:
        try:
            raw = await monitor.redis.get(f"monitor:{container_id}:janitor")
            if not raw:
                continue
            merged.extend(json.loads(raw))
        except Exception as e:
            logger.warning(f"Failed to get janitor events from {container_id}: {e}")

    # Most recent timestamp first; events without one sort as 0.
    newest_first = sorted(merged, key=lambda x: x.get("timestamp", 0), reverse=True)
    return newest_first[:limit]
async def _aggregate_errors(limit=100):
    """Merge error entries from all registered containers, newest first.

    Args:
        limit: Maximum number of entries to return after sorting.

    A container whose key is missing, or whose read or decode fails, is
    skipped (failures are logged as warnings).
    """
    container_ids = await _get_active_containers()
    monitor = get_monitor()

    merged = []
    for container_id in container_ids:
        try:
            raw = await monitor.redis.get(f"monitor:{container_id}:errors")
            if not raw:
                continue
            merged.extend(json.loads(raw))
        except Exception as e:
            logger.warning(f"Failed to get errors from {container_id}: {e}")

    # Most recent timestamp first; entries without one sort as 0.
    newest_first = sorted(merged, key=lambda x: x.get("timestamp", 0), reverse=True)
    return newest_first[:limit]
@router.get("/health")
async def get_health():
"""Get current system health snapshot."""
@@ -37,18 +163,23 @@ async def get_requests(status: str = "all", limit: int = 50):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
# Aggregate from all containers via Redis
active_requests = await _aggregate_active_requests()
completed_requests = await _aggregate_completed_requests(limit)
# Filter by status if needed
if status in ["success", "error"]:
is_success = (status == "success")
completed_requests = [r for r in completed_requests if r.get("success") == is_success]
if status == "active":
return {"active": monitor.get_active_requests(), "completed": []}
return {"active": active_requests, "completed": []}
elif status == "completed":
return {"active": [], "completed": monitor.get_completed_requests(limit)}
elif status in ["success", "error"]:
return {"active": [], "completed": monitor.get_completed_requests(limit, status)}
else: # "all"
return {"active": [], "completed": completed_requests}
else: # "all" or success/error
return {
"active": monitor.get_active_requests(),
"completed": monitor.get_completed_requests(limit)
"active": active_requests,
"completed": completed_requests
}
except Exception as e:
logger.error(f"Error getting requests: {e}")
@@ -60,8 +191,13 @@ async def get_browsers():
"""Get detailed browser pool information."""
try:
monitor = get_monitor()
container_id = get_container_id()
browsers = await monitor.get_browser_list()
# Add container_id to each browser
for browser in browsers:
browser["container_id"] = container_id
# Calculate summary stats
total_browsers = len(browsers)
total_memory = sum(b["memory_mb"] for b in browsers)
@@ -77,7 +213,8 @@ async def get_browsers():
"total_count": total_browsers,
"total_memory_mb": total_memory,
"reuse_rate_percent": round(reuse_rate, 1)
}
},
"container_id": container_id
}
except Exception as e:
logger.error(f"Error getting browsers: {e}")
@@ -125,8 +262,9 @@ async def get_janitor_log(limit: int = 100):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
return {"events": monitor.get_janitor_log(limit)}
# Aggregate from all containers via Redis
events = await _aggregate_janitor_events(limit)
return {"events": events}
except Exception as e:
logger.error(f"Error getting janitor log: {e}")
raise HTTPException(500, str(e))
@@ -140,8 +278,9 @@ async def get_errors_log(limit: int = 100):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
return {"errors": monitor.get_errors_log(limit)}
# Aggregate from all containers via Redis
errors = await _aggregate_errors(limit)
return {"errors": errors}
except Exception as e:
logger.error(f"Error getting errors log: {e}")
raise HTTPException(500, str(e))
@@ -350,15 +489,57 @@ async def reset_stats():
raise HTTPException(500, str(e))
@router.get("/containers")
async def get_containers():
    """Get container deployment info from Redis heartbeats.

    Returns:
        Dict with ``mode`` ("single", "compose" or "swarm"), the
        ``container_id`` of the container serving this request, the list of
        ``containers`` that currently have a heartbeat record, and their
        ``count``.

    Raises:
        HTTPException: 500 when the lookup fails unexpectedly.
    """
    try:
        monitor = get_monitor()
        container_ids = await _get_active_containers()

        containers = []
        for cid in container_ids:
            try:
                # Get heartbeat data
                data = await monitor.redis.get(f"monitor:heartbeat:{cid}")
                if data:
                    info = json.loads(data)
                    containers.append({
                        "id": info.get("id", cid),
                        "hostname": info.get("hostname", cid),
                        "healthy": True  # If heartbeat exists, it's healthy
                    })
            except Exception as e:
                logger.warning(f"Failed to get heartbeat for {cid}: {e}")

        # Determine mode. Zero or one live heartbeat is reported as "single":
        # an empty list is not evidence of a multi-container deployment.
        if len(containers) <= 1:
            mode = "single"
        elif any(c["hostname"].count(".") >= 2 for c in containers):
            # Swarm hostname pattern (service.slot.task_id) has at least two dots
            mode = "swarm"
        else:
            mode = "compose"

        return {
            "mode": mode,
            "container_id": get_container_id(),
            "containers": containers,
            "count": len(containers)
        }
    except Exception as e:
        logger.error(f"Error getting containers: {e}")
        raise HTTPException(500, str(e))
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""WebSocket endpoint for real-time monitoring updates.
Sends updates every 2 seconds with:
- Health stats
- Active/completed requests
- Browser pool status
- Timeline data
Sends aggregated updates every 2 seconds from all containers with:
- Health stats (local container)
- Active/completed requests (aggregated from all containers)
- Browser pool status (local container only - not in Redis)
- Timeline data (local container - TODO: aggregate from Redis)
- Janitor events (aggregated from all containers)
- Errors (aggregated from all containers)
"""
await websocket.accept()
logger.info("WebSocket client connected")
@@ -366,24 +547,46 @@ async def websocket_endpoint(websocket: WebSocket):
try:
while True:
try:
# Gather all monitoring data
# Gather aggregated monitoring data from Redis
monitor = get_monitor()
container_id = get_container_id()
# Get container info
containers_info = await get_containers()
# AGGREGATE data from all containers via Redis
active_reqs = await _aggregate_active_requests()
completed_reqs = await _aggregate_completed_requests(limit=10)
janitor_events = await _aggregate_janitor_events(limit=10)
errors_log = await _aggregate_errors(limit=10)
# Local container data (not aggregated)
local_health = await monitor.get_health_summary()
browsers = await monitor.get_browser_list() # Browser list is local only
# Add container_id to browsers (they're local)
for browser in browsers:
browser["container_id"] = container_id
data = {
"timestamp": asyncio.get_event_loop().time(),
"health": await monitor.get_health_summary(),
"container_id": container_id, # This container handling the WebSocket
"is_aggregated": True, # Flag to indicate aggregated data
"local_health": local_health, # This container's health
"containers": containers_info.get("containers", []), # All containers
"requests": {
"active": monitor.get_active_requests(),
"completed": monitor.get_completed_requests(limit=10)
"active": active_reqs, # Aggregated from all containers
"completed": completed_reqs # Aggregated from all containers
},
"browsers": await monitor.get_browser_list(),
"browsers": browsers, # Local only (not in Redis)
"timeline": {
# TODO: Aggregate timeline from Redis (currently local only)
"memory": monitor.get_timeline_data("memory", "5m"),
"requests": monitor.get_timeline_data("requests", "5m"),
"browsers": monitor.get_timeline_data("browsers", "5m")
},
"janitor": monitor.get_janitor_log(limit=10),
"errors": monitor.get_errors_log(limit=10)
"janitor": janitor_events, # Aggregated from all containers
"errors": errors_log # Aggregated from all containers
}
# Send update to client

View File

@@ -200,7 +200,11 @@ async def root():
return RedirectResponse("/playground")
# ─────────────────── infra / middleware ─────────────────────
# Build Redis URL from environment or config.
# Precedence: an explicit config "uri" wins; otherwise the URL is composed from
# host/port, each taken from the environment (REDIS_HOST / REDIS_PORT) first,
# then from config, then from the defaults localhost:6379.
redis_host = os.getenv("REDIS_HOST", config["redis"].get("host", "localhost"))
redis_port = os.getenv("REDIS_PORT", config["redis"].get("port", 6379))
redis_url = config["redis"].get("uri") or f"redis://{redis_host}:{redis_port}"
# Create the client exactly once, from the resolved URL.
redis = aioredis.from_url(redis_url)
limiter = Limiter(
key_func=get_remote_address,

View File

@@ -116,74 +116,107 @@
<!-- Main Content -->
<main class="flex-1 overflow-auto p-4 space-y-4">
<!-- System Health Bar -->
<section class="bg-surface rounded-lg border border-border p-4">
<h2 class="text-sm font-medium mb-3 text-primary">System Health</h2>
<!-- System Health & Infrastructure (side by side) -->
<div class="grid grid-cols-2 gap-4">
<!-- System Health -->
<section class="bg-surface rounded-lg border border-border p-3">
<h2 class="text-sm font-medium mb-2 text-primary">System Health</h2>
<div class="grid grid-cols-4 gap-4 mb-4">
<!-- CPU -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">CPU</span>
<span id="cpu-percent" class="text-light">--%</span>
<!-- Row 1: CPU and Memory -->
<div class="grid grid-cols-2 gap-3 mb-2">
<!-- CPU -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">CPU</span>
<span id="cpu-percent" class="text-light">--%</span>
</div>
<div class="w-full bg-dark rounded-full h-2">
<div id="cpu-bar" class="progress-bar h-2 rounded-full bg-primary" style="width: 0%"></div>
</div>
</div>
<div class="w-full bg-dark rounded-full h-2">
<div id="cpu-bar" class="progress-bar h-2 rounded-full bg-primary" style="width: 0%"></div>
<!-- Memory -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Memory</span>
<span id="mem-percent" class="text-light">--%</span>
</div>
<div class="w-full bg-dark rounded-full h-2">
<div id="mem-bar" class="progress-bar h-2 rounded-full bg-accent" style="width: 0%"></div>
</div>
</div>
</div>
<!-- Memory -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Memory</span>
<span id="mem-percent" class="text-light">--%</span>
<!-- Row 2: Network and Uptime -->
<div class="grid grid-cols-2 gap-3 mb-2">
<!-- Network -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Network</span>
<span id="net-io" class="text-light">--</span>
</div>
<div class="text-xs text-secondary"><span id="net-sent">0</span> / ⬇<span id="net-recv">0</span> MB</div>
</div>
<div class="w-full bg-dark rounded-full h-2">
<div id="mem-bar" class="progress-bar h-2 rounded-full bg-accent" style="width: 0%"></div>
<!-- Uptime -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Uptime</span>
<span id="uptime" class="text-light">--</span>
</div>
<div class="text-xs text-secondary" id="last-update">Live: --:--:--</div>
</div>
</div>
<!-- Network -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Network</span>
<span id="net-io" class="text-light">--</span>
<!-- Pool Status -->
<div class="border-t border-border pt-2">
<div class="grid grid-cols-3 gap-3 text-xs">
<div>
<span class="text-secondary">🔥 Permanent:</span>
<span id="pool-perm" class="text-primary ml-1">INACTIVE (0MB)</span>
</div>
<div>
<span class="text-secondary">♨️ Hot:</span>
<span id="pool-hot" class="text-accent ml-1">0 (0MB)</span>
</div>
<div>
<span class="text-secondary">❄️ Cold:</span>
<span id="pool-cold" class="text-light ml-1">0 (0MB)</span>
</div>
</div>
<div class="mt-1 text-xs text-secondary">
<span>Janitor: </span><span id="janitor-status">adaptive</span> |
<span>Memory pressure: </span><span id="mem-pressure">LOW</span>
</div>
<div class="text-xs text-secondary"><span id="net-sent">0</span> MB / ⬇<span id="net-recv">0</span> MB</div>
</div>
</section>
<!-- Uptime -->
<div>
<div class="flex justify-between text-xs mb-1">
<span class="text-secondary">Uptime</span>
<span id="uptime" class="text-light">--</span>
</div>
<div class="text-xs text-secondary" id="last-update">Updated: never</div>
<!-- Infrastructure Section -->
<section id="containers-section" class="bg-surface rounded-lg border border-border p-3" style="display: none;">
<div class="flex items-center justify-between mb-3">
<h2 class="text-sm font-medium text-primary">📦 Infrastructure</h2>
<div class="flex items-center space-x-2">
<span class="text-xs text-secondary">Mode:</span>
<span id="deployment-mode" class="text-xs text-primary font-medium">single</span>
<span class="text-xs text-secondary">|</span>
<span class="text-xs text-secondary">Containers:</span>
<span id="container-count" class="text-xs text-accent font-medium">1</span>
</div>
</div>
<!-- Pool Status -->
<div class="border-t border-border pt-3">
<div class="grid grid-cols-3 gap-4 text-xs">
<div>
<span class="text-secondary">🔥 Permanent:</span>
<span id="pool-perm" class="text-primary ml-2">INACTIVE (0MB)</span>
</div>
<div>
<span class="text-secondary">♨️ Hot:</span>
<span id="pool-hot" class="text-accent ml-2">0 (0MB)</span>
</div>
<div>
<span class="text-secondary">❄️ Cold:</span>
<span id="pool-cold" class="text-light ml-2">0 (0MB)</span>
</div>
</div>
<div class="mt-2 text-xs text-secondary">
<span>Janitor: </span><span id="janitor-status">adaptive</span> |
<span>Memory pressure: </span><span id="mem-pressure">LOW</span>
</div>
<!-- Container Filter Buttons -->
<div id="container-filters" class="flex flex-wrap gap-2 mb-3">
<button class="container-filter-btn px-3 py-1 rounded text-xs bg-primary text-dark font-medium" data-container="all">
All
</button>
</div>
<!-- Container Grid -->
<div id="containers-grid" class="grid grid-cols-3 gap-3 text-xs">
<!-- Containers will be populated here -->
</div>
</section>
</div>
<!-- Live Activity Grid (2x2) -->
<div class="grid grid-cols-2 gap-4">
@@ -223,11 +256,12 @@
<th class="py-1 pr-2">Age</th>
<th class="py-1 pr-2">Used</th>
<th class="py-1 pr-2">Hits</th>
<th class="py-1 pr-2">Container</th>
<th class="py-1">Act</th>
</tr>
</thead>
<tbody id="browsers-table-body">
<tr><td colspan="6" class="text-center py-4 text-secondary">No browsers</td></tr>
<tr><td colspan="7" class="text-center py-4 text-secondary">No browsers</td></tr>
</tbody>
</table>
</div>
@@ -356,6 +390,16 @@
}
function connectWebSocket() {
// Clean up existing connection first to prevent resource leaks
if (websocket) {
try {
websocket.close();
} catch (e) {
console.error('Error closing old WebSocket:', e);
}
websocket = null;
}
if (wsReconnectAttempts >= MAX_WS_RECONNECT) {
console.log('Max WebSocket reconnect attempts reached, falling back to polling');
useWebSocket = false;
@@ -370,9 +414,24 @@
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${protocol}//${window.location.host}/monitor/ws`;
websocket = new WebSocket(wsUrl);
try {
websocket = new WebSocket(wsUrl);
} catch (e) {
console.error('Failed to create WebSocket:', e);
setTimeout(() => connectWebSocket(), 2000 * wsReconnectAttempts);
return;
}
// Set connection timeout to prevent indefinite connection attempts
const connectionTimeout = setTimeout(() => {
if (websocket && websocket.readyState === WebSocket.CONNECTING) {
console.log('WebSocket connection timeout');
websocket.close();
}
}, 5000);
websocket.onopen = () => {
clearTimeout(connectionTimeout);
console.log('WebSocket connected');
wsReconnectAttempts = 0;
updateConnectionStatus('connected');
@@ -385,15 +444,19 @@
};
websocket.onerror = (error) => {
clearTimeout(connectionTimeout);
console.error('WebSocket error:', error);
};
websocket.onclose = () => {
console.log('WebSocket closed');
websocket.onclose = (event) => {
clearTimeout(connectionTimeout);
console.log(`WebSocket closed: code=${event.code}, reason=${event.reason}`);
updateConnectionStatus('disconnected', 'Reconnecting...');
if (useWebSocket) {
setTimeout(connectWebSocket, 2000 * wsReconnectAttempts);
websocket = null; // Clear reference
if (useWebSocket && wsReconnectAttempts < MAX_WS_RECONNECT) {
setTimeout(() => connectWebSocket(), 2000 * wsReconnectAttempts);
} else {
startAutoRefresh();
}
@@ -459,18 +522,28 @@
}
function updateRequestsDisplay(requests) {
// Filter requests based on current container filter
const filteredActive = currentContainerFilter === 'all'
? requests.active
: requests.active.filter(r => r.container_id === currentContainerFilter);
const filteredCompleted = currentContainerFilter === 'all'
? requests.completed
: requests.completed.filter(r => r.container_id === currentContainerFilter);
// Update active requests count
const activeCount = document.getElementById('active-count');
if (activeCount) activeCount.textContent = requests.active.length;
if (activeCount) activeCount.textContent = filteredActive.length;
// Update active requests list
const activeList = document.getElementById('active-requests-list');
if (activeList) {
if (requests.active.length === 0) {
if (filteredActive.length === 0) {
activeList.innerHTML = '<div class="text-secondary text-center py-2">No active requests</div>';
} else {
activeList.innerHTML = requests.active.map(req => `
activeList.innerHTML = filteredActive.map(req => `
<div class="flex items-center justify-between p-2 bg-dark rounded border border-border">
<span class="text-accent text-xs">${getContainerLabel(req.container_id)}</span>
<span class="text-primary">${req.id.substring(0, 8)}</span>
<span class="text-secondary">${req.endpoint}</span>
<span class="text-light truncate max-w-[200px]" title="${req.url}">${req.url}</span>
@@ -484,11 +557,12 @@
// Update completed requests
const completedList = document.getElementById('completed-requests-list');
if (completedList) {
if (requests.completed.length === 0) {
if (filteredCompleted.length === 0) {
completedList.innerHTML = '<div class="text-secondary text-center py-2">No completed requests</div>';
} else {
completedList.innerHTML = requests.completed.map(req => `
completedList.innerHTML = filteredCompleted.map(req => `
<div class="flex items-center gap-3 p-2 bg-dark rounded">
<span class="text-accent text-xs w-12 flex-shrink-0">${getContainerLabel(req.container_id)}</span>
<span class="text-secondary w-16 flex-shrink-0">${req.id.substring(0, 8)}</span>
<span class="text-secondary w-16 flex-shrink-0">${req.endpoint}</span>
<span class="text-light truncate flex-1" title="${req.url}">${req.url}</span>
@@ -511,6 +585,14 @@
const typeIcon = b.type === 'permanent' ? '🔥' : b.type === 'hot' ? '♨️' : '❄️';
const typeColor = b.type === 'permanent' ? 'text-primary' : b.type === 'hot' ? 'text-accent' : 'text-light';
// Check if should display based on filter
const shouldDisplay = currentContainerFilter === 'all' ||
b.container_id === currentContainerFilter;
if (!shouldDisplay) return '';
// Find container label (C-1, C-2, etc)
const containerLabel = getContainerLabel(b.container_id);
return `
<tr class="border-t border-border hover:bg-dark">
<td class="py-1 pr-2"><span class="${typeColor}">${typeIcon} ${b.type}</span></td>
@@ -518,6 +600,7 @@
<td class="py-1 pr-2">${formatSeconds(b.age_seconds || 0)}</td>
<td class="py-1 pr-2">${formatSeconds(b.last_used_seconds || 0)}</td>
<td class="py-1 pr-2">${b.hits}</td>
<td class="py-1 pr-2 text-accent text-xs">${containerLabel}</td>
<td class="py-1">
${b.killable ? `
<button onclick="killBrowser('${b.sig}')" class="text-red-500 hover:underline text-xs">X</button>
@@ -553,16 +636,23 @@
function updateJanitorDisplay(events) {
const janitorLog = document.getElementById('janitor-log');
if (janitorLog) {
if (events.length === 0) {
// Filter events based on current container filter
const filtered = currentContainerFilter === 'all'
? events
: events.filter(e => e.container_id === currentContainerFilter);
if (filtered.length === 0) {
janitorLog.innerHTML = '<div class="text-secondary text-center py-4">No events yet</div>';
} else {
janitorLog.innerHTML = events.slice(0, 10).reverse().map(evt => {
janitorLog.innerHTML = filtered.slice(0, 10).reverse().map(evt => {
const time = new Date(evt.timestamp * 1000).toLocaleTimeString();
const icon = evt.type === 'close_cold' ? '🧹❄️' : evt.type === 'close_hot' ? '🧹♨️' : '⬆️';
const details = JSON.stringify(evt.details);
const containerLabel = getContainerLabel(evt.container_id);
return `<div class="p-2 bg-dark rounded">
<span class="text-secondary">${time}</span>
<span class="text-accent text-xs">${containerLabel}</span>
<span class="text-secondary ml-2">${time}</span>
<span>${icon}</span>
<span class="text-primary">${evt.type}</span>
<span class="text-secondary">sig=${evt.sig}</span>
@@ -1059,10 +1149,90 @@
return `${m}m ${s}s`;
}
// ========== Containers Management ==========
// Currently selected container filter: 'all', or the value taken from a filter
// button's data-container attribute (a container id). The display functions
// compare each record's container_id against this value.
let currentContainerFilter = 'all';
// Lookup table rebuilt by fetchContainers(): both a container's hostname and its
// id map to the same short display label (C-1, C-2, etc).
let containerMapping = {}; // Maps container_id to label (C-1, C-2, etc)
/**
 * Resolve a short display label for a container.
 *
 * @param {string} containerId - Container id or hostname attached to a record.
 * @returns {string} The mapped label (e.g. "C-1") when the key is known to
 *   containerMapping; otherwise the first 8 characters of the identifier, or
 *   'unknown' when the identifier is missing or empty.
 */
function getContainerLabel(containerId) {
    // The mapping is keyed by both hostname and id, so one lookup covers both.
    const mappedLabel = containerMapping[containerId];
    if (mappedLabel) return mappedLabel;

    // No mapping yet: fall back to a truncated identifier.
    const shortId = containerId?.substring(0, 8);
    return shortId || 'unknown';
}
/**
 * Fetch the container list from /monitor/containers and refresh the
 * containers section of the dashboard.
 *
 * Side effects:
 *  - updates the #deployment-mode and #container-count text,
 *  - rebuilds containerMapping (hostname and id -> "C-n" label),
 *  - resets currentContainerFilter to 'all' when the selected container is no
 *    longer reported by the server,
 *  - shows #containers-section with filter buttons and a status grid when more
 *    than one container is reported, hides it otherwise.
 *
 * Errors (network failure, non-2xx status, malformed payload) are logged to the
 * console and never thrown to the caller.
 */
async function fetchContainers() {
    // Escape server-provided text before it is interpolated into innerHTML
    // templates so ids/hostnames can never be parsed as markup.
    const escapeHtml = (value) => String(value ?? '').replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    try {
        const res = await fetch('/monitor/containers');
        if (!res.ok) {
            // Surface HTTP errors explicitly instead of failing later on an
            // unexpected error body.
            throw new Error(`HTTP ${res.status}`);
        }
        const data = await res.json();
        const containers = data.containers;

        document.getElementById('deployment-mode').textContent = data.mode;
        document.getElementById('container-count').textContent = data.count;

        // Build container ID to label mapping.
        // Use hostname as primary key (friendly name like "crawl4ai-1");
        // also map id for backwards compatibility.
        containerMapping = {};
        containers.forEach((c, i) => {
            const label = `C-${i + 1}`;
            containerMapping[c.hostname] = label;
            containerMapping[c.id] = label;
        });

        // If the selected container disappeared (e.g. after scaling down), a
        // stale filter would hide every record with no button left to clear
        // it, so fall back to showing all containers.
        if (currentContainerFilter !== 'all' &&
            !containers.some(c => c.id === currentContainerFilter)) {
            currentContainerFilter = 'all';
        }

        // Show section only if multi-container
        const section = document.getElementById('containers-section');
        if (data.count > 1) {
            section.style.display = 'block';

            // Update filter buttons
            const filtersDiv = document.getElementById('container-filters');
            filtersDiv.innerHTML = `
<button class="container-filter-btn px-3 py-1 rounded text-xs ${currentContainerFilter === 'all' ? 'bg-primary text-dark' : 'bg-dark text-secondary'} font-medium" data-container="all">All</button>
${containers.map((c, i) => `
<button class="container-filter-btn px-3 py-1 rounded text-xs ${currentContainerFilter === c.id ? 'bg-primary text-dark' : 'bg-dark text-secondary'}" data-container="${escapeHtml(c.id)}">C-${i + 1}</button>
`).join('')}
`;

            // Add click handlers to filter buttons. The buttons are recreated
            // on every refresh, so handlers never accumulate.
            document.querySelectorAll('.container-filter-btn').forEach(btn => {
                btn.addEventListener('click', () => {
                    currentContainerFilter = btn.dataset.container;
                    fetchContainers(); // Refresh to update button styles
                    // Re-fetch all data with filter applied
                    fetchRequests();
                    fetchBrowsers();
                    fetchJanitorLogs();
                    fetchErrorLogs();
                });
            });

            // Update containers grid
            const grid = document.getElementById('containers-grid');
            grid.innerHTML = containers.map((c, i) => `
<div class="p-3 bg-dark rounded border ${currentContainerFilter === c.id || currentContainerFilter === 'all' ? 'border-primary' : 'border-border'}">
<div class="flex items-center justify-between mb-2">
<span class="text-primary font-medium">C-${i + 1}</span>
<span class="text-xs ${c.healthy ? 'text-accent' : 'text-red-500'}">${c.healthy ? '🟢' : '🔴'}</span>
</div>
<div class="text-xs text-secondary truncate" title="${escapeHtml(c.hostname)}">${escapeHtml(c.hostname)}</div>
</div>
`).join('');
        } else {
            section.style.display = 'none';
        }
    } catch (e) {
        console.error('Failed to fetch containers:', e);
    }
}
// ========== Filter change handler ==========
// Re-fetch the requests list whenever the #filter-requests control changes.
// The optional chaining makes this a no-op when the element is not on the page.
document.getElementById('filter-requests')?.addEventListener('change', fetchRequests);
// ========== Initialize ==========
// Fetch containers info on load. fetchContainers() is async and is not awaited
// here, so the WebSocket connection below starts without waiting for it.
fetchContainers();
// Try WebSocket first, fallback to polling on failure
connectWebSocket();
</script>

View File

@@ -203,4 +203,51 @@ def get_container_memory_percent() -> float:
except:
# Non-container or unsupported: fallback to host
import psutil
return psutil.virtual_memory().percent
return psutil.virtual_memory().percent
def get_container_id() -> str:
    """Return the identifier of the current container.

    The identifier is whatever ``socket.gethostname()`` reports for this
    process (the hostname in Docker).
    """
    from socket import gethostname

    return gethostname()
def detect_deployment_mode() -> tuple[str, list[dict]]:
    """Detect if running in single/swarm/compose mode and get container list.

    Detection order:

    1. Compose: the service name ``crawl4ai`` resolves to more than one
       distinct IP address. One placeholder entry is produced per address
       (``container-N`` / ``crawl4ai-N``); these are synthetic labels, not
       real container IDs or hostnames.
    2. Swarm: the local hostname has more than two dot-separated parts
       (``service.slot.task_id``). Only the current container is reported.
    3. Single: anything else. Only the current container is reported.

    Returns:
        (mode, containers) where mode is "single"|"swarm"|"compose" and
        containers is a list of ``{"id", "hostname", "healthy"}`` dicts.
        ``healthy`` is always ``True``; no health probing is done here.
    """
    import socket

    my_hostname = socket.gethostname()

    # Check if we're behind nginx (Compose mode indicator): in Compose the
    # service name resolves to multiple IPs, one per replica.
    try:
        addrs = socket.getaddrinfo("crawl4ai", None)
    except OSError:
        # socket.gaierror is an OSError subclass: the service name is not
        # resolvable, so this is not a Compose deployment. Catching only
        # OSError keeps KeyboardInterrupt/SystemExit propagating.
        addrs = []

    # getaddrinfo yields one entry per socket type, so de-duplicate by address.
    unique_ips = {addr[4][0] for addr in addrs}
    if len(unique_ips) > 1:
        # Multiple IPs = Compose with replicas
        containers = [
            {"id": f"container-{n}", "hostname": f"crawl4ai-{n}", "healthy": True}
            for n in range(1, len(unique_ips) + 1)
        ]
        return "compose", containers

    # Only the current container is known in the remaining modes.
    current = [{"id": my_hostname, "hostname": my_hostname, "healthy": True}]

    # Check for Swarm mode (TODO: needs swarm-specific detection).
    # For now, a hostname shaped like service.slot.task_id is treated as Swarm.
    if len(my_hostname.split(".")) > 2:
        return "swarm", current

    # Default: single container
    return "single", current

View File

@@ -1,43 +1,18 @@
version: '3.8'
# Shared configuration for all environments
x-base-config: &base-config
ports:
- "11235:11235" # Gunicorn port
env_file:
- .llm.env # API keys (create from .llm.env.example)
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
volumes:
- /dev/shm:/dev/shm # Chromium performance
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
user: "appuser"
services:
redis:
image: redis:alpine
command: redis-server --appendonly yes
volumes:
- redis_data:/data
networks:
- crawl4ai_net
restart: unless-stopped
crawl4ai:
# 1. Default: Pull multi-platform test image from Docker Hub
# 2. Override with local image via: IMAGE=local-test docker compose up
image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}}
# Local build config (used with --build)
build:
context: .
@@ -45,6 +20,58 @@ services:
args:
INSTALL_TYPE: ${INSTALL_TYPE:-default}
ENABLE_GPU: ${ENABLE_GPU:-false}
# Inherit shared config
<<: *base-config
# No ports exposed - access via nginx only
env_file:
- .llm.env
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
- LLM_PROVIDER=${LLM_PROVIDER:-}
- REDIS_HOST=redis
- REDIS_PORT=6379
volumes:
- /dev/shm:/dev/shm # Chromium performance
deploy:
replicas: 3 # Default to 3 replicas (can override with --scale)
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
user: "appuser"
depends_on:
- redis
networks:
- crawl4ai_net
nginx:
image: nginx:alpine
ports:
- "11235:80" # Expose port 11235 to host
volumes:
- ./crawl4ai/templates/nginx.conf.template:/etc/nginx/nginx.conf:ro
depends_on:
- crawl4ai
networks:
- crawl4ai_net
restart: unless-stopped
networks:
crawl4ai_net:
driver: bridge
volumes:
redis_data: