feat(docker): add multi-container cluster deployment with CLI management

Add comprehensive Docker cluster orchestration with horizontal scaling support.

CLI Commands:
- crwl server start/stop/restart/status/scale/logs
- Auto-detection: Single (N=1) → Swarm (N>1) → Compose (N>1 fallback)
- Support for 1-100 container replicas with zero-downtime scaling

Infrastructure:
- Nginx load balancing (round-robin for the API, sticky sessions for monitoring)
- Redis-based container discovery via heartbeats (30s interval)
- Real-time monitoring dashboard with cluster-wide visibility
- WebSocket aggregation from all containers

Security & Stability Fixes (12 critical issues):
- Add timeout protection to browser pool locks (prevent deadlocks)
- Implement Redis retry logic with exponential backoff
- Add container ID validation (prevent Redis key injection)
- Add CLI input sanitization (prevent shell injection)
- Add file locking for state management (prevent corruption)
- Fix WebSocket resource leaks and connection cleanup
- Add graceful degradation and circuit breakers

Configuration:
- RedisTTLConfig dataclass with environment variable support
- Template-based docker-compose.yml and nginx.conf generation
- Comprehensive error handling with actionable messages

Documentation:
- AGENT.md: Complete DevOps context for AI assistants
- MULTI_CONTAINER_ARCHITECTURE.md: Technical architecture guide
- Reorganized docs into deploy/docker/docs/
This commit is contained in:
unclecode
2025-10-19 13:31:14 +08:00
parent 73a5a7b0f5
commit 91f7b9d129
18 changed files with 5116 additions and 196 deletions

View File

@@ -3,14 +3,140 @@ from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
from pydantic import BaseModel
from typing import Optional
from monitor import get_monitor
from utils import detect_deployment_mode, get_container_id
import logging
import asyncio
import json
import re
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router for the monitoring endpoints; every route is served under /monitor.
router = APIRouter(prefix="/monitor", tags=["monitor"])
# ========== Security & Validation ==========
def validate_container_id(cid: str) -> bool:
    """Validate a container ID format to prevent Redis key injection.

    Accepts 1-64 characters drawn from ASCII letters, digits, dashes and
    underscores. That covers Docker container IDs (hexadecimal strings) and
    simple hostnames, while rejecting path traversal (../../), wildcards
    (**), whitespace and any other separator characters.

    Args:
        cid: Container ID to validate.

    Returns:
        True if ``cid`` is a non-empty string made only of allowed
        characters, False otherwise (including for non-string input).
    """
    if not isinstance(cid, str) or not cid:
        return False

    # Use fullmatch rather than match with a "$" anchor: "$" also matches
    # just before a trailing newline, so an ID ending in a line break would
    # otherwise slip through validation.
    return re.fullmatch(r'[a-zA-Z0-9_-]{1,64}', cid) is not None
# ========== Redis Aggregation Helpers ==========
async def _get_active_containers():
    """Get the validated container IDs from the active-containers Redis set.

    Reads the ``monitor:active_containers`` set, converts ``bytes`` members
    to ``str`` and keeps only the IDs accepted by ``validate_container_id``.

    Returns:
        A list of valid container ID strings. Members that fail validation
        are skipped with a warning. If the lookup itself fails, the error is
        logged and an empty list is returned (best effort, never raises).
    """
    try:
        monitor = get_monitor()
        members = await monitor.redis.smembers("monitor:active_containers")

        validated = []
        for member in members:
            # errors="replace": an undecodable member turns into a string
            # that fails validation below, instead of raising and causing
            # every container to be dropped by the outer except.
            if isinstance(member, bytes):
                cid_str = member.decode(errors="replace")
            else:
                cid_str = member

            if validate_container_id(cid_str):
                validated.append(cid_str)
            else:
                # !r escapes control characters so a rejected ID cannot
                # inject extra lines into the log.
                logger.warning(f"Invalid container ID format rejected: {cid_str!r}")

        return validated
    except Exception as e:
        logger.error(f"Failed to get active containers: {e}")
        return []
async def _collect_container_records(key_suffix, description):
    """Fetch one JSON list per active container from Redis and concatenate them.

    Args:
        key_suffix: Last segment of the ``monitor:{container_id}:{key_suffix}``
            Redis key to read for each container.
        description: Human-readable name of the data, used in log messages.

    Returns:
        A flat list with the entries of every container, in container order.
        A container whose key is absent is skipped silently; one whose value
        cannot be read, cannot be parsed, or is not a JSON list is skipped
        with a warning so that it cannot break the whole aggregation.
    """
    container_ids = await _get_active_containers()
    collected = []

    monitor = get_monitor()
    for container_id in container_ids:
        try:
            data = await monitor.redis.get(f"monitor:{container_id}:{key_suffix}")
            if not data:
                continue
            records = json.loads(data)
        except Exception as e:
            logger.warning(f"Failed to get {description} from {container_id}: {e}")
            continue

        if isinstance(records, list):
            collected.extend(records)
        else:
            # Extending with a dict or string would silently add keys or
            # characters instead of records.
            logger.warning(
                f"Ignoring non-list {description} payload from {container_id}"
            )

    return collected


def _most_recent(records, time_field, limit):
    """Return up to ``limit`` dict records, newest first by ``time_field``.

    Non-dict entries are dropped and a missing or null ``time_field`` sorts
    as 0, so a single malformed record cannot make the sort raise.
    """
    entries = [record for record in records if isinstance(record, dict)]
    entries.sort(key=lambda record: record.get(time_field) or 0, reverse=True)
    return entries[:limit]


async def _aggregate_active_requests():
    """Aggregate active requests from all containers.

    Returns:
        A list with the active-request entries of every active container.
    """
    return await _collect_container_records("active_requests", "active requests")


async def _aggregate_completed_requests(limit=100):
    """Aggregate completed requests from all containers.

    Args:
        limit: Maximum number of requests to return.

    Returns:
        Completed requests sorted by ``end_time`` (most recent first),
        truncated to ``limit`` entries.
    """
    all_requests = await _collect_container_records("completed", "completed requests")
    return _most_recent(all_requests, "end_time", limit)


async def _aggregate_janitor_events(limit=100):
    """Aggregate janitor events from all containers.

    Args:
        limit: Maximum number of events to return.

    Returns:
        Janitor events sorted by ``timestamp`` (most recent first),
        truncated to ``limit`` entries.
    """
    all_events = await _collect_container_records("janitor", "janitor events")
    return _most_recent(all_events, "timestamp", limit)


async def _aggregate_errors(limit=100):
    """Aggregate errors from all containers.

    Args:
        limit: Maximum number of errors to return.

    Returns:
        Errors sorted by ``timestamp`` (most recent first), truncated to
        ``limit`` entries.
    """
    all_errors = await _collect_container_records("errors", "errors")
    return _most_recent(all_errors, "timestamp", limit)
@router.get("/health")
async def get_health():
"""Get current system health snapshot."""
@@ -37,18 +163,23 @@ async def get_requests(status: str = "all", limit: int = 50):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
# Aggregate from all containers via Redis
active_requests = await _aggregate_active_requests()
completed_requests = await _aggregate_completed_requests(limit)
# Filter by status if needed
if status in ["success", "error"]:
is_success = (status == "success")
completed_requests = [r for r in completed_requests if r.get("success") == is_success]
if status == "active":
return {"active": monitor.get_active_requests(), "completed": []}
return {"active": active_requests, "completed": []}
elif status == "completed":
return {"active": [], "completed": monitor.get_completed_requests(limit)}
elif status in ["success", "error"]:
return {"active": [], "completed": monitor.get_completed_requests(limit, status)}
else: # "all"
return {"active": [], "completed": completed_requests}
else: # "all" or success/error
return {
"active": monitor.get_active_requests(),
"completed": monitor.get_completed_requests(limit)
"active": active_requests,
"completed": completed_requests
}
except Exception as e:
logger.error(f"Error getting requests: {e}")
@@ -60,8 +191,13 @@ async def get_browsers():
"""Get detailed browser pool information."""
try:
monitor = get_monitor()
container_id = get_container_id()
browsers = await monitor.get_browser_list()
# Add container_id to each browser
for browser in browsers:
browser["container_id"] = container_id
# Calculate summary stats
total_browsers = len(browsers)
total_memory = sum(b["memory_mb"] for b in browsers)
@@ -77,7 +213,8 @@ async def get_browsers():
"total_count": total_browsers,
"total_memory_mb": total_memory,
"reuse_rate_percent": round(reuse_rate, 1)
}
},
"container_id": container_id
}
except Exception as e:
logger.error(f"Error getting browsers: {e}")
@@ -125,8 +262,9 @@ async def get_janitor_log(limit: int = 100):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
return {"events": monitor.get_janitor_log(limit)}
# Aggregate from all containers via Redis
events = await _aggregate_janitor_events(limit)
return {"events": events}
except Exception as e:
logger.error(f"Error getting janitor log: {e}")
raise HTTPException(500, str(e))
@@ -140,8 +278,9 @@ async def get_errors_log(limit: int = 100):
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
try:
monitor = get_monitor()
return {"errors": monitor.get_errors_log(limit)}
# Aggregate from all containers via Redis
errors = await _aggregate_errors(limit)
return {"errors": errors}
except Exception as e:
logger.error(f"Error getting errors log: {e}")
raise HTTPException(500, str(e))
@@ -350,15 +489,57 @@ async def reset_stats():
raise HTTPException(500, str(e))
@router.get("/containers")
async def get_containers():
    """Get container deployment info from Redis heartbeats.

    For every active container ID, reads ``monitor:heartbeat:{id}`` and
    reports the container when a heartbeat value is present. A container
    whose heartbeat cannot be read or parsed is skipped with a warning.

    Returns:
        A dict with:
        - ``mode``: "single" for zero or one container, "swarm" when several
          containers are present and at least one hostname has three or more
          dot-separated parts (service.slot.task_id), otherwise "compose".
        - ``container_id``: ID of the container serving this request.
        - ``containers``: list of ``{"id", "hostname", "healthy"}`` dicts.
        - ``count``: number of containers listed.

    Raises:
        HTTPException: 500 if the container list cannot be built.
    """
    try:
        monitor = get_monitor()
        container_ids = await _get_active_containers()

        containers = []
        for cid in container_ids:
            try:
                # Get heartbeat data
                data = await monitor.redis.get(f"monitor:heartbeat:{cid}")
                if not data:
                    continue
                info = json.loads(data)
                containers.append({
                    "id": info.get("id", cid),
                    "hostname": info.get("hostname", cid),
                    "healthy": True,  # If heartbeat exists, it's healthy
                })
            except Exception as e:
                logger.warning(f"Failed to get heartbeat for {cid}: {e}")

        # Determine mode. Zero visible containers is reported as "single"
        # rather than as a multi-container "compose" deployment.
        if len(containers) <= 1:
            mode = "single"
        elif any(str(c["hostname"]).count(".") >= 2 for c in containers):
            # Swarm hostname pattern: service.slot.task_id. str() guards
            # against a heartbeat that carries a non-string hostname.
            mode = "swarm"
        else:
            mode = "compose"

        return {
            "mode": mode,
            "container_id": get_container_id(),
            "containers": containers,
            "count": len(containers)
        }
    except Exception as e:
        logger.error(f"Error getting containers: {e}")
        raise HTTPException(500, str(e)) from e
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""WebSocket endpoint for real-time monitoring updates.
Sends updates every 2 seconds with:
- Health stats
- Active/completed requests
- Browser pool status
- Timeline data
Sends aggregated updates every 2 seconds from all containers with:
- Health stats (local container)
- Active/completed requests (aggregated from all containers)
- Browser pool status (local container only - not in Redis)
- Timeline data (local container - TODO: aggregate from Redis)
- Janitor events (aggregated from all containers)
- Errors (aggregated from all containers)
"""
await websocket.accept()
logger.info("WebSocket client connected")
@@ -366,24 +547,46 @@ async def websocket_endpoint(websocket: WebSocket):
try:
while True:
try:
# Gather all monitoring data
# Gather aggregated monitoring data from Redis
monitor = get_monitor()
container_id = get_container_id()
# Get container info
containers_info = await get_containers()
# AGGREGATE data from all containers via Redis
active_reqs = await _aggregate_active_requests()
completed_reqs = await _aggregate_completed_requests(limit=10)
janitor_events = await _aggregate_janitor_events(limit=10)
errors_log = await _aggregate_errors(limit=10)
# Local container data (not aggregated)
local_health = await monitor.get_health_summary()
browsers = await monitor.get_browser_list() # Browser list is local only
# Add container_id to browsers (they're local)
for browser in browsers:
browser["container_id"] = container_id
data = {
"timestamp": asyncio.get_event_loop().time(),
"health": await monitor.get_health_summary(),
"container_id": container_id, # This container handling the WebSocket
"is_aggregated": True, # Flag to indicate aggregated data
"local_health": local_health, # This container's health
"containers": containers_info.get("containers", []), # All containers
"requests": {
"active": monitor.get_active_requests(),
"completed": monitor.get_completed_requests(limit=10)
"active": active_reqs, # Aggregated from all containers
"completed": completed_reqs # Aggregated from all containers
},
"browsers": await monitor.get_browser_list(),
"browsers": browsers, # Local only (not in Redis)
"timeline": {
# TODO: Aggregate timeline from Redis (currently local only)
"memory": monitor.get_timeline_data("memory", "5m"),
"requests": monitor.get_timeline_data("requests", "5m"),
"browsers": monitor.get_timeline_data("browsers", "5m")
},
"janitor": monitor.get_janitor_log(limit=10),
"errors": monitor.get_errors_log(limit=10)
"janitor": janitor_events, # Aggregated from all containers
"errors": errors_log # Aggregated from all containers
}
# Send update to client