- Reorganized server management code: - Moved server_cli.py -> deploy/docker/cnode_cli.py - Moved server_manager.py -> deploy/docker/server_manager.py - Created fast Python-based installation (0.1s startup): - deploy/installer/cnode_pkg/ - Standalone package - deploy/installer/install-cnode.sh - Local installer - deploy/installer/deploy.sh - Remote installer for users - Added backward compatibility: - crawl4ai/cli.py: 'crwl server' redirects to 'cnode' - Updated tests to match new CLI structure (12/12 passing) - Automated sync workflow: - .githooks/pre-commit - Auto-syncs source to package - setup-hooks.sh - One-time setup for contributors - deploy/installer/sync-cnode.sh - Manual sync script Performance: - Startup time: 0.1s (49x faster than PyInstaller) - Size: ~50KB wrapper vs 8.8MB binary Commands: cnode start [--replicas N] # Start server/cluster cnode status # Check status cnode scale N # Scale replicas cnode logs [-f] # View logs cnode stop # Stop server
1155 lines
39 KiB
Python
1155 lines
39 KiB
Python
"""
|
|
Crawl4AI Docker Server Manager
|
|
|
|
Orchestrates single-node Docker deployments with automatic scaling:
|
|
- Single container (N=1)
|
|
- Docker Swarm (N>1, if available)
|
|
- Docker Compose + Nginx (N>1, fallback)
|
|
"""
|
|
|
|
import json
|
|
import subprocess
|
|
import time
|
|
import re
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Dict, Optional, Literal
|
|
from datetime import datetime
|
|
import socket
|
|
|
|
|
|
# Deployment strategies supported by ServerManager: one plain container,
# a Docker Swarm service, or a Docker Compose stack fronted by nginx.
ServerMode = Literal["single", "swarm", "compose"]
|
|
|
|
|
|
# ========== Input Validation Functions ==========
|
|
|
|
def validate_docker_image(image: str) -> bool:
    """Validate a Docker image reference before it reaches the CLI.

    Accepts the usual [registry/][namespace/]repo[:tag][@digest] shape
    while rejecting anything that could carry shell metacharacters.

    Args:
        image: Docker image string.

    Returns:
        True if the reference looks safe, False otherwise.
    """
    if not isinstance(image, str) or not image:
        return False

    # Hard length cap to avoid absurd inputs.
    if len(image) > 256:
        return False

    # Only characters legal in image references; blocks shell injection.
    if re.match(r'^[a-zA-Z0-9.\-/:_@]+$', image) is None:
        return False

    # Reject doubled separators (path traversal / empty path segments).
    return '..' not in image and '//' not in image
|
|
|
|
|
|
def validate_port(port: int) -> bool:
    """Validate that *port* is a usable TCP port number.

    Args:
        port: Port number.

    Returns:
        True if valid (1-65535), False otherwise.
    """
    # BUG FIX: bool is a subclass of int, so the plain isinstance check let
    # validate_port(True) pass as port 1. Reject bools explicitly.
    if isinstance(port, bool) or not isinstance(port, int):
        return False
    return 1 <= port <= 65535
|
|
|
|
|
|
def validate_env_file(path: str) -> bool:
    """Check that *path* names an existing, readable regular file.

    Args:
        path: File path to validate.

    Returns:
        True if the file exists and is readable, False otherwise.
    """
    if not isinstance(path, str) or not path:
        return False

    try:
        resolved = Path(path).resolve()
        return (
            resolved.exists()
            and resolved.is_file()
            and os.access(resolved, os.R_OK)
        )
    except Exception:
        # Any resolution/stat failure means the file is unusable.
        return False
|
|
|
|
|
|
def validate_replicas(replicas: int) -> bool:
    """Check that a replica count is a sane integer.

    Args:
        replicas: Number of replicas.

    Returns:
        True if valid (1-100), False otherwise.
    """
    if not isinstance(replicas, int):
        return False
    # 100 is an arbitrary sanity cap for a single-node deployment.
    return 1 <= replicas <= 100
|
|
|
|
|
|
class ServerManager:
    """Manages Crawl4AI Docker server lifecycle and orchestration."""

    def __init__(self):
        """Set up state-directory paths and make sure the directory exists."""
        base = Path.home() / ".crawl4ai" / "server"
        self.state_dir = base
        self.state_file = base / "state.json"
        self.compose_file = base / "docker-compose.yml"
        self.nginx_conf = base / "nginx.conf"
        # Created eagerly so later writes never race against a missing dir.
        base.mkdir(parents=True, exist_ok=True)
|
|
|
|
# ========== Public API ==========
|
|
|
|
    async def start(
        self,
        replicas: int = 1,
        mode: str = "auto",
        port: int = 11235,
        env_file: Optional[str] = None,
        image: str = "unclecode/crawl4ai:latest",
        **kwargs
    ) -> Dict:
        """Start Crawl4AI server with specified configuration.

        Runs pre-flight checks (no recorded deployment, Docker reachable,
        port free, image available), resolves the deployment mode, delegates
        to the mode-specific starter, and persists state on success.

        Args:
            replicas: Number of container replicas (default: 1)
            mode: Deployment mode - 'auto', 'single', 'swarm', or 'compose'
            port: External port to expose (default: 11235)
            env_file: Path to environment file
            image: Docker image to use
            **kwargs: Additional docker run arguments (single mode honors
                only a whitelist; see _start_single)

        Returns:
            Dict with status and deployment info
        """
        # Check if already running.
        # NOTE(review): only the state file is consulted here; if containers
        # died externally, the stale file blocks start until status() or
        # cleanup() clears it - confirm this is intended.
        state = self._load_state()
        if state:
            return {
                "success": False,
                "message": "Server already running",
                "current_state": state
            }

        # Validate Docker is available before doing anything else.
        if not self._is_docker_available():
            return {
                "success": False,
                "error": "Docker daemon not running. Please start Docker first."
            }

        # Check port availability so we fail fast rather than mid-deploy.
        if not self._is_port_available(port):
            return {
                "success": False,
                "error": f"Port {port} is already in use"
            }

        # Detect deployment mode ("auto" -> single/swarm/compose).
        detected_mode = self._detect_mode(replicas, mode)

        # Ensure image is available locally (pulls registry images).
        if not self._ensure_image(image):
            return {
                "success": False,
                "error": f"Failed to pull image {image}"
            }

        # Start based on mode; each starter validates its own inputs.
        if detected_mode == "single":
            result = self._start_single(port, env_file, image, **kwargs)
        elif detected_mode == "swarm":
            result = self._start_swarm(replicas, port, env_file, image, **kwargs)
        elif detected_mode == "compose":
            result = self._start_compose(replicas, port, env_file, image, **kwargs)
        else:
            return {
                "success": False,
                "error": f"Unknown mode: {detected_mode}"
            }

        if result["success"]:
            # Save state so status()/stop()/scale() can find the deployment;
            # starters contribute mode-specific ids via "state_data".
            self._save_state({
                "mode": detected_mode,
                "replicas": replicas,
                "port": port,
                "image": image,
                "env_file": env_file,
                "started_at": datetime.now().isoformat(),
                **result.get("state_data", {})
            })

        return result
|
|
|
|
    async def status(self) -> Dict:
        """Get current server status.

        Cross-checks the persisted state file against the live Docker
        resources; a stale state file (containers gone) is removed so the
        next start() is not blocked.

        Returns:
            Dict with a 'running' flag; when running, also mode, replicas,
            port, image, started_at and a human-formatted uptime.
        """
        state = self._load_state()

        if not state:
            return {
                "running": False,
                "message": "No server is currently running"
            }

        mode = state["mode"]

        # Check the actual container/service status for the recorded mode.
        if mode == "single":
            running = self._check_container_running(state.get("container_id"))
        elif mode == "swarm":
            running = self._check_service_running(state.get("service_name"))
        elif mode == "compose":
            running = self._check_compose_running(state.get("compose_project"))
        else:
            # Unknown mode in the state file - treat as not running.
            running = False

        if not running:
            # State file exists but containers are gone - clean up so the
            # stale record does not block a future start().
            self._clear_state()
            return {
                "running": False,
                "message": "State file exists but containers stopped externally"
            }

        return {
            "running": True,
            "mode": mode,
            "replicas": state.get("replicas", 1),
            "port": state.get("port", 11235),
            "image": state.get("image"),
            "started_at": state.get("started_at"),
            "uptime": self._calculate_uptime(state.get("started_at"))
        }
|
|
|
|
async def stop(self, remove_volumes: bool = False) -> Dict:
|
|
"""Stop running server.
|
|
|
|
Args:
|
|
remove_volumes: Remove associated volumes
|
|
|
|
Returns:
|
|
Dict with stop status
|
|
"""
|
|
state = self._load_state()
|
|
|
|
if not state:
|
|
return {
|
|
"success": False,
|
|
"message": "No server is running"
|
|
}
|
|
|
|
mode = state["mode"]
|
|
|
|
try:
|
|
if mode == "single":
|
|
self._stop_single(state.get("container_id"), remove_volumes)
|
|
elif mode == "swarm":
|
|
self._stop_swarm(state.get("service_name"))
|
|
elif mode == "compose":
|
|
self._stop_compose(state.get("compose_project"), remove_volumes)
|
|
|
|
self._clear_state()
|
|
|
|
return {
|
|
"success": True,
|
|
"message": f"Server stopped ({mode} mode)"
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"success": False,
|
|
"error": str(e)
|
|
}
|
|
|
|
async def cleanup(self, force: bool = False) -> Dict:
|
|
"""Force cleanup of all Crawl4AI Docker resources.
|
|
|
|
Args:
|
|
force: Force cleanup even if state file doesn't exist
|
|
|
|
Returns:
|
|
Dict with cleanup status
|
|
"""
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
removed_count = 0
|
|
messages = []
|
|
|
|
try:
|
|
# Try to stop via state file first
|
|
if not force:
|
|
state = self._load_state()
|
|
if state:
|
|
stop_result = await self.stop(remove_volumes=True)
|
|
if stop_result["success"]:
|
|
return {
|
|
"success": True,
|
|
"removed": 1,
|
|
"message": "Stopped via state file"
|
|
}
|
|
|
|
# Force cleanup - find and remove all Crawl4AI resources
|
|
logger.info("Force cleanup: removing all Crawl4AI Docker resources")
|
|
|
|
# Remove all crawl4ai containers
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "ps", "-a", "--filter", "name=crawl4ai", "--format", "{{.ID}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=10
|
|
)
|
|
container_ids = result.stdout.strip().split('\n')
|
|
container_ids = [cid for cid in container_ids if cid]
|
|
|
|
for cid in container_ids:
|
|
subprocess.run(["docker", "rm", "-f", cid], capture_output=True, timeout=10)
|
|
removed_count += 1
|
|
messages.append(f"Removed {len(container_ids)} crawl4ai containers")
|
|
except Exception as e:
|
|
logger.warning(f"Error removing containers: {e}")
|
|
|
|
# Remove nginx containers
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "ps", "-a", "--filter", "name=nginx", "--format", "{{.ID}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=10
|
|
)
|
|
nginx_ids = result.stdout.strip().split('\n')
|
|
nginx_ids = [nid for nid in nginx_ids if nid]
|
|
|
|
for nid in nginx_ids:
|
|
subprocess.run(["docker", "rm", "-f", nid], capture_output=True, timeout=10)
|
|
removed_count += len(nginx_ids)
|
|
if nginx_ids:
|
|
messages.append(f"Removed {len(nginx_ids)} nginx containers")
|
|
except Exception as e:
|
|
logger.warning(f"Error removing nginx: {e}")
|
|
|
|
# Remove redis containers
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "ps", "-a", "--filter", "name=redis", "--format", "{{.ID}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=10
|
|
)
|
|
redis_ids = result.stdout.strip().split('\n')
|
|
redis_ids = [rid for rid in redis_ids if rid]
|
|
|
|
for rid in redis_ids:
|
|
subprocess.run(["docker", "rm", "-f", rid], capture_output=True, timeout=10)
|
|
removed_count += len(redis_ids)
|
|
if redis_ids:
|
|
messages.append(f"Removed {len(redis_ids)} redis containers")
|
|
except Exception as e:
|
|
logger.warning(f"Error removing redis: {e}")
|
|
|
|
# Clean up compose projects
|
|
for project in ["crawl4ai", "fix-docker"]:
|
|
try:
|
|
subprocess.run(
|
|
["docker", "compose", "-p", project, "down", "-v"],
|
|
capture_output=True,
|
|
timeout=30,
|
|
cwd=str(self.state_dir)
|
|
)
|
|
messages.append(f"Cleaned compose project: {project}")
|
|
except Exception:
|
|
pass
|
|
|
|
# Remove networks
|
|
try:
|
|
subprocess.run(["docker", "network", "prune", "-f"], capture_output=True, timeout=10)
|
|
messages.append("Pruned networks")
|
|
except Exception as e:
|
|
logger.warning(f"Error pruning networks: {e}")
|
|
|
|
# Clear state file
|
|
self._clear_state()
|
|
messages.append("Cleared state file")
|
|
|
|
return {
|
|
"success": True,
|
|
"removed": removed_count,
|
|
"message": "; ".join(messages)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Cleanup error: {e}")
|
|
return {
|
|
"success": False,
|
|
"message": f"Cleanup failed: {str(e)}"
|
|
}
|
|
|
|
async def scale(self, replicas: int) -> Dict:
|
|
"""Scale server to specified replica count.
|
|
|
|
Args:
|
|
replicas: Target number of replicas
|
|
|
|
Returns:
|
|
Dict with scaling status
|
|
"""
|
|
state = self._load_state()
|
|
|
|
if not state:
|
|
return {
|
|
"success": False,
|
|
"message": "No server is running"
|
|
}
|
|
|
|
mode = state["mode"]
|
|
|
|
if mode == "single":
|
|
return {
|
|
"success": False,
|
|
"error": "Cannot scale single container mode. Use 'crwl server stop' then 'crwl server start --replicas N'"
|
|
}
|
|
|
|
try:
|
|
if mode == "swarm":
|
|
self._scale_swarm(state["service_name"], replicas)
|
|
elif mode == "compose":
|
|
self._scale_compose(state["compose_project"], replicas)
|
|
|
|
# Update state
|
|
state["replicas"] = replicas
|
|
self._save_state(state)
|
|
|
|
return {
|
|
"success": True,
|
|
"message": f"Scaled to {replicas} replicas",
|
|
"mode": mode
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"success": False,
|
|
"error": str(e)
|
|
}
|
|
|
|
async def logs(self, follow: bool = False, tail: int = 100) -> str:
|
|
"""Get server logs.
|
|
|
|
Args:
|
|
follow: Follow log output
|
|
tail: Number of lines to show
|
|
|
|
Returns:
|
|
Log output as string
|
|
"""
|
|
state = self._load_state()
|
|
|
|
if not state:
|
|
return "No server is running"
|
|
|
|
mode = state["mode"]
|
|
|
|
try:
|
|
if mode == "single":
|
|
return self._logs_single(state["container_id"], follow, tail)
|
|
elif mode == "swarm":
|
|
return self._logs_swarm(state["service_name"], follow, tail)
|
|
elif mode == "compose":
|
|
return self._logs_compose(state["compose_project"], follow, tail)
|
|
except Exception as e:
|
|
return f"Error getting logs: {e}"
|
|
|
|
# ========== Mode Detection ==========
|
|
|
|
def _detect_mode(self, replicas: int, mode: str) -> ServerMode:
|
|
"""Detect deployment mode based on replicas and user preference."""
|
|
if mode != "auto":
|
|
return mode
|
|
|
|
if replicas == 1:
|
|
return "single"
|
|
|
|
# N>1: prefer Swarm if available, fallback to Compose
|
|
if self._is_swarm_available():
|
|
return "swarm"
|
|
|
|
return "compose"
|
|
|
|
def _is_swarm_available(self) -> bool:
|
|
"""Check if Docker Swarm is initialized and available."""
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "info", "--format", "{{.Swarm.LocalNodeState}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5
|
|
)
|
|
return result.stdout.strip() == "active"
|
|
except Exception:
|
|
return False
|
|
|
|
def _is_docker_available(self) -> bool:
|
|
"""Check if Docker daemon is running."""
|
|
try:
|
|
subprocess.run(
|
|
["docker", "ps"],
|
|
capture_output=True,
|
|
timeout=5,
|
|
check=True
|
|
)
|
|
return True
|
|
except Exception:
|
|
return False
|
|
|
|
def _is_port_available(self, port: int) -> bool:
|
|
"""Check if port is available for binding."""
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
try:
|
|
s.bind(('0.0.0.0', port))
|
|
return True
|
|
except OSError:
|
|
return False
|
|
|
|
def _ensure_image(self, image: str) -> bool:
|
|
"""Ensure Docker image is available locally, pull if needed."""
|
|
try:
|
|
# Check if image exists locally
|
|
result = subprocess.run(
|
|
["docker", "image", "inspect", image],
|
|
capture_output=True,
|
|
timeout=5
|
|
)
|
|
|
|
if result.returncode == 0:
|
|
return True
|
|
|
|
# Determine if this looks like a registry image
|
|
# Registry images have format: [registry/][namespace/]repository[:tag]
|
|
# Examples: unclecode/crawl4ai:latest, docker.io/library/nginx:latest
|
|
# Local-only: crawl4ai-local:latest, my-image:v1
|
|
|
|
# If it has a dot in the first part (before any slash), it's likely a registry
|
|
# Or if it has a slash, it's likely registry/namespace/repo format
|
|
parts = image.split("/")
|
|
is_registry_image = (
|
|
len(parts) > 1 and # Has slash
|
|
"." not in parts[0] and # First part isn't a domain (localhost.localdomain)
|
|
not parts[0].startswith("localhost") # Not localhost registry
|
|
)
|
|
|
|
if not is_registry_image:
|
|
return False # Local image doesn't exist
|
|
|
|
# Try to pull from registry
|
|
subprocess.run(
|
|
["docker", "pull", image],
|
|
capture_output=True,
|
|
check=True,
|
|
timeout=300
|
|
)
|
|
return True
|
|
except Exception:
|
|
return False
|
|
|
|
# ========== Single Container Mode ==========
|
|
|
|
    def _start_single(self, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
        """Start a single container with `docker run`.

        Validates all user-controlled inputs before building the command
        (as an argv list, shell=False), waits for the /health endpoint,
        and removes the container again if the health check never passes.

        Args:
            port: Host port mapped to the container's 11235.
            env_file: Optional env file passed via --env-file.
            image: Docker image reference.
            **kwargs: Extra docker-run flags; only a whitelist is honored.
                NOTE(review): keys are compared literally against flags
                like "--memory", so callers must pass them via
                **{"--memory": "2g"} - confirm callers actually do this.

        Returns:
            Dict with a success flag; on success, the container id is
            returned under "state_data" for persistence.
        """
        # Validate inputs to prevent injection attacks
        if not validate_port(port):
            return {
                "success": False,
                "error": f"Invalid port number: {port}. Must be between 1-65535."
            }

        if not validate_docker_image(image):
            return {
                "success": False,
                "error": f"Invalid Docker image format: {image}"
            }

        if env_file and not validate_env_file(env_file):
            return {
                "success": False,
                "error": f"Environment file not found or not readable: {env_file}"
            }

        cmd = [
            "docker", "run",
            "-d",  # Detached
            "--name", "crawl4ai_server",
            "-p", f"{port}:11235",
            "--shm-size=1g",  # Important for browser
        ]

        if env_file:
            # Use absolute path to prevent path traversal
            abs_env_file = str(Path(env_file).resolve())
            cmd.extend(["--env-file", abs_env_file])

        # Whitelist allowed Docker flags to prevent privilege escalation
        allowed_flags = {"--memory", "--cpus", "--restart", "--network"}
        for key, value in kwargs.items():
            if key in allowed_flags:
                cmd.append(key)
                if value is not True:  # Handle boolean flags
                    cmd.append(str(value))
            else:
                # Log ignored flags for debugging
                import logging
                logging.warning(f"Ignoring non-whitelisted Docker flag: {key}")

        cmd.append(image)

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            container_id = result.stdout.strip()

            # Wait for health check before reporting success.
            if self._wait_for_health(f"http://localhost:{port}/health"):
                return {
                    "success": True,
                    "message": f"Server started on port {port}",
                    "state_data": {"container_id": container_id}
                }
            else:
                # Cleanup failed container so the name is free for retries.
                subprocess.run(["docker", "rm", "-f", container_id], capture_output=True)
                return {
                    "success": False,
                    "error": "Container started but health check failed"
                }
        except subprocess.CalledProcessError as e:
            return {
                "success": False,
                "error": f"Failed to start container: {e.stderr}"
            }
|
|
|
|
def _stop_single(self, container_id: str, remove_volumes: bool):
|
|
"""Stop single container."""
|
|
cmd = ["docker", "rm", "-f"]
|
|
if remove_volumes:
|
|
cmd.append("-v")
|
|
cmd.append(container_id)
|
|
subprocess.run(cmd, check=True)
|
|
|
|
def _check_container_running(self, container_id: str) -> bool:
|
|
"""Check if container is running."""
|
|
if not container_id:
|
|
return False
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "inspect", "-f", "{{.State.Running}}", container_id],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5
|
|
)
|
|
return result.stdout.strip() == "true"
|
|
except Exception:
|
|
return False
|
|
|
|
def _logs_single(self, container_id: str, follow: bool, tail: int) -> str:
|
|
"""Get logs from single container."""
|
|
cmd = ["docker", "logs", "--tail", str(tail)]
|
|
if follow:
|
|
cmd.append("-f")
|
|
cmd.append(container_id)
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
return result.stdout
|
|
|
|
# ========== Swarm Mode ==========
|
|
|
|
    def _start_swarm(self, replicas: int, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
        """Start the server as a replicated service in Swarm mode.

        Validates inputs, initializes Swarm on this node if needed, creates
        the service, and waits until all replicas report running - removing
        the service again on failure.

        Args:
            replicas: Desired replica count (1-100).
            port: Host port published to the service's 11235.
            env_file: Optional env file passed via --env-file.
            image: Docker image reference.
            **kwargs: Accepted for signature parity with the other starters
                but not used here.

        Returns:
            Dict with a success flag; on success, service name/id are
            returned under "state_data" for persistence.
        """
        # Validate inputs to prevent injection attacks
        if not validate_replicas(replicas):
            return {
                "success": False,
                "error": f"Invalid replica count: {replicas}. Must be between 1-100."
            }

        if not validate_port(port):
            return {
                "success": False,
                "error": f"Invalid port number: {port}. Must be between 1-65535."
            }

        if not validate_docker_image(image):
            return {
                "success": False,
                "error": f"Invalid Docker image format: {image}"
            }

        if env_file and not validate_env_file(env_file):
            return {
                "success": False,
                "error": f"Environment file not found or not readable: {env_file}"
            }

        service_name = "crawl4ai"  # Static name (safe)

        # Initialize swarm if this node is not yet an active member.
        if not self._is_swarm_available():
            init_result = self._init_swarm()
            if not init_result:
                return {
                    "success": False,
                    "error": "Failed to initialize Docker Swarm. Use 'docker swarm init' manually."
                }

        cmd = [
            "docker", "service", "create",
            "--name", service_name,
            "--replicas", str(replicas),
            "--publish", f"{port}:11235",
            # tmpfs /dev/shm: browsers need large shared memory per replica.
            "--mount", "type=tmpfs,target=/dev/shm,tmpfs-size=1g",
            "--limit-memory", "4G",
        ]

        if env_file:
            # Use absolute path to prevent path traversal
            abs_env_file = str(Path(env_file).resolve())
            cmd.extend(["--env-file", abs_env_file])

        cmd.append(image)

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            service_id = result.stdout.strip()

            # Wait for service to be ready (check replicas)
            if self._wait_for_service(service_name, replicas):
                return {
                    "success": True,
                    "message": f"Swarm service started with {replicas} replicas",
                    "state_data": {
                        "service_name": service_name,
                        "service_id": service_id
                    }
                }
            else:
                # Cleanup failed service so the name is free for retries.
                subprocess.run(["docker", "service", "rm", service_name], capture_output=True)
                return {
                    "success": False,
                    "error": "Service created but replicas failed to start"
                }
        except subprocess.CalledProcessError as e:
            return {
                "success": False,
                "error": f"Failed to create Swarm service: {e.stderr}"
            }
|
|
|
|
def _init_swarm(self) -> bool:
|
|
"""Initialize Docker Swarm if not already initialized."""
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "swarm", "init"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=10
|
|
)
|
|
return result.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
def _wait_for_service(self, service_name: str, expected_replicas: int, timeout: int = 60) -> bool:
|
|
"""Wait for Swarm service replicas to be running."""
|
|
import time
|
|
start = time.time()
|
|
|
|
while time.time() - start < timeout:
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "service", "ls", "--filter", f"name={service_name}", "--format", "{{.Replicas}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5
|
|
)
|
|
|
|
if result.returncode == 0:
|
|
# Format is "2/3" (running/desired)
|
|
replicas_str = result.stdout.strip()
|
|
if "/" in replicas_str:
|
|
running, desired = replicas_str.split("/")
|
|
if int(running) == expected_replicas and int(desired) == expected_replicas:
|
|
return True
|
|
|
|
time.sleep(2)
|
|
except Exception:
|
|
time.sleep(2)
|
|
|
|
return False
|
|
|
|
def _stop_swarm(self, service_name: str):
|
|
"""Stop Swarm service."""
|
|
subprocess.run(
|
|
["docker", "service", "rm", service_name],
|
|
check=True,
|
|
capture_output=True
|
|
)
|
|
|
|
def _scale_swarm(self, service_name: str, replicas: int):
|
|
"""Scale Swarm service."""
|
|
subprocess.run(
|
|
["docker", "service", "scale", f"{service_name}={replicas}"],
|
|
check=True,
|
|
capture_output=True
|
|
)
|
|
|
|
def _check_service_running(self, service_name: str) -> bool:
|
|
"""Check if Swarm service is running."""
|
|
if not service_name:
|
|
return False
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "service", "ls", "--filter", f"name={service_name}", "--format", "{{.Name}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5
|
|
)
|
|
return service_name in result.stdout
|
|
except Exception:
|
|
return False
|
|
|
|
def _logs_swarm(self, service_name: str, follow: bool, tail: int) -> str:
|
|
"""Get logs from Swarm service."""
|
|
cmd = ["docker", "service", "logs", "--tail", str(tail)]
|
|
if follow:
|
|
cmd.append("-f")
|
|
cmd.append(service_name)
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
return result.stdout
|
|
|
|
# ========== Compose Mode ==========
|
|
|
|
    def _start_compose(self, replicas: int, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
        """Start the server as a Compose stack (N replicas behind nginx).

        Validates inputs, renders docker-compose.yml and nginx.conf from
        the packaged templates, brings the stack up with --scale, waits for
        health, and tears the stack down again on failure.

        Args:
            replicas: Desired replica count (1-100).
            port: Host port exposed by the stack.
            env_file: Optional env file (empty string substituted if None).
            image: Docker image reference.
            **kwargs: Accepted for signature parity with the other starters
                but not used here.

        Returns:
            Dict with a success flag; on success, the compose project name
            is returned under "state_data" for persistence.
        """
        # Validate inputs to prevent injection attacks
        if not validate_replicas(replicas):
            return {
                "success": False,
                "error": f"Invalid replica count: {replicas}. Must be between 1-100."
            }

        if not validate_port(port):
            return {
                "success": False,
                "error": f"Invalid port number: {port}. Must be between 1-65535."
            }

        if not validate_docker_image(image):
            return {
                "success": False,
                "error": f"Invalid Docker image format: {image}"
            }

        if env_file and not validate_env_file(env_file):
            return {
                "success": False,
                "error": f"Environment file not found or not readable: {env_file}"
            }

        project_name = "crawl4ai"  # Static name (safe)

        # Generate compose and nginx config files from packaged templates.
        try:
            self._generate_compose_file(replicas, port, env_file or "", image)
            self._generate_nginx_config()
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to generate config files: {e}"
            }

        # Start compose stack - use absolute path for compose file
        cmd = [
            "docker", "compose",
            "-f", str(self.compose_file.resolve()),
            "-p", project_name,
            "up", "-d",
            "--scale", f"crawl4ai={replicas}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True, cwd=str(self.state_dir))

            # Wait for services to be healthy before reporting success.
            if self._wait_for_compose_healthy(project_name, timeout=60):
                return {
                    "success": True,
                    "message": f"Compose stack started with {replicas} replicas",
                    "state_data": {
                        "compose_project": project_name
                    }
                }
            else:
                # Cleanup failed deployment so retries start clean.
                subprocess.run(
                    ["docker", "compose", "-f", str(self.compose_file), "-p", project_name, "down"],
                    capture_output=True,
                    cwd=str(self.state_dir)
                )
                return {
                    "success": False,
                    "error": "Compose stack started but health checks failed"
                }
        except subprocess.CalledProcessError as e:
            return {
                "success": False,
                "error": f"Failed to start Compose stack: {e.stderr}"
            }
|
|
|
|
    def _generate_compose_file(self, replicas: int, port: int, env_file: str, image: str):
        """Generate docker-compose.yml from the packaged template.

        Reads the template shipped next to this module, checks that the
        expected placeholders are present, substitutes them, and writes the
        result to self.compose_file.

        Args:
            replicas: Replica count substituted into ${REPLICAS}.
            port: Host port substituted into ${PORT}.
            env_file: Currently unused by the substitution below -
                NOTE(review): the template apparently has no ${ENV_FILE}
                placeholder; confirm env files are handled elsewhere.
            image: Image reference substituted into ${IMAGE}.

        Raises:
            FileNotFoundError: If the packaged template is missing.
            RuntimeError: If the template cannot be read or the output
                cannot be written.
            ValueError: If the template lacks required placeholders.
        """
        import os  # NOTE(review): redundant - os is already imported at module level

        # Get template path - check if we're in the package or dev environment
        template_path = Path(__file__).parent / "templates" / "docker-compose.template.yml"

        if not template_path.exists():
            raise FileNotFoundError(
                f"Docker Compose template not found: {template_path}\n"
                f"Please ensure crawl4ai package is correctly installed.\n"
                f"Try: pip install --force-reinstall crawl4ai"
            )

        try:
            with open(template_path) as f:
                template = f.read()
        except IOError as e:
            raise RuntimeError(f"Failed to read template {template_path}: {e}")

        # Validate template has required placeholders before substituting,
        # so a broken install fails loudly instead of producing bad YAML.
        required_vars = {"${IMAGE}", "${REPLICAS}", "${PORT}", "${NGINX_CONF}"}
        missing = required_vars - set(re.findall(r'\$\{[A-Z_]+\}', template))
        if missing:
            raise ValueError(f"Template missing required variables: {missing}")

        # Substitute variables
        content = template.replace("${IMAGE}", image)
        content = content.replace("${REPLICAS}", str(replicas))
        content = content.replace("${PORT}", str(port))
        content = content.replace("${NGINX_CONF}", str(self.nginx_conf))

        # Verify no unsubstituted variables remain (warn, don't fail).
        remaining = re.findall(r'\$\{[A-Z_]+\}', content)
        if remaining:
            import logging
            logging.warning(f"Unsubstituted variables in template: {remaining}")

        try:
            with open(self.compose_file, "w") as f:
                f.write(content)
        except IOError as e:
            raise RuntimeError(f"Failed to write compose file {self.compose_file}: {e}")
|
|
|
|
def _generate_nginx_config(self):
|
|
"""Generate nginx.conf from template with validation."""
|
|
template_path = Path(__file__).parent / "templates" / "nginx.conf.template"
|
|
|
|
if not template_path.exists():
|
|
raise FileNotFoundError(
|
|
f"Nginx template not found: {template_path}\n"
|
|
f"Please ensure crawl4ai package is correctly installed.\n"
|
|
f"Try: pip install --force-reinstall crawl4ai"
|
|
)
|
|
|
|
try:
|
|
with open(template_path) as f:
|
|
content = f.read()
|
|
except IOError as e:
|
|
raise RuntimeError(f"Failed to read nginx template {template_path}: {e}")
|
|
|
|
# Nginx template doesn't need variable substitution currently
|
|
try:
|
|
with open(self.nginx_conf, "w") as f:
|
|
f.write(content)
|
|
except IOError as e:
|
|
raise RuntimeError(f"Failed to write nginx config {self.nginx_conf}: {e}")
|
|
|
|
def _wait_for_compose_healthy(self, project: str, timeout: int = 60) -> bool:
|
|
"""Wait for Compose services to be healthy."""
|
|
import time
|
|
start = time.time()
|
|
|
|
while time.time() - start < timeout:
|
|
try:
|
|
# Check if nginx service is running (it depends on crawl4ai)
|
|
result = subprocess.run(
|
|
["docker", "compose", "-f", str(self.compose_file), "-p", project, "ps", "--format", "json"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5,
|
|
cwd=str(self.state_dir)
|
|
)
|
|
|
|
if result.returncode == 0 and result.stdout:
|
|
import json
|
|
services = [json.loads(line) for line in result.stdout.strip().split('\n') if line]
|
|
|
|
# Check if nginx is running (implies crawl4ai instances are up)
|
|
nginx_running = any(
|
|
s.get("Service") == "nginx" and s.get("State") == "running"
|
|
for s in services
|
|
)
|
|
|
|
if nginx_running:
|
|
return True
|
|
|
|
time.sleep(2)
|
|
except Exception:
|
|
time.sleep(2)
|
|
|
|
return False
|
|
|
|
def _stop_compose(self, project: str, remove_volumes: bool):
|
|
"""Stop Compose stack."""
|
|
cmd = ["docker", "compose", "-f", str(self.compose_file), "-p", project, "down"]
|
|
if remove_volumes:
|
|
cmd.append("-v")
|
|
|
|
subprocess.run(cmd, check=True, capture_output=True, cwd=str(self.state_dir))
|
|
|
|
def _scale_compose(self, project: str, replicas: int):
|
|
"""Scale Compose service."""
|
|
subprocess.run(
|
|
["docker", "compose", "-f", str(self.compose_file), "-p", project, "up", "-d", "--scale", f"crawl4ai={replicas}", "--no-recreate"],
|
|
check=True,
|
|
capture_output=True,
|
|
cwd=str(self.state_dir)
|
|
)
|
|
|
|
def _check_compose_running(self, project: str) -> bool:
|
|
"""Check if Compose stack is running."""
|
|
if not project or not self.compose_file.exists():
|
|
return False
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "compose", "-f", str(self.compose_file), "-p", project, "ps", "-q"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5,
|
|
cwd=str(self.state_dir)
|
|
)
|
|
# If there are any container IDs, the stack is running
|
|
return bool(result.stdout.strip())
|
|
except Exception:
|
|
return False
|
|
|
|
def _logs_compose(self, project: str, follow: bool, tail: int) -> str:
|
|
"""Get logs from Compose stack."""
|
|
cmd = ["docker", "compose", "-f", str(self.compose_file), "-p", project, "logs", "--tail", str(tail)]
|
|
if follow:
|
|
cmd.append("-f")
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(self.state_dir))
|
|
return result.stdout
|
|
|
|
# ========== State Management ==========
|
|
|
|
def _save_state(self, state: Dict):
|
|
"""Persist server state to disk with atomic write and file locking."""
|
|
import fcntl
|
|
|
|
self.state_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Atomic write with exclusive lock
|
|
temp_file = self.state_file.with_suffix('.tmp')
|
|
try:
|
|
with open(temp_file, 'w') as f:
|
|
fcntl.flock(f.fileno(), fcntl.LOCK_EX) # Exclusive lock
|
|
json.dump(state, f, indent=2)
|
|
f.flush()
|
|
os.fsync(f.fileno()) # Force write to disk
|
|
fcntl.flock(f.fileno(), fcntl.LOCK_UN) # Unlock
|
|
|
|
# Atomic rename
|
|
temp_file.replace(self.state_file)
|
|
except Exception as e:
|
|
# Cleanup temp file on error
|
|
temp_file.unlink(missing_ok=True)
|
|
raise RuntimeError(f"Failed to save state: {e}")
|
|
|
|
def _load_state(self) -> Optional[Dict]:
|
|
"""Load server state from disk with file locking."""
|
|
import fcntl
|
|
|
|
if not self.state_file.exists():
|
|
return None
|
|
|
|
try:
|
|
with open(self.state_file) as f:
|
|
fcntl.flock(f.fileno(), fcntl.LOCK_SH) # Shared lock (read)
|
|
state = json.load(f)
|
|
fcntl.flock(f.fileno(), fcntl.LOCK_UN) # Unlock
|
|
return state
|
|
except (json.JSONDecodeError, IOError) as e:
|
|
# Log and remove corrupted state file
|
|
import logging
|
|
logging.error(f"Corrupted state file, removing: {e}")
|
|
self.state_file.unlink(missing_ok=True)
|
|
return None
|
|
|
|
def _clear_state(self):
|
|
"""Remove state file with locking."""
|
|
import fcntl
|
|
|
|
if self.state_file.exists():
|
|
try:
|
|
# Acquire lock before deletion to prevent race
|
|
with open(self.state_file, 'r') as f:
|
|
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
|
|
# Lock acquired, now delete
|
|
self.state_file.unlink(missing_ok=True)
|
|
except Exception:
|
|
# If lock fails, force delete anyway
|
|
self.state_file.unlink(missing_ok=True)
|
|
|
|
# ========== Helpers ==========
|
|
|
|
def _wait_for_health(self, url: str, timeout: int = 30) -> bool:
|
|
"""Wait for health endpoint to respond."""
|
|
import urllib.request
|
|
|
|
start = time.time()
|
|
while time.time() - start < timeout:
|
|
try:
|
|
urllib.request.urlopen(url, timeout=2)
|
|
return True
|
|
except Exception:
|
|
time.sleep(1)
|
|
return False
|
|
|
|
def _calculate_uptime(self, started_at: str) -> str:
|
|
"""Calculate uptime from ISO timestamp."""
|
|
if not started_at:
|
|
return "unknown"
|
|
|
|
try:
|
|
start = datetime.fromisoformat(started_at)
|
|
delta = datetime.now() - start
|
|
|
|
hours = delta.seconds // 3600
|
|
minutes = (delta.seconds % 3600) // 60
|
|
|
|
if delta.days > 0:
|
|
return f"{delta.days}d {hours}h {minutes}m"
|
|
elif hours > 0:
|
|
return f"{hours}h {minutes}m"
|
|
else:
|
|
return f"{minutes}m"
|
|
except Exception:
|
|
return "unknown"
|