Files
crawl4ai/deploy/installer/cnode_pkg/server_manager.py
unclecode cd02616218 feat(cnode): add standalone CLI for Docker server management
- Reorganized server management code:
  - Moved server_cli.py -> deploy/docker/cnode_cli.py
  - Moved server_manager.py -> deploy/docker/server_manager.py

- Created fast Python-based installation (0.1s startup):
  - deploy/installer/cnode_pkg/ - Standalone package
  - deploy/installer/install-cnode.sh - Local installer
  - deploy/installer/deploy.sh - Remote installer for users

- Added backward compatibility:
  - crawl4ai/cli.py: 'crwl server' redirects to 'cnode'
  - Updated tests to match new CLI structure (12/12 passing)

- Automated sync workflow:
  - .githooks/pre-commit - Auto-syncs source to package
  - setup-hooks.sh - One-time setup for contributors
  - deploy/installer/sync-cnode.sh - Manual sync script

Performance:
  - Startup time: 0.1s (49x faster than PyInstaller)
  - Size: ~50KB wrapper vs 8.8MB binary

Commands:
  cnode start [--replicas N]  # Start server/cluster
  cnode status                # Check status
  cnode scale N               # Scale replicas
  cnode logs [-f]             # View logs
  cnode stop                  # Stop server
2025-10-21 09:31:18 +08:00

1155 lines
39 KiB
Python

"""
Crawl4AI Docker Server Manager
Orchestrates single-node Docker deployments with automatic scaling:
- Single container (N=1)
- Docker Swarm (N>1, if available)
- Docker Compose + Nginx (N>1, fallback)
"""
import json
import subprocess
import time
import re
import os
from pathlib import Path
from typing import Dict, Optional, Literal
from datetime import datetime
import socket
ServerMode = Literal["single", "swarm", "compose"]
# ========== Input Validation Functions ==========
def validate_docker_image(image: str) -> bool:
    """Validate a Docker image reference for safe use in subprocess commands.

    Accepts the usual [registry/][namespace/]repo[:tag][@digest] shape and
    rejects anything containing shell metacharacters.

    Args:
        image: Docker image string.

    Returns:
        True if the reference looks safe, False otherwise.
    """
    if not isinstance(image, str) or not image:
        return False
    # Cap the length to a sane maximum for an image reference.
    if len(image) > 256:
        return False
    # Only characters that appear in legitimate image references; anything
    # else (spaces, quotes, semicolons, ...) is rejected outright.
    if not re.match(r'^[a-zA-Z0-9.\-/:_@]+$', image):
        return False
    # Reject doubled separators that could be used for path tricks.
    return '..' not in image and '//' not in image
def validate_port(port: int) -> bool:
    """Validate that *port* is a usable TCP port number.

    Args:
        port: Port number.

    Returns:
        True if port is an int in 1-65535, False otherwise.

    Note:
        Fix: ``bool`` is a subclass of ``int``, so the previous check let
        ``True`` pass as port 1; booleans are now rejected explicitly.
    """
    return (
        isinstance(port, int)
        and not isinstance(port, bool)
        and 1 <= port <= 65535
    )
def validate_env_file(path: str) -> bool:
    """Check that *path* names an existing, readable regular file.

    Args:
        path: File path to validate.

    Returns:
        True when the file exists and is readable, False otherwise.
    """
    if not path or not isinstance(path, str):
        return False
    try:
        resolved = Path(path).resolve()
        # Must exist, be a regular file, and be readable by this process.
        return resolved.exists() and resolved.is_file() and os.access(resolved, os.R_OK)
    except Exception:
        return False
def validate_replicas(replicas: int) -> bool:
    """Validate that *replicas* is a reasonable replica count.

    Args:
        replicas: Number of replicas.

    Returns:
        True if replicas is an int in 1-100, False otherwise.

    Note:
        Fix: ``bool`` is a subclass of ``int``, so the previous check let
        ``True`` pass as 1 replica; booleans are now rejected explicitly.
    """
    return (
        isinstance(replicas, int)
        and not isinstance(replicas, bool)
        and 1 <= replicas <= 100
    )
class ServerManager:
    """Manages Crawl4AI Docker server lifecycle and orchestration."""

    def __init__(self):
        # All runtime artifacts (state file, generated configs) live under
        # ~/.crawl4ai/server so separate CLI invocations share one view.
        self.state_dir = Path.home() / ".crawl4ai" / "server"
        # JSON record of the currently running deployment (mode, port, ids).
        self.state_file = self.state_dir / "state.json"
        # docker-compose.yml generated for compose mode.
        self.compose_file = self.state_dir / "docker-compose.yml"
        # nginx.conf generated for the compose-mode load balancer.
        self.nginx_conf = self.state_dir / "nginx.conf"
        # Create the directory eagerly so later writes cannot fail on mkdir.
        self.state_dir.mkdir(parents=True, exist_ok=True)
# ========== Public API ==========
async def start(
    self,
    replicas: int = 1,
    mode: str = "auto",
    port: int = 11235,
    env_file: Optional[str] = None,
    image: str = "unclecode/crawl4ai:latest",
    **kwargs
) -> Dict:
    """Start Crawl4AI server with specified configuration.

    Args:
        replicas: Number of container replicas (default: 1)
        mode: Deployment mode - 'auto', 'single', 'swarm', or 'compose'
        port: External port to expose (default: 11235)
        env_file: Path to environment file
        image: Docker image to use
        **kwargs: Additional docker run arguments (forwarded to the
            mode-specific starter; single mode filters them against a
            whitelist)

    Returns:
        Dict with status and deployment info
    """
    # Refuse to start twice: an existing state file implies a live deployment.
    state = self._load_state()
    if state:
        return {
            "success": False,
            "message": "Server already running",
            "current_state": state
        }
    # Validate Docker is available before doing anything else.
    if not self._is_docker_available():
        return {
            "success": False,
            "error": "Docker daemon not running. Please start Docker first."
        }
    # Fail early if the requested host port is already taken.
    if not self._is_port_available(port):
        return {
            "success": False,
            "error": f"Port {port} is already in use"
        }
    # Resolve 'auto' into single / swarm / compose.
    detected_mode = self._detect_mode(replicas, mode)
    # Ensure the image exists locally (pulling it when registry-hosted).
    if not self._ensure_image(image):
        return {
            "success": False,
            "error": f"Failed to pull image {image}"
        }
    # Dispatch to the mode-specific starter.
    if detected_mode == "single":
        result = self._start_single(port, env_file, image, **kwargs)
    elif detected_mode == "swarm":
        result = self._start_swarm(replicas, port, env_file, image, **kwargs)
    elif detected_mode == "compose":
        result = self._start_compose(replicas, port, env_file, image, **kwargs)
    else:
        # Reached when an explicit mode string is not one of the known values.
        return {
            "success": False,
            "error": f"Unknown mode: {detected_mode}"
        }
    if result["success"]:
        # Persist what was started so status/stop/scale can find it later;
        # state_data from the starter adds mode-specific ids.
        self._save_state({
            "mode": detected_mode,
            "replicas": replicas,
            "port": port,
            "image": image,
            "env_file": env_file,
            "started_at": datetime.now().isoformat(),
            **result.get("state_data", {})
        })
    return result
async def status(self) -> Dict:
    """Report whether the managed server is up, plus its key parameters.

    Cross-checks the persisted state file against the actual Docker
    resources; stale state (containers removed externally) is cleared.

    Returns:
        Dict with a `running` flag and, when running, mode/replicas/
        port/image/uptime details.
    """
    state = self._load_state()
    if not state:
        return {
            "running": False,
            "message": "No server is currently running"
        }

    mode = state["mode"]
    # Verify the resources recorded in the state file actually exist.
    checkers = {
        "single": lambda: self._check_container_running(state.get("container_id")),
        "swarm": lambda: self._check_service_running(state.get("service_name")),
        "compose": lambda: self._check_compose_running(state.get("compose_project")),
    }
    alive = checkers.get(mode, lambda: False)()

    if not alive:
        # Stale state: containers were removed outside of this manager.
        self._clear_state()
        return {
            "running": False,
            "message": "State file exists but containers stopped externally"
        }

    return {
        "running": True,
        "mode": mode,
        "replicas": state.get("replicas", 1),
        "port": state.get("port", 11235),
        "image": state.get("image"),
        "started_at": state.get("started_at"),
        "uptime": self._calculate_uptime(state.get("started_at"))
    }
async def stop(self, remove_volumes: bool = False) -> Dict:
    """Stop the running server.

    Args:
        remove_volumes: Also remove associated volumes.

    Returns:
        Dict with stop status.
    """
    state = self._load_state()
    if not state:
        return {
            "success": False,
            "message": "No server is running"
        }

    mode = state["mode"]
    try:
        # Tear down via the mode-specific stopper, then forget the state.
        if mode == "single":
            self._stop_single(state.get("container_id"), remove_volumes)
        elif mode == "swarm":
            self._stop_swarm(state.get("service_name"))
        elif mode == "compose":
            self._stop_compose(state.get("compose_project"), remove_volumes)
        self._clear_state()
    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }
    return {
        "success": True,
        "message": f"Server stopped ({mode} mode)"
    }
async def cleanup(self, force: bool = False) -> Dict:
    """Force cleanup of all Crawl4AI Docker resources.

    Tries a graceful stop via the state file first; failing that (or when
    *force* is set) it sweeps matching containers, known compose projects
    and unused networks.

    Args:
        force: Skip the state-file shortcut and always do a full sweep.

    Returns:
        Dict with success flag, count of removed containers, and a
        semicolon-joined summary message.
    """
    import logging
    logger = logging.getLogger(__name__)
    removed_count = 0
    messages = []
    try:
        # Graceful path: stop whatever the state file describes.
        if not force:
            state = self._load_state()
            if state:
                stop_result = await self.stop(remove_volumes=True)
                if stop_result["success"]:
                    return {
                        "success": True,
                        "removed": 1,
                        "message": "Stopped via state file"
                    }

        logger.info("Force cleanup: removing all Crawl4AI Docker resources")

        # NOTE(review): the "nginx"/"redis" name filters match ANY container
        # whose name contains those strings, not just ones this tool created
        # — confirm this breadth is intended before running on shared hosts.
        for label in ("crawl4ai", "nginx", "redis"):
            count = self._remove_containers_by_name(label, logger)
            removed_count += count
            if count:
                messages.append(f"Removed {count} {label} containers")

        # Tear down known compose projects (including their volumes).
        for project in ["crawl4ai", "fix-docker"]:
            try:
                subprocess.run(
                    ["docker", "compose", "-p", project, "down", "-v"],
                    capture_output=True,
                    timeout=30,
                    cwd=str(self.state_dir)
                )
                messages.append(f"Cleaned compose project: {project}")
            except Exception:
                pass

        # Remove dangling networks left behind by compose/swarm.
        try:
            subprocess.run(["docker", "network", "prune", "-f"], capture_output=True, timeout=10)
            messages.append("Pruned networks")
        except Exception as e:
            logger.warning(f"Error pruning networks: {e}")

        # Finally forget any recorded deployment.
        self._clear_state()
        messages.append("Cleared state file")

        return {
            "success": True,
            "removed": removed_count,
            "message": "; ".join(messages)
        }
    except Exception as e:
        logger.error(f"Cleanup error: {e}")
        return {
            "success": False,
            "message": f"Cleanup failed: {str(e)}"
        }

def _remove_containers_by_name(self, name: str, logger) -> int:
    """Force-remove all containers whose name matches *name*.

    Replaces three near-identical inline blocks (crawl4ai/nginx/redis)
    from the previous implementation; also makes the summary message
    consistent (previously "Removed 0 crawl4ai containers" could appear).

    Returns:
        Number of containers removed (0 on any error).
    """
    try:
        result = subprocess.run(
            ["docker", "ps", "-a", "--filter", f"name={name}", "--format", "{{.ID}}"],
            capture_output=True,
            text=True,
            timeout=10
        )
        ids = [cid for cid in result.stdout.strip().split('\n') if cid]
        for cid in ids:
            subprocess.run(["docker", "rm", "-f", cid], capture_output=True, timeout=10)
        return len(ids)
    except Exception as e:
        logger.warning(f"Error removing {name} containers: {e}")
        return 0
async def scale(self, replicas: int) -> Dict:
    """Scale the running server to *replicas* instances.

    Args:
        replicas: Target number of replicas.

    Returns:
        Dict with scaling status.
    """
    state = self._load_state()
    if not state:
        return {
            "success": False,
            "message": "No server is running"
        }

    mode = state["mode"]
    if mode == "single":
        # A lone `docker run` container cannot be scaled in place.
        return {
            "success": False,
            "error": "Cannot scale single container mode. Use 'crwl server stop' then 'crwl server start --replicas N'"
        }

    try:
        if mode == "swarm":
            self._scale_swarm(state["service_name"], replicas)
        elif mode == "compose":
            self._scale_compose(state["compose_project"], replicas)
        # Record the new replica count so status reports it correctly.
        state["replicas"] = replicas
        self._save_state(state)
    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }
    return {
        "success": True,
        "message": f"Scaled to {replicas} replicas",
        "mode": mode
    }
async def logs(self, follow: bool = False, tail: int = 100) -> str:
    """Fetch server logs.

    Args:
        follow: Follow log output.
        tail: Number of trailing lines to show.

    Returns:
        Log output as a string, or a status/error message.
    """
    state = self._load_state()
    if not state:
        return "No server is running"
    mode = state["mode"]
    try:
        if mode == "single":
            return self._logs_single(state["container_id"], follow, tail)
        elif mode == "swarm":
            return self._logs_swarm(state["service_name"], follow, tail)
        elif mode == "compose":
            return self._logs_compose(state["compose_project"], follow, tail)
        # Fix: previously an unrecognized mode fell through and returned
        # None, violating the declared -> str return type.
        return f"Error getting logs: unknown mode {mode}"
    except Exception as e:
        return f"Error getting logs: {e}"
# ========== Mode Detection ==========
def _detect_mode(self, replicas: int, mode: str) -> ServerMode:
"""Detect deployment mode based on replicas and user preference."""
if mode != "auto":
return mode
if replicas == 1:
return "single"
# N>1: prefer Swarm if available, fallback to Compose
if self._is_swarm_available():
return "swarm"
return "compose"
def _is_swarm_available(self) -> bool:
    """Return True when this node is an active member of a Docker Swarm."""
    try:
        proc = subprocess.run(
            ["docker", "info", "--format", "{{.Swarm.LocalNodeState}}"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception:
        # docker missing, daemon down, or timeout: treat as unavailable.
        return False
    return proc.stdout.strip() == "active"
def _is_docker_available(self) -> bool:
    """Return True when the Docker daemon answers a trivial command."""
    try:
        # `docker ps` fails fast when the daemon is not reachable.
        subprocess.run(
            ["docker", "ps"],
            capture_output=True,
            timeout=5,
            check=True,
        )
    except Exception:
        return False
    return True
def _is_port_available(self, port: int) -> bool:
    """Return True when *port* can be bound on all interfaces."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(('0.0.0.0', port))
        return True
    except OSError:
        # Already in use (or insufficient privileges for low ports).
        return False
def _ensure_image(self, image: str) -> bool:
    """Ensure the Docker image is available locally, pulling it if needed.

    Returns:
        True when the image is available (already present or successfully
        pulled), False otherwise.
    """
    try:
        # Fast path: image already present locally.
        result = subprocess.run(
            ["docker", "image", "inspect", image],
            capture_output=True,
            timeout=5
        )
        if result.returncode == 0:
            return True

        # Decide whether the reference can be pulled from a registry.
        # Fix: the previous check rejected any name whose first component
        # contained a dot, so fully-qualified references such as
        # docker.io/library/nginx:latest were never pulled even though the
        # docstring listed them as valid examples. Any slash-qualified
        # name is now treated as pullable; bare names such as
        # crawl4ai-local:latest are still assumed to be local-only builds.
        if "/" not in image:
            return False  # Local-only image that doesn't exist

        # Try to pull from the registry (generous timeout for large images).
        subprocess.run(
            ["docker", "pull", image],
            capture_output=True,
            check=True,
            timeout=300
        )
        return True
    except Exception:
        return False
# ========== Single Container Mode ==========
def _start_single(self, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
    """Start a single container with `docker run`.

    Args:
        port: Host port to publish (container listens on 11235).
        env_file: Optional env file passed via --env-file.
        image: Docker image reference.
        **kwargs: Extra docker flags; only a small whitelist is honored.

    Returns:
        Dict with success flag and, on success, the container id in
        state_data.
    """
    # Validate inputs to prevent injection attacks.
    if not validate_port(port):
        return {
            "success": False,
            "error": f"Invalid port number: {port}. Must be between 1-65535."
        }
    if not validate_docker_image(image):
        return {
            "success": False,
            "error": f"Invalid Docker image format: {image}"
        }
    if env_file and not validate_env_file(env_file):
        return {
            "success": False,
            "error": f"Environment file not found or not readable: {env_file}"
        }

    cmd = [
        "docker", "run",
        "-d",  # Detached
        "--name", "crawl4ai_server",
        "-p", f"{port}:11235",
        "--shm-size=1g",  # Important for browser
    ]
    if env_file:
        # Use absolute path to prevent path traversal.
        cmd.extend(["--env-file", str(Path(env_file).resolve())])

    # Whitelist allowed Docker flags to prevent privilege escalation.
    # Fix: kwargs arrive as Python identifiers (e.g. memory=..., since
    # "--memory" cannot be a keyword name), so keys are normalized to flag
    # form before the whitelist check; previously no keyword argument
    # could ever match and every flag was silently dropped.
    allowed_flags = {"--memory", "--cpus", "--restart", "--network"}
    import logging
    for key, value in kwargs.items():
        flag = key if key.startswith("--") else "--" + key.replace("_", "-")
        if flag in allowed_flags:
            cmd.append(flag)
            if value is not True:  # Boolean flags take no argument
                cmd.append(str(value))
        else:
            # Log ignored flags for debugging.
            logging.warning(f"Ignoring non-whitelisted Docker flag: {key}")
    cmd.append(image)

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        container_id = result.stdout.strip()
        # Only report success once the HTTP health endpoint answers.
        if self._wait_for_health(f"http://localhost:{port}/health"):
            return {
                "success": True,
                "message": f"Server started on port {port}",
                "state_data": {"container_id": container_id}
            }
        # Health check failed: remove the half-started container.
        subprocess.run(["docker", "rm", "-f", container_id], capture_output=True)
        return {
            "success": False,
            "error": "Container started but health check failed"
        }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": f"Failed to start container: {e.stderr}"
        }
def _stop_single(self, container_id: str, remove_volumes: bool):
    """Force-remove the single container, optionally with its volumes."""
    volume_flag = ["-v"] if remove_volumes else []
    subprocess.run(["docker", "rm", "-f", *volume_flag, container_id], check=True)
def _check_container_running(self, container_id: str) -> bool:
    """Return True when Docker reports the container as running."""
    if not container_id:
        return False
    try:
        proc = subprocess.run(
            ["docker", "inspect", "-f", "{{.State.Running}}", container_id],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception:
        # Missing container / daemon down / timeout all mean "not running".
        return False
    return proc.stdout.strip() == "true"
def _logs_single(self, container_id: str, follow: bool, tail: int) -> str:
    """Get logs from the single container.

    Fix: `docker logs -f` never exits, so combining it with
    capture_output blocked forever. In follow mode the output is now
    streamed straight to the terminal and "" is returned once the user
    interrupts or the container stops.

    Returns:
        The captured log text, or "" when following.
    """
    cmd = ["docker", "logs", "--tail", str(tail)]
    if follow:
        cmd.append("-f")
        cmd.append(container_id)
        # Inherit stdout/stderr so the stream is visible live.
        subprocess.run(cmd)
        return ""
    cmd.append(container_id)
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout
# ========== Swarm Mode ==========
def _start_swarm(self, replicas: int, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
    """Start service in Swarm mode.

    Args:
        replicas: Desired replica count (validated to 1-100).
        port: Host port to publish (validated to 1-65535).
        env_file: Optional environment file passed via --env-file.
        image: Docker image reference.
        **kwargs: Accepted for signature parity with the other starters;
            not forwarded to docker in swarm mode.

    Returns:
        Dict with success flag; on success, state_data carries the
        service name and id.
    """
    # Validate inputs to prevent injection attacks
    if not validate_replicas(replicas):
        return {
            "success": False,
            "error": f"Invalid replica count: {replicas}. Must be between 1-100."
        }
    if not validate_port(port):
        return {
            "success": False,
            "error": f"Invalid port number: {port}. Must be between 1-65535."
        }
    if not validate_docker_image(image):
        return {
            "success": False,
            "error": f"Invalid Docker image format: {image}"
        }
    if env_file and not validate_env_file(env_file):
        return {
            "success": False,
            "error": f"Environment file not found or not readable: {env_file}"
        }
    service_name = "crawl4ai"  # Static name (safe)
    # Initialize swarm if needed
    if not self._is_swarm_available():
        init_result = self._init_swarm()
        if not init_result:
            return {
                "success": False,
                "error": "Failed to initialize Docker Swarm. Use 'docker swarm init' manually."
            }
    cmd = [
        "docker", "service", "create",
        "--name", service_name,
        "--replicas", str(replicas),
        "--publish", f"{port}:11235",
        # tmpfs /dev/shm plays the role of --shm-size=1g from single mode
        # (large shared memory for the browser).
        "--mount", "type=tmpfs,target=/dev/shm,tmpfs-size=1g",
        "--limit-memory", "4G",
    ]
    if env_file:
        # Use absolute path to prevent path traversal
        abs_env_file = str(Path(env_file).resolve())
        cmd.extend(["--env-file", abs_env_file])
    cmd.append(image)
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        service_id = result.stdout.strip()
        # Wait for service to be ready (check replicas)
        if self._wait_for_service(service_name, replicas):
            return {
                "success": True,
                "message": f"Swarm service started with {replicas} replicas",
                "state_data": {
                    "service_name": service_name,
                    "service_id": service_id
                }
            }
        else:
            # Cleanup failed service
            subprocess.run(["docker", "service", "rm", service_name], capture_output=True)
            return {
                "success": False,
                "error": "Service created but replicas failed to start"
            }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": f"Failed to create Swarm service: {e.stderr}"
        }
def _init_swarm(self) -> bool:
    """Run `docker swarm init`; True on success, False on any failure."""
    try:
        proc = subprocess.run(
            ["docker", "swarm", "init"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        return False
    return proc.returncode == 0
def _wait_for_service(self, service_name: str, expected_replicas: int, timeout: int = 60) -> bool:
    """Poll `docker service ls` until all replicas report running.

    (The redundant local `import time`, which shadowed the module-level
    import, has been removed.)

    Args:
        service_name: Swarm service to watch.
        expected_replicas: Desired replica count.
        timeout: Maximum seconds to wait.

    Returns:
        True once running and desired both equal expected_replicas,
        False on timeout.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            result = subprocess.run(
                ["docker", "service", "ls", "--filter", f"name={service_name}", "--format", "{{.Replicas}}"],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Format is "running/desired", e.g. "2/3".
                replicas_str = result.stdout.strip()
                if "/" in replicas_str:
                    running, desired = replicas_str.split("/")
                    if int(running) == expected_replicas and int(desired) == expected_replicas:
                        return True
            time.sleep(2)
        except Exception:
            # Parse/CLI hiccups: just retry until the deadline.
            time.sleep(2)
    return False
def _stop_swarm(self, service_name: str):
    """Remove the Swarm service; raises CalledProcessError on failure."""
    cmd = ["docker", "service", "rm", service_name]
    subprocess.run(cmd, check=True, capture_output=True)
def _scale_swarm(self, service_name: str, replicas: int):
    """Scale the Swarm service; raises CalledProcessError on failure."""
    cmd = ["docker", "service", "scale", f"{service_name}={replicas}"]
    subprocess.run(cmd, check=True, capture_output=True)
def _check_service_running(self, service_name: str) -> bool:
    """Return True when a Swarm service with exactly this name exists.

    Fix: the previous substring test (`service_name in result.stdout`)
    could false-positive on services whose names merely contain the
    target (e.g. "crawl4ai-other" when checking "crawl4ai"); names are
    now compared exactly, token by token.
    """
    if not service_name:
        return False
    try:
        result = subprocess.run(
            ["docker", "service", "ls", "--filter", f"name={service_name}", "--format", "{{.Name}}"],
            capture_output=True,
            text=True,
            timeout=5
        )
        # One service name per line; require an exact match.
        return service_name in result.stdout.split()
    except Exception:
        return False
def _logs_swarm(self, service_name: str, follow: bool, tail: int) -> str:
    """Get logs from the Swarm service.

    Fix: `docker service logs -f` never exits, so combining it with
    capture_output blocked forever. In follow mode the output is now
    streamed straight to the terminal and "" is returned.

    Returns:
        The captured log text, or "" when following.
    """
    cmd = ["docker", "service", "logs", "--tail", str(tail)]
    if follow:
        cmd.append("-f")
        cmd.append(service_name)
        # Inherit stdout/stderr so the stream is visible live.
        subprocess.run(cmd)
        return ""
    cmd.append(service_name)
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout
# ========== Compose Mode ==========
def _start_compose(self, replicas: int, port: int, env_file: Optional[str], image: str, **kwargs) -> Dict:
    """Start with Docker Compose + Nginx.

    Generates docker-compose.yml and nginx.conf in the state directory,
    then brings the stack up scaled to *replicas*.

    Args:
        replicas: Desired replica count (validated to 1-100).
        port: Host port to publish (validated to 1-65535).
        env_file: Optional environment file (substituted into the config).
        image: Docker image reference.
        **kwargs: Accepted for signature parity; not used in compose mode.

    Returns:
        Dict with success flag; on success, state_data carries the
        compose project name.
    """
    # Validate inputs to prevent injection attacks
    if not validate_replicas(replicas):
        return {
            "success": False,
            "error": f"Invalid replica count: {replicas}. Must be between 1-100."
        }
    if not validate_port(port):
        return {
            "success": False,
            "error": f"Invalid port number: {port}. Must be between 1-65535."
        }
    if not validate_docker_image(image):
        return {
            "success": False,
            "error": f"Invalid Docker image format: {image}"
        }
    if env_file and not validate_env_file(env_file):
        return {
            "success": False,
            "error": f"Environment file not found or not readable: {env_file}"
        }
    project_name = "crawl4ai"  # Static name (safe)
    # Generate compose and nginx config files
    try:
        self._generate_compose_file(replicas, port, env_file or "", image)
        self._generate_nginx_config()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate config files: {e}"
        }
    # Start compose stack - use absolute path for compose file
    cmd = [
        "docker", "compose",
        "-f", str(self.compose_file.resolve()),
        "-p", project_name,
        "up", "-d",
        "--scale", f"crawl4ai={replicas}"
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True, cwd=str(self.state_dir))
        # Wait for services to be healthy
        if self._wait_for_compose_healthy(project_name, timeout=60):
            return {
                "success": True,
                "message": f"Compose stack started with {replicas} replicas",
                "state_data": {
                    "compose_project": project_name
                }
            }
        else:
            # Cleanup failed deployment
            subprocess.run(
                ["docker", "compose", "-f", str(self.compose_file), "-p", project_name, "down"],
                capture_output=True,
                cwd=str(self.state_dir)
            )
            return {
                "success": False,
                "error": "Compose stack started but health checks failed"
            }
    except subprocess.CalledProcessError as e:
        return {
            "success": False,
            "error": f"Failed to start Compose stack: {e.stderr}"
        }
def _generate_compose_file(self, replicas: int, port: int, env_file: str, image: str):
    """Generate docker-compose.yml from the packaged template.

    Substitutes ${IMAGE}/${REPLICAS}/${PORT}/${NGINX_CONF} and writes the
    result to self.compose_file. (The previous unused local `import os`
    has been removed; `os` is already imported at module level anyway.)

    Raises:
        FileNotFoundError: template missing (broken installation).
        ValueError: template lacks a required placeholder.
        RuntimeError: template unreadable or output unwritable.
    """
    # Template ships inside the package next to this module.
    template_path = Path(__file__).parent / "templates" / "docker-compose.template.yml"
    if not template_path.exists():
        raise FileNotFoundError(
            f"Docker Compose template not found: {template_path}\n"
            f"Please ensure crawl4ai package is correctly installed.\n"
            f"Try: pip install --force-reinstall crawl4ai"
        )
    try:
        with open(template_path) as f:
            template = f.read()
    except IOError as e:
        raise RuntimeError(f"Failed to read template {template_path}: {e}")

    # Fail fast if the template was edited and lost a placeholder.
    required_vars = {"${IMAGE}", "${REPLICAS}", "${PORT}", "${NGINX_CONF}"}
    missing = required_vars - set(re.findall(r'\$\{[A-Z_]+\}', template))
    if missing:
        raise ValueError(f"Template missing required variables: {missing}")

    # Substitute variables.
    content = template.replace("${IMAGE}", image)
    content = content.replace("${REPLICAS}", str(replicas))
    content = content.replace("${PORT}", str(port))
    content = content.replace("${NGINX_CONF}", str(self.nginx_conf))

    # Warn (but continue) about placeholders we do not know how to fill.
    remaining = re.findall(r'\$\{[A-Z_]+\}', content)
    if remaining:
        import logging
        logging.warning(f"Unsubstituted variables in template: {remaining}")

    try:
        with open(self.compose_file, "w") as f:
            f.write(content)
    except IOError as e:
        raise RuntimeError(f"Failed to write compose file {self.compose_file}: {e}")
def _generate_nginx_config(self):
    """Copy the packaged nginx.conf template to the state directory.

    Raises:
        FileNotFoundError: template missing (broken installation).
        RuntimeError: template unreadable or output unwritable.
    """
    template_path = Path(__file__).parent / "templates" / "nginx.conf.template"
    if not template_path.exists():
        raise FileNotFoundError(
            f"Nginx template not found: {template_path}\n"
            f"Please ensure crawl4ai package is correctly installed.\n"
            f"Try: pip install --force-reinstall crawl4ai"
        )
    try:
        with open(template_path) as src:
            content = src.read()
    except IOError as e:
        raise RuntimeError(f"Failed to read nginx template {template_path}: {e}")
    # The nginx template currently needs no variable substitution.
    try:
        with open(self.nginx_conf, "w") as dst:
            dst.write(content)
    except IOError as e:
        raise RuntimeError(f"Failed to write nginx config {self.nginx_conf}: {e}")
def _wait_for_compose_healthy(self, project: str, timeout: int = 60) -> bool:
    """Poll compose `ps` until the nginx service reports running.

    The generated stack has nginx depending on the crawl4ai service, so
    a running nginx implies the backend replicas came up. (Redundant
    local `import time` / `import json`, which shadowed the module-level
    imports, have been removed.)

    Returns:
        True when healthy within *timeout* seconds, False otherwise.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            result = subprocess.run(
                ["docker", "compose", "-f", str(self.compose_file), "-p", project, "ps", "--format", "json"],
                capture_output=True,
                text=True,
                timeout=5,
                cwd=str(self.state_dir)
            )
            if result.returncode == 0 and result.stdout:
                # `ps --format json` emits one JSON object per line.
                services = [json.loads(line) for line in result.stdout.strip().split('\n') if line]
                nginx_running = any(
                    s.get("Service") == "nginx" and s.get("State") == "running"
                    for s in services
                )
                if nginx_running:
                    return True
            time.sleep(2)
        except Exception:
            # Parse/CLI hiccups: just retry until the deadline.
            time.sleep(2)
    return False
def _stop_compose(self, project: str, remove_volumes: bool):
    """Tear down the Compose stack, optionally removing its volumes."""
    cmd = ["docker", "compose", "-f", str(self.compose_file), "-p", project, "down"]
    if remove_volumes:
        cmd += ["-v"]
    subprocess.run(cmd, check=True, capture_output=True, cwd=str(self.state_dir))
def _scale_compose(self, project: str, replicas: int):
    """Scale the crawl4ai compose service without recreating containers."""
    cmd = [
        "docker", "compose",
        "-f", str(self.compose_file),
        "-p", project,
        "up", "-d",
        "--scale", f"crawl4ai={replicas}",
        "--no-recreate",
    ]
    subprocess.run(cmd, check=True, capture_output=True, cwd=str(self.state_dir))
def _check_compose_running(self, project: str) -> bool:
    """Return True when the compose project has at least one container."""
    if not project or not self.compose_file.exists():
        return False
    try:
        result = subprocess.run(
            ["docker", "compose", "-f", str(self.compose_file), "-p", project, "ps", "-q"],
            capture_output=True,
            text=True,
            timeout=5,
            cwd=str(self.state_dir),
        )
    except Exception:
        return False
    # Any container id in the output means the stack is up.
    return bool(result.stdout.strip())
def _logs_compose(self, project: str, follow: bool, tail: int) -> str:
    """Get logs from the Compose stack.

    Fix: `docker compose logs -f` never exits, so combining it with
    capture_output blocked forever. In follow mode the output is now
    streamed straight to the terminal and "" is returned.

    Returns:
        The captured log text, or "" when following.
    """
    cmd = ["docker", "compose", "-f", str(self.compose_file), "-p", project, "logs", "--tail", str(tail)]
    if follow:
        cmd.append("-f")
        # Inherit stdout/stderr so the stream is visible live.
        subprocess.run(cmd, cwd=str(self.state_dir))
        return ""
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(self.state_dir))
    return result.stdout
# ========== State Management ==========
def _save_state(self, state: Dict):
    """Persist server state to disk with atomic write and file locking.

    Writes to a .tmp sibling first and then renames it over the real
    file so readers never observe a partial write.

    NOTE(review): fcntl is POSIX-only, so this path will not work on
    Windows — confirm that is an accepted platform constraint.
    """
    import fcntl
    self.state_dir.mkdir(parents=True, exist_ok=True)
    # Atomic write with exclusive lock
    temp_file = self.state_file.with_suffix('.tmp')
    try:
        with open(temp_file, 'w') as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # Exclusive lock
            json.dump(state, f, indent=2)
            f.flush()
            os.fsync(f.fileno())  # Force write to disk before the rename
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)  # Unlock
        # Atomic rename makes the new state visible in one step
        temp_file.replace(self.state_file)
    except Exception as e:
        # Cleanup temp file on error
        temp_file.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to save state: {e}")
def _load_state(self) -> Optional[Dict]:
    """Load server state from disk with file locking.

    Returns:
        The state dict, or None when no state file exists or it is
        corrupted (a corrupted file is deleted so the next start is
        clean).
    """
    import fcntl
    if not self.state_file.exists():
        return None
    try:
        with open(self.state_file) as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_SH)  # Shared lock (read)
            state = json.load(f)
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)  # Unlock
            return state
    except (json.JSONDecodeError, IOError) as e:
        # Log and remove corrupted state file so it cannot wedge the CLI
        import logging
        logging.error(f"Corrupted state file, removing: {e}")
        self.state_file.unlink(missing_ok=True)
        return None
def _clear_state(self):
    """Remove the state file, taking the lock first to avoid races.

    Falls back to a plain delete if the lock cannot be acquired (e.g.
    the file vanished between the exists() check and open()).
    """
    import fcntl
    if self.state_file.exists():
        try:
            # Acquire lock before deletion to prevent racing a concurrent
            # _save_state/_load_state on the same file
            with open(self.state_file, 'r') as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                # Lock acquired, now delete
                self.state_file.unlink(missing_ok=True)
        except Exception:
            # If lock fails, force delete anyway
            self.state_file.unlink(missing_ok=True)
# ========== Helpers ==========
def _wait_for_health(self, url: str, timeout: int = 30) -> bool:
"""Wait for health endpoint to respond."""
import urllib.request
start = time.time()
while time.time() - start < timeout:
try:
urllib.request.urlopen(url, timeout=2)
return True
except Exception:
time.sleep(1)
return False
def _calculate_uptime(self, started_at: str) -> str:
"""Calculate uptime from ISO timestamp."""
if not started_at:
return "unknown"
try:
start = datetime.fromisoformat(started_at)
delta = datetime.now() - start
hours = delta.seconds // 3600
minutes = (delta.seconds % 3600) // 60
if delta.days > 0:
return f"{delta.days}d {hours}h {minutes}m"
elif hours > 0:
return f"{hours}h {minutes}m"
else:
return f"{minutes}m"
except Exception:
return "unknown"