feat(monitor): implement code review fixes and real-time WebSocket monitoring
Backend Improvements (11 fixes applied): Critical Fixes: - Add lock protection for browser pool access in monitor stats - Ensure async track_janitor_event across all call sites - Improve error handling in monitor request tracking (already in place) Important Fixes: - Replace fire-and-forget Redis with background persistence worker - Add time-based expiry for completed requests/errors (5min cleanup) - Implement input validation for monitor route parameters - Add 4s timeout to timeline updater to prevent hangs - Add warning when killing browsers with active requests - Implement monitor cleanup on shutdown with final persistence - Document memory estimates with TODO for actual tracking Frontend Enhancements: WebSocket Real-time Updates: - Add WebSocket endpoint at /monitor/ws for live monitoring - Implement auto-reconnect with exponential backoff (max 5 attempts) - Add graceful fallback to HTTP polling on WebSocket failure - Send comprehensive updates every 2 seconds (health, requests, browsers, timeline, events) UI/UX Improvements: - Add live connection status indicator with pulsing animation - Green "Live" = WebSocket connected - Yellow "Connecting..." = Attempting connection - Blue "Polling" = Fallback to HTTP polling - Red "Disconnected" = Connection failed - Restore original beautiful styling for all sections - Improve request table layout with flex-grow for URL column - Add browser type text labels alongside emojis - Add flex layout to browser section header Testing: - Add test-websocket.py for WebSocket validation - All 7 integration tests passing successfully Summary: 563 additions across 6 files
This commit is contained in:
@@ -61,7 +61,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
# Track promotion in monitor
|
||||
try:
|
||||
from monitor import get_monitor
|
||||
get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]})
|
||||
await get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]})
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -143,7 +143,7 @@ async def janitor():
|
||||
# Track in monitor
|
||||
try:
|
||||
from monitor import get_monitor
|
||||
get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl})
|
||||
await get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl})
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -161,7 +161,7 @@ async def janitor():
|
||||
# Track in monitor
|
||||
try:
|
||||
from monitor import get_monitor
|
||||
get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl})
|
||||
await get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl})
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
@@ -28,6 +28,10 @@ class MonitorStats:
|
||||
# Endpoint stats (persisted in Redis)
|
||||
self.endpoint_stats: Dict[str, Dict] = {} # endpoint -> {count, total_time, errors, ...}
|
||||
|
||||
# Background persistence queue (max 10 pending persist requests)
|
||||
self._persist_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
|
||||
self._persist_worker_task: Optional[asyncio.Task] = None
|
||||
|
||||
# Timeline data (5min window, 5s resolution = 60 points)
|
||||
self.memory_timeline: deque = deque(maxlen=60)
|
||||
self.requests_timeline: deque = deque(maxlen=60)
|
||||
@@ -53,8 +57,11 @@ class MonitorStats:
|
||||
}
|
||||
self.endpoint_stats[endpoint]["count"] += 1
|
||||
|
||||
# Persist to Redis (fire and forget)
|
||||
asyncio.create_task(self._persist_endpoint_stats())
|
||||
# Queue persistence (handled by background worker)
|
||||
try:
|
||||
self._persist_queue.put_nowait(True)
|
||||
except asyncio.QueueFull:
|
||||
logger.warning("Persistence queue full, skipping")
|
||||
|
||||
async def track_request_end(self, request_id: str, success: bool, error: str = None,
|
||||
pool_hit: bool = True, status_code: int = 200):
|
||||
@@ -104,7 +111,7 @@ class MonitorStats:
|
||||
|
||||
await self._persist_endpoint_stats()
|
||||
|
||||
def track_janitor_event(self, event_type: str, sig: str, details: Dict):
|
||||
async def track_janitor_event(self, event_type: str, sig: str, details: Dict):
|
||||
"""Track janitor cleanup events."""
|
||||
self.janitor_events.append({
|
||||
"timestamp": time.time(),
|
||||
@@ -113,22 +120,43 @@ class MonitorStats:
|
||||
"details": details
|
||||
})
|
||||
|
||||
def _cleanup_old_entries(self, max_age_seconds: int = 300):
|
||||
"""Remove entries older than max_age_seconds (default 5min)."""
|
||||
now = time.time()
|
||||
cutoff = now - max_age_seconds
|
||||
|
||||
# Clean completed requests
|
||||
while self.completed_requests and self.completed_requests[0].get("end_time", 0) < cutoff:
|
||||
self.completed_requests.popleft()
|
||||
|
||||
# Clean janitor events
|
||||
while self.janitor_events and self.janitor_events[0].get("timestamp", 0) < cutoff:
|
||||
self.janitor_events.popleft()
|
||||
|
||||
# Clean errors
|
||||
while self.errors and self.errors[0].get("timestamp", 0) < cutoff:
|
||||
self.errors.popleft()
|
||||
|
||||
async def update_timeline(self):
|
||||
"""Update timeline data points (called every 5s)."""
|
||||
now = time.time()
|
||||
mem_pct = get_container_memory_percent()
|
||||
|
||||
# Clean old entries (keep last 5 minutes)
|
||||
self._cleanup_old_entries(max_age_seconds=300)
|
||||
|
||||
# Count requests in last 5s
|
||||
recent_reqs = sum(1 for req in self.completed_requests
|
||||
if now - req.get("end_time", 0) < 5)
|
||||
|
||||
# Browser counts (need to import from crawler_pool)
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL
|
||||
browser_count = {
|
||||
"permanent": 1 if PERMANENT else 0,
|
||||
"hot": len(HOT_POOL),
|
||||
"cold": len(COLD_POOL)
|
||||
}
|
||||
# Browser counts (acquire lock to prevent race conditions)
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
|
||||
async with LOCK:
|
||||
browser_count = {
|
||||
"permanent": 1 if PERMANENT else 0,
|
||||
"hot": len(HOT_POOL),
|
||||
"cold": len(COLD_POOL)
|
||||
}
|
||||
|
||||
self.memory_timeline.append({"time": now, "value": mem_pct})
|
||||
self.requests_timeline.append({"time": now, "value": recent_reqs})
|
||||
@@ -145,6 +173,47 @@ class MonitorStats:
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to persist endpoint stats: {e}")
|
||||
|
||||
async def _persistence_worker(self):
|
||||
"""Background worker to persist stats to Redis."""
|
||||
while True:
|
||||
try:
|
||||
await self._persist_queue.get()
|
||||
await self._persist_endpoint_stats()
|
||||
self._persist_queue.task_done()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Persistence worker error: {e}")
|
||||
|
||||
def start_persistence_worker(self):
|
||||
"""Start the background persistence worker."""
|
||||
if not self._persist_worker_task:
|
||||
self._persist_worker_task = asyncio.create_task(self._persistence_worker())
|
||||
logger.info("Started persistence worker")
|
||||
|
||||
async def stop_persistence_worker(self):
|
||||
"""Stop the background persistence worker."""
|
||||
if self._persist_worker_task:
|
||||
self._persist_worker_task.cancel()
|
||||
try:
|
||||
await self._persist_worker_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._persist_worker_task = None
|
||||
logger.info("Stopped persistence worker")
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup on shutdown - persist final stats and stop workers."""
|
||||
logger.info("Monitor cleanup starting...")
|
||||
try:
|
||||
# Persist final stats before shutdown
|
||||
await self._persist_endpoint_stats()
|
||||
# Stop background worker
|
||||
await self.stop_persistence_worker()
|
||||
logger.info("Monitor cleanup completed")
|
||||
except Exception as e:
|
||||
logger.error(f"Monitor cleanup error: {e}")
|
||||
|
||||
async def load_from_redis(self):
|
||||
"""Load persisted stats from Redis."""
|
||||
try:
|
||||
@@ -155,7 +224,7 @@ class MonitorStats:
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load from Redis: {e}")
|
||||
|
||||
def get_health_summary(self) -> Dict:
|
||||
async def get_health_summary(self) -> Dict:
|
||||
"""Get current system health snapshot."""
|
||||
mem_pct = get_container_memory_percent()
|
||||
cpu_pct = psutil.cpu_percent(interval=0.1)
|
||||
@@ -163,11 +232,17 @@ class MonitorStats:
|
||||
# Network I/O (delta since last call)
|
||||
net = psutil.net_io_counters()
|
||||
|
||||
# Pool status
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED
|
||||
permanent_mem = 270 if PERMANENT else 0 # Estimate
|
||||
hot_mem = len(HOT_POOL) * 180 # Estimate 180MB per browser
|
||||
cold_mem = len(COLD_POOL) * 180
|
||||
# Pool status (acquire lock to prevent race conditions)
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
|
||||
async with LOCK:
|
||||
# TODO: Track actual browser process memory instead of estimates
|
||||
# These are conservative estimates based on typical Chromium usage
|
||||
permanent_mem = 270 if PERMANENT else 0 # Estimate: ~270MB for permanent browser
|
||||
hot_mem = len(HOT_POOL) * 180 # Estimate: ~180MB per hot pool browser
|
||||
cold_mem = len(COLD_POOL) * 180 # Estimate: ~180MB per cold pool browser
|
||||
permanent_active = PERMANENT is not None
|
||||
hot_count = len(HOT_POOL)
|
||||
cold_count = len(COLD_POOL)
|
||||
|
||||
return {
|
||||
"container": {
|
||||
@@ -178,9 +253,9 @@ class MonitorStats:
|
||||
"uptime_seconds": int(time.time() - self.start_time)
|
||||
},
|
||||
"pool": {
|
||||
"permanent": {"active": PERMANENT is not None, "memory_mb": permanent_mem},
|
||||
"hot": {"count": len(HOT_POOL), "memory_mb": hot_mem},
|
||||
"cold": {"count": len(COLD_POOL), "memory_mb": cold_mem},
|
||||
"permanent": {"active": permanent_active, "memory_mb": permanent_mem},
|
||||
"hot": {"count": hot_count, "memory_mb": hot_mem},
|
||||
"cold": {"count": cold_count, "memory_mb": cold_mem},
|
||||
"total_memory_mb": permanent_mem + hot_mem + cold_mem
|
||||
},
|
||||
"janitor": {
|
||||
@@ -210,45 +285,47 @@ class MonitorStats:
|
||||
requests = [r for r in requests if not r.get("success")]
|
||||
return requests
|
||||
|
||||
def get_browser_list(self) -> List[Dict]:
|
||||
async def get_browser_list(self) -> List[Dict]:
|
||||
"""Get detailed browser pool information."""
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG
|
||||
from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG, LOCK
|
||||
|
||||
browsers = []
|
||||
now = time.time()
|
||||
|
||||
if PERMANENT:
|
||||
browsers.append({
|
||||
"type": "permanent",
|
||||
"sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
|
||||
"age_seconds": int(now - self.start_time),
|
||||
"last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
|
||||
"memory_mb": 270,
|
||||
"hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
|
||||
"killable": False
|
||||
})
|
||||
# Acquire lock to prevent race conditions during iteration
|
||||
async with LOCK:
|
||||
if PERMANENT:
|
||||
browsers.append({
|
||||
"type": "permanent",
|
||||
"sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
|
||||
"age_seconds": int(now - self.start_time),
|
||||
"last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
|
||||
"memory_mb": 270,
|
||||
"hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
|
||||
"killable": False
|
||||
})
|
||||
|
||||
for sig, crawler in HOT_POOL.items():
|
||||
browsers.append({
|
||||
"type": "hot",
|
||||
"sig": sig[:8],
|
||||
"age_seconds": int(now - self.start_time), # Approximation
|
||||
"last_used_seconds": int(now - LAST_USED.get(sig, now)),
|
||||
"memory_mb": 180, # Estimate
|
||||
"hits": USAGE_COUNT.get(sig, 0),
|
||||
"killable": True
|
||||
})
|
||||
for sig, crawler in HOT_POOL.items():
|
||||
browsers.append({
|
||||
"type": "hot",
|
||||
"sig": sig[:8],
|
||||
"age_seconds": int(now - self.start_time), # Approximation
|
||||
"last_used_seconds": int(now - LAST_USED.get(sig, now)),
|
||||
"memory_mb": 180, # Estimate
|
||||
"hits": USAGE_COUNT.get(sig, 0),
|
||||
"killable": True
|
||||
})
|
||||
|
||||
for sig, crawler in COLD_POOL.items():
|
||||
browsers.append({
|
||||
"type": "cold",
|
||||
"sig": sig[:8],
|
||||
"age_seconds": int(now - self.start_time),
|
||||
"last_used_seconds": int(now - LAST_USED.get(sig, now)),
|
||||
"memory_mb": 180,
|
||||
"hits": USAGE_COUNT.get(sig, 0),
|
||||
"killable": True
|
||||
})
|
||||
for sig, crawler in COLD_POOL.items():
|
||||
browsers.append({
|
||||
"type": "cold",
|
||||
"sig": sig[:8],
|
||||
"age_seconds": int(now - self.start_time),
|
||||
"last_used_seconds": int(now - LAST_USED.get(sig, now)),
|
||||
"memory_mb": 180,
|
||||
"hits": USAGE_COUNT.get(sig, 0),
|
||||
"killable": True
|
||||
})
|
||||
|
||||
return browsers
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
# monitor_routes.py - Monitor API endpoints
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from monitor import get_monitor
|
||||
import logging
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/monitor", tags=["monitor"])
|
||||
@@ -14,7 +16,7 @@ async def get_health():
|
||||
"""Get current system health snapshot."""
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
return monitor.get_health_summary()
|
||||
return await monitor.get_health_summary()
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting health: {e}")
|
||||
raise HTTPException(500, str(e))
|
||||
@@ -28,6 +30,12 @@ async def get_requests(status: str = "all", limit: int = 50):
|
||||
status: Filter by 'active', 'completed', 'success', 'error', or 'all'
|
||||
limit: Max number of completed requests to return (default 50)
|
||||
"""
|
||||
# Input validation
|
||||
if status not in ["all", "active", "completed", "success", "error"]:
|
||||
raise HTTPException(400, f"Invalid status: {status}. Must be one of: all, active, completed, success, error")
|
||||
if limit < 1 or limit > 1000:
|
||||
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
|
||||
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
|
||||
@@ -52,7 +60,7 @@ async def get_browsers():
|
||||
"""Get detailed browser pool information."""
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
browsers = monitor.get_browser_list()
|
||||
browsers = await monitor.get_browser_list()
|
||||
|
||||
# Calculate summary stats
|
||||
total_browsers = len(browsers)
|
||||
@@ -95,6 +103,12 @@ async def get_timeline(metric: str = "memory", window: str = "5m"):
|
||||
metric: 'memory', 'requests', or 'browsers'
|
||||
window: Time window (only '5m' supported for now)
|
||||
"""
|
||||
# Input validation
|
||||
if metric not in ["memory", "requests", "browsers"]:
|
||||
raise HTTPException(400, f"Invalid metric: {metric}. Must be one of: memory, requests, browsers")
|
||||
if window != "5m":
|
||||
raise HTTPException(400, f"Invalid window: {window}. Only '5m' is currently supported")
|
||||
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
return monitor.get_timeline_data(metric, window)
|
||||
@@ -106,6 +120,10 @@ async def get_timeline(metric: str = "memory", window: str = "5m"):
|
||||
@router.get("/logs/janitor")
|
||||
async def get_janitor_log(limit: int = 100):
|
||||
"""Get recent janitor cleanup events."""
|
||||
# Input validation
|
||||
if limit < 1 or limit > 1000:
|
||||
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
|
||||
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
return {"events": monitor.get_janitor_log(limit)}
|
||||
@@ -117,6 +135,10 @@ async def get_janitor_log(limit: int = 100):
|
||||
@router.get("/logs/errors")
|
||||
async def get_errors_log(limit: int = 100):
|
||||
"""Get recent errors."""
|
||||
# Input validation
|
||||
if limit < 1 or limit > 1000:
|
||||
raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
|
||||
|
||||
try:
|
||||
monitor = get_monitor()
|
||||
return {"errors": monitor.get_errors_log(limit)}
|
||||
@@ -154,7 +176,7 @@ async def force_cleanup():
|
||||
killed_count += 1
|
||||
|
||||
monitor = get_monitor()
|
||||
monitor.track_janitor_event("force_cleanup", "manual", {"killed": killed_count})
|
||||
await monitor.track_janitor_event("force_cleanup", "manual", {"killed": killed_count})
|
||||
|
||||
return {"success": True, "killed_browsers": killed_count}
|
||||
except Exception as e:
|
||||
@@ -200,6 +222,12 @@ async def kill_browser(req: KillBrowserRequest):
|
||||
if not target_sig:
|
||||
raise HTTPException(404, f"Browser with sig={req.sig} not found")
|
||||
|
||||
# Warn if there are active requests (browser might be in use)
|
||||
monitor = get_monitor()
|
||||
active_count = len(monitor.get_active_requests())
|
||||
if active_count > 0:
|
||||
logger.warning(f"Killing browser {target_sig[:8]} while {active_count} requests are active - may cause failures")
|
||||
|
||||
# Kill the browser
|
||||
if pool_type == "hot":
|
||||
browser = HOT_POOL.pop(target_sig)
|
||||
@@ -215,7 +243,7 @@ async def kill_browser(req: KillBrowserRequest):
|
||||
logger.info(f"🔪 Killed {pool_type} browser (sig={target_sig[:8]})")
|
||||
|
||||
monitor = get_monitor()
|
||||
monitor.track_janitor_event("kill_browser", target_sig, {"pool": pool_type, "manual": True})
|
||||
await monitor.track_janitor_event("kill_browser", target_sig, {"pool": pool_type, "manual": True})
|
||||
|
||||
return {"success": True, "killed_sig": target_sig[:8], "pool_type": pool_type}
|
||||
except HTTPException:
|
||||
@@ -298,7 +326,7 @@ async def restart_browser(req: KillBrowserRequest):
|
||||
logger.info(f"🔄 Restarted {pool_type} browser (sig={target_sig[:8]})")
|
||||
|
||||
monitor = get_monitor()
|
||||
monitor.track_janitor_event("restart_browser", target_sig, {"pool": pool_type})
|
||||
await monitor.track_janitor_event("restart_browser", target_sig, {"pool": pool_type})
|
||||
|
||||
return {"success": True, "restarted_sig": target_sig[:8], "note": "Browser will be recreated on next request"}
|
||||
except HTTPException:
|
||||
@@ -320,3 +348,58 @@ async def reset_stats():
|
||||
except Exception as e:
|
||||
logger.error(f"Error resetting stats: {e}")
|
||||
raise HTTPException(500, str(e))
|
||||
|
||||
|
||||
@router.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
"""WebSocket endpoint for real-time monitoring updates.
|
||||
|
||||
Sends updates every 2 seconds with:
|
||||
- Health stats
|
||||
- Active/completed requests
|
||||
- Browser pool status
|
||||
- Timeline data
|
||||
"""
|
||||
await websocket.accept()
|
||||
logger.info("WebSocket client connected")
|
||||
|
||||
try:
|
||||
while True:
|
||||
try:
|
||||
# Gather all monitoring data
|
||||
monitor = get_monitor()
|
||||
|
||||
data = {
|
||||
"timestamp": asyncio.get_event_loop().time(),
|
||||
"health": await monitor.get_health_summary(),
|
||||
"requests": {
|
||||
"active": monitor.get_active_requests(),
|
||||
"completed": monitor.get_completed_requests(limit=10)
|
||||
},
|
||||
"browsers": await monitor.get_browser_list(),
|
||||
"timeline": {
|
||||
"memory": monitor.get_timeline_data("memory", "5m"),
|
||||
"requests": monitor.get_timeline_data("requests", "5m"),
|
||||
"browsers": monitor.get_timeline_data("browsers", "5m")
|
||||
},
|
||||
"janitor": monitor.get_janitor_log(limit=10),
|
||||
"errors": monitor.get_errors_log(limit=10)
|
||||
}
|
||||
|
||||
# Send update to client
|
||||
await websocket.send_json(data)
|
||||
|
||||
# Wait 2 seconds before next update
|
||||
await asyncio.sleep(2)
|
||||
|
||||
except WebSocketDisconnect:
|
||||
logger.info("WebSocket client disconnected")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"WebSocket error: {e}", exc_info=True)
|
||||
await asyncio.sleep(2) # Continue trying
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"WebSocket connection error: {e}", exc_info=True)
|
||||
finally:
|
||||
logger.info("WebSocket connection closed")
|
||||
|
||||
@@ -119,6 +119,7 @@ async def lifespan(_: FastAPI):
|
||||
# Initialize monitor
|
||||
monitor_module.monitor_stats = MonitorStats(redis)
|
||||
await monitor_module.monitor_stats.load_from_redis()
|
||||
monitor_module.monitor_stats.start_persistence_worker()
|
||||
|
||||
# Initialize browser pool
|
||||
await init_permanent(BrowserConfig(
|
||||
@@ -135,6 +136,14 @@ async def lifespan(_: FastAPI):
|
||||
# Cleanup
|
||||
app.state.janitor.cancel()
|
||||
app.state.timeline_updater.cancel()
|
||||
|
||||
# Monitor cleanup (persist stats and stop workers)
|
||||
from monitor import get_monitor
|
||||
try:
|
||||
await get_monitor().cleanup()
|
||||
except Exception as e:
|
||||
logger.error(f"Monitor cleanup failed: {e}")
|
||||
|
||||
await close_all()
|
||||
|
||||
async def _timeline_updater():
|
||||
@@ -143,7 +152,9 @@ async def _timeline_updater():
|
||||
while True:
|
||||
await asyncio.sleep(5)
|
||||
try:
|
||||
await get_monitor().update_timeline()
|
||||
await asyncio.wait_for(get_monitor().update_timeline(), timeout=4.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Timeline update timeout after 4s")
|
||||
except Exception as e:
|
||||
logger.warning(f"Timeline update error: {e}")
|
||||
|
||||
|
||||
@@ -35,6 +35,12 @@
|
||||
}
|
||||
.pulse-slow { animation: pulse-slow 2s ease-in-out infinite; }
|
||||
|
||||
@keyframes pulse-fast {
|
||||
0%, 100% { opacity: 1; transform: scale(1); }
|
||||
50% { opacity: 0.6; transform: scale(1.1); }
|
||||
}
|
||||
.pulse-fast { animation: pulse-fast 1s ease-in-out infinite; }
|
||||
|
||||
@keyframes spin-slow {
|
||||
from { transform: rotate(0deg); }
|
||||
to { transform: rotate(360deg); }
|
||||
@@ -87,6 +93,14 @@
|
||||
</h1>
|
||||
|
||||
<div class="ml-auto flex items-center space-x-4">
|
||||
<!-- Connection Status -->
|
||||
<div class="flex items-center space-x-2">
|
||||
<div id="ws-status" class="flex items-center space-x-1">
|
||||
<div class="w-2 h-2 rounded-full bg-gray-500" id="ws-indicator"></div>
|
||||
<span class="text-xs text-secondary" id="ws-text">Connecting...</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Auto-refresh toggle -->
|
||||
<div class="flex items-center space-x-2">
|
||||
<label class="text-xs text-secondary">Auto-refresh:</label>
|
||||
@@ -196,7 +210,7 @@
|
||||
|
||||
<!-- Browsers Section -->
|
||||
<section class="bg-surface rounded-lg border border-border overflow-hidden flex flex-col" style="height: 350px;">
|
||||
<div class="px-4 py-2 border-b border-border">
|
||||
<div class="px-4 py-2 border-b border-border flex items-center justify-between">
|
||||
<h3 class="text-sm font-medium text-primary">🌐 Browsers (<span id="browser-count">0</span>, <span id="browser-mem">0</span>MB)</h3>
|
||||
<div class="text-xs text-secondary">Reuse: <span id="reuse-rate" class="text-primary">--%</span></div>
|
||||
</div>
|
||||
@@ -308,9 +322,279 @@
|
||||
let autoRefresh = true;
|
||||
let refreshInterval;
|
||||
const REFRESH_RATE = 1000; // 1 second
|
||||
let websocket = null;
|
||||
let wsReconnectAttempts = 0;
|
||||
const MAX_WS_RECONNECT = 5;
|
||||
let useWebSocket = true; // Try WebSocket first, fallback to polling
|
||||
|
||||
// No more tabs - all sections visible at once!
|
||||
|
||||
// ========== WebSocket Connection ==========
|
||||
function updateConnectionStatus(status, message) {
|
||||
const indicator = document.getElementById('ws-indicator');
|
||||
const text = document.getElementById('ws-text');
|
||||
|
||||
indicator.className = 'w-2 h-2 rounded-full';
|
||||
|
||||
if (status === 'connected') {
|
||||
indicator.classList.add('bg-green-500', 'pulse-fast');
|
||||
text.textContent = 'Live';
|
||||
text.className = 'text-xs text-green-400';
|
||||
} else if (status === 'connecting') {
|
||||
indicator.classList.add('bg-yellow-500', 'pulse-slow');
|
||||
text.textContent = 'Connecting...';
|
||||
text.className = 'text-xs text-yellow-400';
|
||||
} else if (status === 'polling') {
|
||||
indicator.classList.add('bg-blue-500', 'pulse-slow');
|
||||
text.textContent = 'Polling';
|
||||
text.className = 'text-xs text-blue-400';
|
||||
} else {
|
||||
indicator.classList.add('bg-red-500');
|
||||
text.textContent = message || 'Disconnected';
|
||||
text.className = 'text-xs text-red-400';
|
||||
}
|
||||
}
|
||||
|
||||
function connectWebSocket() {
|
||||
if (wsReconnectAttempts >= MAX_WS_RECONNECT) {
|
||||
console.log('Max WebSocket reconnect attempts reached, falling back to polling');
|
||||
useWebSocket = false;
|
||||
updateConnectionStatus('polling');
|
||||
startAutoRefresh();
|
||||
return;
|
||||
}
|
||||
|
||||
updateConnectionStatus('connecting');
|
||||
wsReconnectAttempts++;
|
||||
|
||||
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||
const wsUrl = `${protocol}//${window.location.host}/monitor/ws`;
|
||||
|
||||
websocket = new WebSocket(wsUrl);
|
||||
|
||||
websocket.onopen = () => {
|
||||
console.log('WebSocket connected');
|
||||
wsReconnectAttempts = 0;
|
||||
updateConnectionStatus('connected');
|
||||
stopAutoRefresh(); // Stop polling if running
|
||||
};
|
||||
|
||||
websocket.onmessage = (event) => {
|
||||
const data = JSON.parse(event.data);
|
||||
updateDashboard(data);
|
||||
};
|
||||
|
||||
websocket.onerror = (error) => {
|
||||
console.error('WebSocket error:', error);
|
||||
};
|
||||
|
||||
websocket.onclose = () => {
|
||||
console.log('WebSocket closed');
|
||||
updateConnectionStatus('disconnected', 'Reconnecting...');
|
||||
|
||||
if (useWebSocket) {
|
||||
setTimeout(connectWebSocket, 2000 * wsReconnectAttempts);
|
||||
} else {
|
||||
startAutoRefresh();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
function updateDashboard(data) {
|
||||
// Update all dashboard sections with WebSocket data
|
||||
try {
|
||||
if (data.health) {
|
||||
updateHealthDisplay(data.health);
|
||||
}
|
||||
if (data.requests) {
|
||||
updateRequestsDisplay(data.requests);
|
||||
}
|
||||
if (data.browsers) {
|
||||
updateBrowsersDisplay(data.browsers);
|
||||
}
|
||||
if (data.janitor) {
|
||||
updateJanitorDisplay(data.janitor);
|
||||
}
|
||||
if (data.errors && data.errors.length > 0) {
|
||||
updateErrorsDisplay(data.errors);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('Error updating dashboard:', e);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions to update displays from WebSocket data
|
||||
function updateHealthDisplay(health) {
|
||||
const cpu = health.container.cpu_percent;
|
||||
const mem = health.container.memory_percent;
|
||||
|
||||
document.getElementById('cpu-percent').textContent = cpu.toFixed(1) + '%';
|
||||
document.getElementById('cpu-bar').style.width = Math.min(cpu, 100) + '%';
|
||||
document.getElementById('cpu-bar').className = `progress-bar h-2 rounded-full ${cpu > 80 ? 'bg-red-500' : cpu > 60 ? 'bg-yellow-500' : 'bg-primary'}`;
|
||||
|
||||
document.getElementById('mem-percent').textContent = mem.toFixed(1) + '%';
|
||||
document.getElementById('mem-bar').style.width = Math.min(mem, 100) + '%';
|
||||
document.getElementById('mem-bar').className = `progress-bar h-2 rounded-full ${mem > 80 ? 'bg-red-500' : mem > 60 ? 'bg-yellow-500' : 'bg-accent'}`;
|
||||
|
||||
document.getElementById('net-sent').textContent = health.container.network_sent_mb.toFixed(1);
|
||||
document.getElementById('net-recv').textContent = health.container.network_recv_mb.toFixed(1);
|
||||
|
||||
const uptime = formatUptime(health.container.uptime_seconds);
|
||||
document.getElementById('uptime').textContent = uptime;
|
||||
|
||||
const perm = health.pool.permanent;
|
||||
document.getElementById('pool-perm').textContent = `${perm.active ? 'ACTIVE' : 'INACTIVE'} (${perm.memory_mb}MB)`;
|
||||
document.getElementById('pool-perm').className = perm.active ? 'text-primary ml-2' : 'text-secondary ml-2';
|
||||
|
||||
document.getElementById('pool-hot').textContent = `${health.pool.hot.count} (${health.pool.hot.memory_mb}MB)`;
|
||||
document.getElementById('pool-cold').textContent = `${health.pool.cold.count} (${health.pool.cold.memory_mb}MB)`;
|
||||
|
||||
document.getElementById('janitor-status').textContent = health.janitor.next_cleanup_estimate;
|
||||
const pressure = health.janitor.memory_pressure;
|
||||
const pressureEl = document.getElementById('mem-pressure');
|
||||
pressureEl.textContent = pressure;
|
||||
pressureEl.className = pressure === 'HIGH' ? 'text-red-500' : pressure === 'MEDIUM' ? 'text-yellow-500' : 'text-green-500';
|
||||
|
||||
document.getElementById('last-update').textContent = 'Live: ' + new Date().toLocaleTimeString();
|
||||
}
|
||||
|
||||
function updateRequestsDisplay(requests) {
|
||||
// Update active requests count
|
||||
const activeCount = document.getElementById('active-count');
|
||||
if (activeCount) activeCount.textContent = requests.active.length;
|
||||
|
||||
// Update active requests list
|
||||
const activeList = document.getElementById('active-requests-list');
|
||||
if (activeList) {
|
||||
if (requests.active.length === 0) {
|
||||
activeList.innerHTML = '<div class="text-secondary text-center py-2">No active requests</div>';
|
||||
} else {
|
||||
activeList.innerHTML = requests.active.map(req => `
|
||||
<div class="flex items-center justify-between p-2 bg-dark rounded border border-border">
|
||||
<span class="text-primary">${req.id.substring(0, 8)}</span>
|
||||
<span class="text-secondary">${req.endpoint}</span>
|
||||
<span class="text-light truncate max-w-[200px]" title="${req.url}">${req.url}</span>
|
||||
<span class="text-accent">${req.elapsed.toFixed(1)}s</span>
|
||||
<span class="pulse-slow">⏳</span>
|
||||
</div>
|
||||
`).join('');
|
||||
}
|
||||
}
|
||||
|
||||
// Update completed requests
|
||||
const completedList = document.getElementById('completed-requests-list');
|
||||
if (completedList) {
|
||||
if (requests.completed.length === 0) {
|
||||
completedList.innerHTML = '<div class="text-secondary text-center py-2">No completed requests</div>';
|
||||
} else {
|
||||
completedList.innerHTML = requests.completed.map(req => `
|
||||
<div class="flex items-center gap-3 p-2 bg-dark rounded">
|
||||
<span class="text-secondary w-16 flex-shrink-0">${req.id.substring(0, 8)}</span>
|
||||
<span class="text-secondary w-16 flex-shrink-0">${req.endpoint}</span>
|
||||
<span class="text-light truncate flex-1" title="${req.url}">${req.url}</span>
|
||||
<span class="w-12 flex-shrink-0 text-right">${req.elapsed.toFixed(2)}s</span>
|
||||
<span class="text-secondary w-16 flex-shrink-0 text-right">${req.mem_delta > 0 ? '+' : ''}${req.mem_delta}MB</span>
|
||||
<span class="w-12 flex-shrink-0 text-right">${req.success ? '✅' : '❌'} ${req.status_code}</span>
|
||||
</div>
|
||||
`).join('');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function updateBrowsersDisplay(browsers) {
|
||||
const tbody = document.getElementById('browsers-table-body');
|
||||
if (tbody) {
|
||||
if (browsers.length === 0) {
|
||||
tbody.innerHTML = '<tr><td colspan="6" class="text-center py-2 text-secondary">No browsers</td></tr>';
|
||||
} else {
|
||||
tbody.innerHTML = browsers.map(b => {
|
||||
const typeIcon = b.type === 'permanent' ? '🔥' : b.type === 'hot' ? '♨️' : '❄️';
|
||||
const typeColor = b.type === 'permanent' ? 'text-primary' : b.type === 'hot' ? 'text-accent' : 'text-light';
|
||||
|
||||
return `
|
||||
<tr class="border-t border-border hover:bg-dark">
|
||||
<td class="py-1 pr-2"><span class="${typeColor}">${typeIcon} ${b.type}</span></td>
|
||||
<td class="py-1 pr-2 font-mono text-xs">${b.sig}</td>
|
||||
<td class="py-1 pr-2">${formatSeconds(b.age_seconds || 0)}</td>
|
||||
<td class="py-1 pr-2">${formatSeconds(b.last_used_seconds || 0)}</td>
|
||||
<td class="py-1 pr-2">${b.hits}</td>
|
||||
<td class="py-1">
|
||||
${b.killable ? `
|
||||
<button onclick="killBrowser('${b.sig}')" class="text-red-500 hover:underline text-xs">X</button>
|
||||
` : `
|
||||
<button onclick="restartBrowser('permanent')" class="text-primary hover:underline text-xs">↻</button>
|
||||
`}
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
}).join('');
|
||||
}
|
||||
}
|
||||
|
||||
// Update browser count and total memory
|
||||
const countEl = document.getElementById('browser-count');
|
||||
if (countEl) countEl.textContent = browsers.length;
|
||||
|
||||
const memEl = document.getElementById('browser-mem');
|
||||
if (memEl) {
|
||||
const totalMem = browsers.reduce((sum, b) => sum + (b.memory_mb || 0), 0);
|
||||
memEl.textContent = totalMem;
|
||||
}
|
||||
|
||||
// Update reuse rate (if available from summary data)
|
||||
// Note: WebSocket sends just browsers array, not summary
|
||||
// Reuse rate calculation would need to be added to monitor.py
|
||||
const reuseEl = document.getElementById('reuse-rate');
|
||||
if (reuseEl) {
|
||||
reuseEl.textContent = '---%'; // Not available in real-time yet
|
||||
}
|
||||
}
|
||||
|
||||
function updateJanitorDisplay(events) {
|
||||
const janitorLog = document.getElementById('janitor-log');
|
||||
if (janitorLog) {
|
||||
if (events.length === 0) {
|
||||
janitorLog.innerHTML = '<div class="text-secondary text-center py-4">No events yet</div>';
|
||||
} else {
|
||||
janitorLog.innerHTML = events.slice(0, 10).reverse().map(evt => {
|
||||
const time = new Date(evt.timestamp * 1000).toLocaleTimeString();
|
||||
const icon = evt.type === 'close_cold' ? '🧹❄️' : evt.type === 'close_hot' ? '🧹♨️' : '⬆️';
|
||||
const details = JSON.stringify(evt.details);
|
||||
|
||||
return `<div class="p-2 bg-dark rounded">
|
||||
<span class="text-secondary">${time}</span>
|
||||
<span>${icon}</span>
|
||||
<span class="text-primary">${evt.type}</span>
|
||||
<span class="text-secondary">sig=${evt.sig}</span>
|
||||
<span class="text-xs text-secondary ml-2">${details}</span>
|
||||
</div>`;
|
||||
}).join('');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function updateErrorsDisplay(errors) {
|
||||
const errorLog = document.getElementById('errors-log');
|
||||
if (errorLog) {
|
||||
if (errors.length === 0) {
|
||||
errorLog.innerHTML = '<div class="text-secondary text-center py-4">No errors</div>';
|
||||
} else {
|
||||
errorLog.innerHTML = errors.slice(0, 10).reverse().map(err => {
|
||||
const time = new Date(err.timestamp * 1000).toLocaleTimeString();
|
||||
|
||||
return `<div class="p-2 bg-dark rounded border border-red-500">
|
||||
<div class="flex justify-between">
|
||||
<span class="text-secondary">${time}</span>
|
||||
<span class="text-red-500">${err.endpoint}</span>
|
||||
</div>
|
||||
<div class="text-xs text-light mt-1">${err.url}</div>
|
||||
<div class="text-xs text-red-400 mt-1 font-mono">${err.error}</div>
|
||||
</div>`;
|
||||
}).join('');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ========== Auto-refresh Toggle ==========
|
||||
document.getElementById('auto-refresh-toggle').addEventListener('click', function() {
|
||||
autoRefresh = !autoRefresh;
|
||||
@@ -426,13 +710,13 @@
|
||||
completedList.innerHTML = '<div class="text-secondary text-center py-2">No completed requests</div>';
|
||||
} else {
|
||||
completedList.innerHTML = data.completed.map(req => `
|
||||
<div class="flex items-center justify-between p-2 bg-dark rounded">
|
||||
<span class="text-secondary">${req.id.substring(0, 8)}</span>
|
||||
<span class="text-secondary">${req.endpoint}</span>
|
||||
<span class="text-light truncate max-w-[180px]" title="${req.url}">${req.url}</span>
|
||||
<span>${req.elapsed.toFixed(2)}s</span>
|
||||
<span class="text-secondary">${req.mem_delta > 0 ? '+' : ''}${req.mem_delta}MB</span>
|
||||
<span>${req.success ? '✅' : '❌'} ${req.status_code}</span>
|
||||
<div class="flex items-center gap-3 p-2 bg-dark rounded">
|
||||
<span class="text-secondary w-16 flex-shrink-0">${req.id.substring(0, 8)}</span>
|
||||
<span class="text-secondary w-16 flex-shrink-0">${req.endpoint}</span>
|
||||
<span class="text-light truncate flex-1" title="${req.url}">${req.url}</span>
|
||||
<span class="w-12 flex-shrink-0 text-right">${req.elapsed.toFixed(2)}s</span>
|
||||
<span class="text-secondary w-16 flex-shrink-0 text-right">${req.mem_delta > 0 ? '+' : ''}${req.mem_delta}MB</span>
|
||||
<span class="w-12 flex-shrink-0 text-right">${req.success ? '✅' : '❌'} ${req.status_code}</span>
|
||||
</div>
|
||||
`).join('');
|
||||
}
|
||||
@@ -460,7 +744,7 @@
|
||||
|
||||
return `
|
||||
<tr class="border-t border-border hover:bg-dark">
|
||||
<td class="py-1 pr-2"><span class="${typeColor}">${typeIcon}</span></td>
|
||||
<td class="py-1 pr-2"><span class="${typeColor}">${typeIcon} ${b.type}</span></td>
|
||||
<td class="py-1 pr-2 font-mono text-xs">${b.sig}</td>
|
||||
<td class="py-1 pr-2">${formatSeconds(b.age_seconds)}</td>
|
||||
<td class="py-1 pr-2">${formatSeconds(b.last_used_seconds)}</td>
|
||||
@@ -779,7 +1063,8 @@
|
||||
document.getElementById('filter-requests')?.addEventListener('change', fetchRequests);
|
||||
|
||||
// ========== Initialize ==========
|
||||
startAutoRefresh();
|
||||
// Try WebSocket first, fallback to polling on failure
|
||||
connectWebSocket();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
34
deploy/docker/test-websocket.py
Executable file
34
deploy/docker/test-websocket.py
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick WebSocket test - Connect to monitor WebSocket and print updates
|
||||
"""
|
||||
import asyncio
|
||||
import websockets
|
||||
import json
|
||||
|
||||
async def test_websocket():
|
||||
uri = "ws://localhost:11235/monitor/ws"
|
||||
print(f"Connecting to {uri}...")
|
||||
|
||||
try:
|
||||
async with websockets.connect(uri) as websocket:
|
||||
print("✅ Connected!")
|
||||
|
||||
# Receive and print 5 updates
|
||||
for i in range(5):
|
||||
message = await websocket.recv()
|
||||
data = json.loads(message)
|
||||
print(f"\n📊 Update #{i+1}:")
|
||||
print(f" - Health: CPU {data['health']['container']['cpu_percent']}%, Memory {data['health']['container']['memory_percent']}%")
|
||||
print(f" - Active Requests: {len(data['requests']['active'])}")
|
||||
print(f" - Browsers: {len(data['browsers'])}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
return 1
|
||||
|
||||
print("\n✅ WebSocket test passed!")
|
||||
return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(asyncio.run(test_websocket()))
|
||||
Reference in New Issue
Block a user