From e2af031b09aab445ae6969010994f5344e8e03a7 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 17 Oct 2025 21:36:25 +0800 Subject: [PATCH] feat(monitor): add real-time monitoring dashboard with Redis persistence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete observability solution for production deployments with terminal-style UI. **Backend Implementation:** - `monitor.py`: Stats manager tracking requests, browsers, errors, timeline data - `monitor_routes.py`: REST API endpoints for all monitor functionality - GET /monitor/health - System health snapshot - GET /monitor/requests - Active & completed requests - GET /monitor/browsers - Browser pool details - GET /monitor/endpoints/stats - Aggregated endpoint analytics - GET /monitor/timeline - Time-series data (memory, requests, browsers) - GET /monitor/logs/{janitor,errors} - Event logs - POST /monitor/actions/{cleanup,kill_browser,restart_browser} - Control actions - POST /monitor/stats/reset - Reset counters - Redis persistence for endpoint stats (survives restart) - Timeline tracking (5min window, 5s resolution, 60 data points) **Frontend Dashboard** (`/dashboard`): - **System Health Bar**: CPU%, Memory%, Network I/O, Uptime - **Pool Status**: Live counts (permanent/hot/cold browsers + memory) - **Live Activity Tabs**: - Requests: Active (realtime) + recent completed (last 100) - Browsers: Detailed table with actions (kill/restart) - Janitor: Cleanup event log with timestamps - Errors: Recent errors with stack traces - **Endpoint Analytics**: Count, avg latency, success%, pool hit% - **Resource Timeline**: SVG charts (memory/requests/browsers) with terminal aesthetics - **Control Actions**: Force cleanup, restart permanent, reset stats - **Auto-refresh**: 5s polling (toggleable) **Integration:** - Janitor events tracked (close_cold, close_hot, promote) - Crawler pool promotion events logged - Timeline updater background task (5s interval) - Lifespan hooks for monitor initialization **UI Design:** - Terminal vibe matching Crawl4AI theme - Dark background, cyan/pink accents, monospace font - Neon glow effects on charts - Responsive layout, hover interactions - Cross-navigation: Playground ↔ Monitor **Key Features:** - Zero-config: Works out of the box with existing Redis - Real-time visibility into pool efficiency - Manual browser management (kill/restart) - Historical data persistence - DevOps-friendly UX Routes: - API: `/monitor/*` (backend endpoints) - UI: `/dashboard` (static HTML) --- deploy/docker/crawler_pool.py | 28 +- deploy/docker/monitor.py | 305 ++++++++ deploy/docker/monitor_routes.py | 322 ++++++++ deploy/docker/server.py | 42 ++ deploy/docker/static/monitor/index.html | 813 +++++++++++++++++++++ deploy/docker/static/playground/index.html | 13 +- 6 files changed, 1516 insertions(+), 7 deletions(-) create mode 100644 deploy/docker/monitor.py create mode 100644 deploy/docker/monitor_routes.py create mode 100644 deploy/docker/static/monitor/index.html diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 226e3680..95593b3f 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -57,6 +57,14 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if USAGE_COUNT[sig] >= 3: logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})") HOT_POOL[sig] = COLD_POOL.pop(sig) + + # Track promotion in monitor + try: + from monitor import get_monitor + get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]}) + except: + pass + return HOT_POOL[sig] logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})") @@ -124,23 +132,39 @@ async def janitor(): # Clean cold pool for sig in list(COLD_POOL.keys()): if now - LAST_USED.get(sig, now) > cold_ttl: - logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={now - LAST_USED[sig]:.0f}s)") + idle_time = now - LAST_USED[sig] + logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)") with suppress(Exception): await COLD_POOL[sig].close() COLD_POOL.pop(sig, None) LAST_USED.pop(sig, None) USAGE_COUNT.pop(sig, None) + # Track in monitor + try: + from monitor import get_monitor + get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl}) + except: + pass + # Clean hot pool (more conservative) for sig in list(HOT_POOL.keys()): if now - LAST_USED.get(sig, now) > hot_ttl: - logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={now - LAST_USED[sig]:.0f}s)") + idle_time = now - LAST_USED[sig] + logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)") with suppress(Exception): await HOT_POOL[sig].close() HOT_POOL.pop(sig, None) LAST_USED.pop(sig, None) USAGE_COUNT.pop(sig, None) + # Track in monitor + try: + from monitor import get_monitor + get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl}) + except: + pass + # Log pool stats if mem_pct > 60: logger.info(f"📊 Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, mem={mem_pct:.1f}%") diff --git a/deploy/docker/monitor.py b/deploy/docker/monitor.py new file mode 100644 index 00000000..3735280c --- /dev/null +++ b/deploy/docker/monitor.py @@ -0,0 +1,305 @@ +# monitor.py - Real-time monitoring stats with Redis persistence +import time +import json +import asyncio +from typing import Dict, List, Optional +from datetime import datetime, timezone +from collections import deque +from redis import asyncio as aioredis +from utils import get_container_memory_percent +import psutil +import logging + +logger = logging.getLogger(__name__) + +class MonitorStats: + """Tracks real-time server stats with Redis persistence.""" + + def __init__(self, redis: aioredis.Redis): + self.redis = redis + self.start_time = time.time() + + # In-memory queues (fast reads, Redis backup) + self.active_requests: Dict[str, Dict] = {} # id -> request info + self.completed_requests: deque = deque(maxlen=100) # Last 100 + self.janitor_events: deque = deque(maxlen=100) + self.errors: deque = deque(maxlen=100) + + # Endpoint stats (persisted in Redis) + self.endpoint_stats: Dict[str, Dict] = {} # endpoint -> {count, total_time, errors, ...} + + # Timeline data (5min window, 5s resolution = 60 points) + self.memory_timeline: deque = deque(maxlen=60) + self.requests_timeline: deque = deque(maxlen=60) + self.browser_timeline: deque = deque(maxlen=60) + + async def track_request_start(self, request_id: str, endpoint: str, url: str, config: Dict = None): + """Track new request start.""" + req_info = { + "id": request_id, + "endpoint": endpoint, + "url": url[:100], # Truncate long URLs + "start_time": time.time(), + "config_sig": config.get("sig", "default") if config else "default", + "mem_start": psutil.Process().memory_info().rss / (1024 * 1024) + } + self.active_requests[request_id] = req_info + + # Increment endpoint counter + if endpoint not in self.endpoint_stats: + self.endpoint_stats[endpoint] = { + "count": 0, "total_time": 0, "errors": 0, + "pool_hits": 0, "success": 0 + } + self.endpoint_stats[endpoint]["count"] += 1 + + # Persist to Redis (fire and forget) + asyncio.create_task(self._persist_endpoint_stats()) + + async def track_request_end(self, request_id: str, success: bool, error: str = None, + pool_hit: bool = True, status_code: int = 200): + """Track request completion.""" + if request_id not in self.active_requests: + return + + req_info = self.active_requests.pop(request_id) + end_time = time.time() + elapsed = end_time - req_info["start_time"] + mem_end = psutil.Process().memory_info().rss / (1024 * 1024) + mem_delta = mem_end - req_info["mem_start"] + + # Update stats + endpoint = req_info["endpoint"] + if endpoint in self.endpoint_stats: + self.endpoint_stats[endpoint]["total_time"] += elapsed + if success: + self.endpoint_stats[endpoint]["success"] += 1 + else: + self.endpoint_stats[endpoint]["errors"] += 1 + if pool_hit: + self.endpoint_stats[endpoint]["pool_hits"] += 1 + + # Add to completed queue + completed = { + **req_info, + "end_time": end_time, + "elapsed": round(elapsed, 2), + "mem_delta": round(mem_delta, 1), + "success": success, + "error": error, + "status_code": status_code, + "pool_hit": pool_hit + } + self.completed_requests.append(completed) + + # Track errors + if not success and error: + self.errors.append({ + "timestamp": end_time, + "endpoint": endpoint, + "url": req_info["url"], + "error": error, + "request_id": request_id + }) + + await self._persist_endpoint_stats() + + def track_janitor_event(self, event_type: str, sig: str, details: Dict): + """Track janitor cleanup events.""" + self.janitor_events.append({ + "timestamp": time.time(), + "type": event_type, # "close_cold", "close_hot", "promote" + "sig": sig[:8], + "details": details + }) + + async def update_timeline(self): + """Update timeline data points (called every 5s).""" + now = time.time() + mem_pct = get_container_memory_percent() + + # Count requests in last 5s + recent_reqs = sum(1 for req in self.completed_requests + if now - req.get("end_time", 0) < 5) + + # Browser counts (need to import from crawler_pool) + from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL + browser_count = { + "permanent": 1 if PERMANENT else 0, + "hot": len(HOT_POOL), + "cold": len(COLD_POOL) + } + + self.memory_timeline.append({"time": now, "value": mem_pct}) + self.requests_timeline.append({"time": now, "value": recent_reqs}) + self.browser_timeline.append({"time": now, "browsers": browser_count}) + + async def _persist_endpoint_stats(self): + """Persist endpoint stats to Redis.""" + try: + await self.redis.set( + "monitor:endpoint_stats", + json.dumps(self.endpoint_stats), + ex=86400 # 24h TTL + ) + except Exception as e: + logger.warning(f"Failed to persist endpoint stats: {e}") + + async def load_from_redis(self): + """Load persisted stats from Redis.""" + try: + data = await self.redis.get("monitor:endpoint_stats") + if data: + self.endpoint_stats = json.loads(data) + logger.info("Loaded endpoint stats from Redis") + except Exception as e: + logger.warning(f"Failed to load from Redis: {e}") + + def get_health_summary(self) -> Dict: + """Get current system health snapshot.""" + mem_pct = get_container_memory_percent() + cpu_pct = psutil.cpu_percent(interval=0.1) + + # Network I/O (delta since last call) + net = psutil.net_io_counters() + + # Pool status + from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED + permanent_mem = 270 if PERMANENT else 0 # Estimate + hot_mem = len(HOT_POOL) * 180 # Estimate 180MB per browser + cold_mem = len(COLD_POOL) * 180 + + return { + "container": { + "memory_percent": round(mem_pct, 1), + "cpu_percent": round(cpu_pct, 1), + "network_sent_mb": round(net.bytes_sent / (1024**2), 2), + "network_recv_mb": round(net.bytes_recv / (1024**2), 2), + "uptime_seconds": int(time.time() - self.start_time) + }, + "pool": { + "permanent": {"active": PERMANENT is not None, "memory_mb": permanent_mem}, + "hot": {"count": len(HOT_POOL), "memory_mb": hot_mem}, + "cold": {"count": len(COLD_POOL), "memory_mb": cold_mem}, + "total_memory_mb": permanent_mem + hot_mem + cold_mem + }, + "janitor": { + "next_cleanup_estimate": "adaptive", # Would need janitor state + "memory_pressure": "LOW" if mem_pct < 60 else "MEDIUM" if mem_pct < 80 else "HIGH" + } + } + + def get_active_requests(self) -> List[Dict]: + """Get list of currently active requests.""" + now = time.time() + return [ + { + **req, + "elapsed": round(now - req["start_time"], 1), + "status": "running" + } + for req in self.active_requests.values() + ] + + def get_completed_requests(self, limit: int = 50, filter_status: str = "all") -> List[Dict]: + """Get recent completed requests.""" + requests = list(self.completed_requests)[-limit:] + if filter_status == "success": + requests = [r for r in requests if r.get("success")] + elif filter_status == "error": + requests = [r for r in requests if not r.get("success")] + return requests + + def get_browser_list(self) -> List[Dict]: + """Get detailed browser pool information.""" + from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG + + browsers = [] + now = time.time() + + if PERMANENT: + browsers.append({ + "type": "permanent", + "sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown", + "age_seconds": int(now - self.start_time), + "last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)), + "memory_mb": 270, + "hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0), + "killable": False + }) + + for sig, crawler in HOT_POOL.items(): + browsers.append({ + "type": "hot", + "sig": sig[:8], + "age_seconds": int(now - self.start_time), # Approximation + "last_used_seconds": int(now - LAST_USED.get(sig, now)), + "memory_mb": 180, # Estimate + "hits": USAGE_COUNT.get(sig, 0), + "killable": True + }) + + for sig, crawler in COLD_POOL.items(): + browsers.append({ + "type": "cold", + "sig": sig[:8], + "age_seconds": int(now - self.start_time), + "last_used_seconds": int(now - LAST_USED.get(sig, now)), + "memory_mb": 180, + "hits": USAGE_COUNT.get(sig, 0), + "killable": True + }) + + return browsers + + def get_endpoint_stats_summary(self) -> Dict[str, Dict]: + """Get aggregated endpoint statistics.""" + summary = {} + for endpoint, stats in self.endpoint_stats.items(): + count = stats["count"] + avg_time = (stats["total_time"] / count) if count > 0 else 0 + success_rate = (stats["success"] / count * 100) if count > 0 else 0 + pool_hit_rate = (stats["pool_hits"] / count * 100) if count > 0 else 0 + + summary[endpoint] = { + "count": count, + "avg_latency_ms": round(avg_time * 1000, 1), + "success_rate_percent": round(success_rate, 1), + "pool_hit_rate_percent": round(pool_hit_rate, 1), + "errors": stats["errors"] + } + return summary + + def get_timeline_data(self, metric: str, window: str = "5m") -> Dict: + """Get timeline data for charts.""" + # For now, only 5m window supported + if metric == "memory": + data = list(self.memory_timeline) + elif metric == "requests": + data = list(self.requests_timeline) + elif metric == "browsers": + data = list(self.browser_timeline) + else: + return {"timestamps": [], "values": []} + + return { + "timestamps": [int(d["time"]) for d in data], + "values": [d.get("value", d.get("browsers")) for d in data] + } + + def get_janitor_log(self, limit: int = 100) -> List[Dict]: + """Get recent janitor events.""" + return list(self.janitor_events)[-limit:] + + def get_errors_log(self, limit: int = 100) -> List[Dict]: + """Get recent errors.""" + return list(self.errors)[-limit:] + +# Global instance (initialized in server.py) +monitor_stats: Optional[MonitorStats] = None + +def get_monitor() -> MonitorStats: + """Get global monitor instance.""" + if monitor_stats is None: + raise RuntimeError("Monitor not initialized") + return monitor_stats diff --git a/deploy/docker/monitor_routes.py b/deploy/docker/monitor_routes.py new file mode 100644 index 00000000..e7451468 --- /dev/null +++ b/deploy/docker/monitor_routes.py @@ -0,0 +1,322 @@ +# monitor_routes.py - Monitor API endpoints +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel +from typing import Optional +from monitor import get_monitor +import logging + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/monitor", tags=["monitor"]) + + +@router.get("/health") +async def get_health(): + """Get current system health snapshot.""" + try: + monitor = get_monitor() + return monitor.get_health_summary() + except Exception as e: + logger.error(f"Error getting health: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/requests") +async def get_requests(status: str = "all", limit: int = 50): + """Get active and completed requests. + + Args: + status: Filter by 'active', 'completed', 'success', 'error', or 'all' + limit: Max number of completed requests to return (default 50) + """ + try: + monitor = get_monitor() + + if status == "active": + return {"active": monitor.get_active_requests(), "completed": []} + elif status == "completed": + return {"active": [], "completed": monitor.get_completed_requests(limit)} + elif status in ["success", "error"]: + return {"active": [], "completed": monitor.get_completed_requests(limit, status)} + else: # "all" + return { + "active": monitor.get_active_requests(), + "completed": monitor.get_completed_requests(limit) + } + except Exception as e: + logger.error(f"Error getting requests: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/browsers") +async def get_browsers(): + """Get detailed browser pool information.""" + try: + monitor = get_monitor() + browsers = monitor.get_browser_list() + + # Calculate summary stats + total_browsers = len(browsers) + total_memory = sum(b["memory_mb"] for b in browsers) + + # Calculate reuse rate from recent requests + recent = monitor.get_completed_requests(100) + pool_hits = sum(1 for r in recent if r.get("pool_hit", False)) + reuse_rate = (pool_hits / len(recent) * 100) if recent else 0 + + return { + "browsers": browsers, + "summary": { + "total_count": total_browsers, + "total_memory_mb": total_memory, + "reuse_rate_percent": round(reuse_rate, 1) + } + } + except Exception as e: + logger.error(f"Error getting browsers: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/endpoints/stats") +async def get_endpoint_stats(): + """Get aggregated endpoint statistics.""" + try: + monitor = get_monitor() + return monitor.get_endpoint_stats_summary() + except Exception as e: + logger.error(f"Error getting endpoint stats: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/timeline") +async def get_timeline(metric: str = "memory", window: str = "5m"): + """Get timeline data for charts. + + Args: + metric: 'memory', 'requests', or 'browsers' + window: Time window (only '5m' supported for now) + """ + try: + monitor = get_monitor() + return monitor.get_timeline_data(metric, window) + except Exception as e: + logger.error(f"Error getting timeline: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/logs/janitor") +async def get_janitor_log(limit: int = 100): + """Get recent janitor cleanup events.""" + try: + monitor = get_monitor() + return {"events": monitor.get_janitor_log(limit)} + except Exception as e: + logger.error(f"Error getting janitor log: {e}") + raise HTTPException(500, str(e)) + + +@router.get("/logs/errors") +async def get_errors_log(limit: int = 100): + """Get recent errors.""" + try: + monitor = get_monitor() + return {"errors": monitor.get_errors_log(limit)} + except Exception as e: + logger.error(f"Error getting errors log: {e}") + raise HTTPException(500, str(e)) + + +# ========== Control Actions ========== + +class KillBrowserRequest(BaseModel): + sig: str + + +@router.post("/actions/cleanup") +async def force_cleanup(): + """Force immediate janitor cleanup (kills idle cold pool browsers).""" + try: + from crawler_pool import COLD_POOL, LAST_USED, USAGE_COUNT, LOCK + import time + from contextlib import suppress + + killed_count = 0 + now = time.time() + + async with LOCK: + for sig in list(COLD_POOL.keys()): + # Kill all cold pool browsers immediately + logger.info(f"🧹 Force cleanup: closing cold browser (sig={sig[:8]})") + with suppress(Exception): + await COLD_POOL[sig].close() + COLD_POOL.pop(sig, None) + LAST_USED.pop(sig, None) + USAGE_COUNT.pop(sig, None) + killed_count += 1 + + monitor = get_monitor() + monitor.track_janitor_event("force_cleanup", "manual", {"killed": killed_count}) + + return {"success": True, "killed_browsers": killed_count} + except Exception as e: + logger.error(f"Error during force cleanup: {e}") + raise HTTPException(500, str(e)) + + +@router.post("/actions/kill_browser") +async def kill_browser(req: KillBrowserRequest): + """Kill a specific browser by signature (hot or cold only). + + Args: + sig: Browser config signature (first 8 chars) + """ + try: + from crawler_pool import HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG + from contextlib import suppress + + # Find full signature matching prefix + target_sig = None + pool_type = None + + async with LOCK: + # Check hot pool + for sig in HOT_POOL.keys(): + if sig.startswith(req.sig): + target_sig = sig + pool_type = "hot" + break + + # Check cold pool + if not target_sig: + for sig in COLD_POOL.keys(): + if sig.startswith(req.sig): + target_sig = sig + pool_type = "cold" + break + + # Check if trying to kill permanent + if DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig): + raise HTTPException(403, "Cannot kill permanent browser. Use restart instead.") + + if not target_sig: + raise HTTPException(404, f"Browser with sig={req.sig} not found") + + # Kill the browser + if pool_type == "hot": + browser = HOT_POOL.pop(target_sig) + else: + browser = COLD_POOL.pop(target_sig) + + with suppress(Exception): + await browser.close() + + LAST_USED.pop(target_sig, None) + USAGE_COUNT.pop(target_sig, None) + + logger.info(f"🔪 Killed {pool_type} browser (sig={target_sig[:8]})") + + monitor = get_monitor() + monitor.track_janitor_event("kill_browser", target_sig, {"pool": pool_type, "manual": True}) + + return {"success": True, "killed_sig": target_sig[:8], "pool_type": pool_type} + except HTTPException: + raise + except Exception as e: + logger.error(f"Error killing browser: {e}") + raise HTTPException(500, str(e)) + + +@router.post("/actions/restart_browser") +async def restart_browser(req: KillBrowserRequest): + """Restart a browser (kill + recreate). Works for permanent too. + + Args: + sig: Browser config signature (first 8 chars), or "permanent" + """ + try: + from crawler_pool import (PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, + USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG, init_permanent) + from crawl4ai import AsyncWebCrawler, BrowserConfig + from contextlib import suppress + import time + + # Handle permanent browser restart + if req.sig == "permanent" or (DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig)): + async with LOCK: + if PERMANENT: + with suppress(Exception): + await PERMANENT.close() + + # Reinitialize permanent + from utils import load_config + config = load_config() + await init_permanent(BrowserConfig( + extra_args=config["crawler"]["browser"].get("extra_args", []), + **config["crawler"]["browser"].get("kwargs", {}), + )) + + logger.info("🔄 Restarted permanent browser") + return {"success": True, "restarted": "permanent"} + + # Handle hot/cold browser restart + target_sig = None + pool_type = None + browser_config = None + + async with LOCK: + # Find browser + for sig in HOT_POOL.keys(): + if sig.startswith(req.sig): + target_sig = sig + pool_type = "hot" + # Would need to reconstruct config (not stored currently) + break + + if not target_sig: + for sig in COLD_POOL.keys(): + if sig.startswith(req.sig): + target_sig = sig + pool_type = "cold" + break + + if not target_sig: + raise HTTPException(404, f"Browser with sig={req.sig} not found") + + # Kill existing + if pool_type == "hot": + browser = HOT_POOL.pop(target_sig) + else: + browser = COLD_POOL.pop(target_sig) + + with suppress(Exception): + await browser.close() + + # Note: We can't easily recreate with same config without storing it + # For now, just kill and let new requests create fresh ones + LAST_USED.pop(target_sig, None) + USAGE_COUNT.pop(target_sig, None) + + logger.info(f"🔄 Restarted {pool_type} browser (sig={target_sig[:8]})") + + monitor = get_monitor() + monitor.track_janitor_event("restart_browser", target_sig, {"pool": pool_type}) + + return {"success": True, "restarted_sig": target_sig[:8], "note": "Browser will be recreated on next request"} + except HTTPException: + raise + except Exception as e: + logger.error(f"Error restarting browser: {e}") + raise HTTPException(500, str(e)) + + +@router.post("/stats/reset") +async def reset_stats(): + """Reset today's endpoint counters.""" + try: + monitor = get_monitor() + monitor.endpoint_stats.clear() + await monitor._persist_endpoint_stats() + + return {"success": True, "message": "Endpoint stats reset"} + except Exception as e: + logger.error(f"Error resetting stats: {e}") + raise HTTPException(500, str(e)) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 30639852..efb1cecb 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -16,6 +16,7 @@ from fastapi import Request, Depends from fastapi.responses import FileResponse import base64 import re +import logging from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from api import ( handle_markdown_request, handle_llm_qa, @@ -112,15 +113,40 @@ AsyncWebCrawler.arun = capped_arun @asynccontextmanager async def lifespan(_: FastAPI): from crawler_pool import init_permanent + from monitor import MonitorStats + import monitor as monitor_module + + # Initialize monitor + monitor_module.monitor_stats = MonitorStats(redis) + await monitor_module.monitor_stats.load_from_redis() + + # Initialize browser pool await init_permanent(BrowserConfig( extra_args=config["crawler"]["browser"].get("extra_args", []), **config["crawler"]["browser"].get("kwargs", {}), )) + + # Start background tasks app.state.janitor = asyncio.create_task(janitor()) + app.state.timeline_updater = asyncio.create_task(_timeline_updater()) + yield + + # Cleanup app.state.janitor.cancel() + app.state.timeline_updater.cancel() await close_all() +async def _timeline_updater(): + """Update timeline data every 5 seconds.""" + from monitor import get_monitor + while True: + await asyncio.sleep(5) + try: + await get_monitor().update_timeline() + except Exception as e: + logger.warning(f"Timeline update error: {e}") + # ───────────────────── FastAPI instance ────────────────────── app = FastAPI( title=config["app"]["title"], @@ -138,6 +164,16 @@ app.mount( name="play", ) +# ── static monitor dashboard ──────────────────────────────── +MONITOR_DIR = pathlib.Path(__file__).parent / "static" / "monitor" +if not MONITOR_DIR.exists(): + raise RuntimeError(f"Monitor assets not found at {MONITOR_DIR}") +app.mount( + "/dashboard", + StaticFiles(directory=MONITOR_DIR, html=True), + name="monitor_ui", +) + @app.get("/") async def root(): @@ -221,6 +257,12 @@ def _safe_eval_config(expr: str) -> dict: # ── job router ────────────────────────────────────────────── app.include_router(init_job_router(redis, config, token_dep)) +# ── monitor router ────────────────────────────────────────── +from monitor_routes import router as monitor_router +app.include_router(monitor_router) + +logger = logging.getLogger(__name__) + # ──────────────────────── Endpoints ────────────────────────── @app.post("/token") async def get_token(req: TokenRequest): diff --git a/deploy/docker/static/monitor/index.html b/deploy/docker/static/monitor/index.html new file mode 100644 index 00000000..2beb9467 --- /dev/null +++ b/deploy/docker/static/monitor/index.html @@ -0,0 +1,813 @@ + + + + + + Crawl4AI Monitor + + + + + + + + +
+

+ 📊 Crawl4AI Monitor + + GitHub stars + +

+ +
+ +
+ + +
+ + + Playground +
+
+ + +
+ +
+

System Health

+ +
+ +
+
+ CPU + --% +
+
+
+
+
+ + +
+
+ Memory + --% +
+
+
+
+
+ + +
+
+ Network + -- +
+
0 MB / ⬇0 MB
+
+ + +
+
+ Uptime + -- +
+
Updated: never
+
+
+ + +
+
+
+ 🔥 Permanent: + INACTIVE (0MB) +
+
+ ♨️ Hot: + 0 (0MB) +
+
+ ❄️ Cold: + 0 (0MB) +
+
+
+ Janitor: adaptive | + Memory pressure: LOW +
+
+
+ + +
+
+ + + + +
+ +
+ +
+
+

Active Requests (0)

+ +
+ +
+
+
No active requests
+
+ +

Recent Completed

+
+
No completed requests
+
+
+
+ + + + + + + + + +
+
+ + +
+ +
+

Endpoint Analytics

+
+ + + + + + + + + + + + + +
EndpointCountAvg LatencySuccess%Pool%
No data
+
+
+ + +
+
+

Resource Timeline (5min)

+ +
+ + + + Loading... + +
+
+ + +
+

Control Actions

+
+ + + +
+
+
+
+ + + + diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html index 553e6765..510a6620 100644 --- a/deploy/docker/static/playground/index.html +++ b/deploy/docker/static/playground/index.html @@ -167,11 +167,14 @@ -
- - +
+ Monitor +
+ + +