Profiling/monitoring: Add interactive monitoring dashboard and integration tests for monitoring endpoints
- Implemented an interactive monitoring dashboard in `demo_monitoring_dashboard.py` for real-time statistics, profiling session management, and system resource monitoring. - Created a quick test script `test_monitoring_quick.py` to verify the functionality of monitoring endpoints. - Developed comprehensive integration tests in `test_monitoring_endpoints.py` covering health checks, statistics, profiling sessions, and real-time streaming. - Added error handling and user-friendly output for better usability in the dashboard.
This commit is contained in:
@@ -58,6 +58,9 @@ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|||||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||||
from crawl4ai.utils import perform_completion_with_backoff
|
from crawl4ai.utils import perform_completion_with_backoff
|
||||||
|
|
||||||
|
# Import monitoring/tracking functions
|
||||||
|
from routers.monitoring import track_crawl_start, track_crawl_end
|
||||||
|
|
||||||
# Import missing utility functions and types
|
# Import missing utility functions and types
|
||||||
try:
|
try:
|
||||||
from utils import (
|
from utils import (
|
||||||
@@ -665,6 +668,8 @@ async def stream_results(
|
|||||||
|
|
||||||
from utils import datetime_handler
|
from utils import datetime_handler
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for result in results_gen:
|
async for result in results_gen:
|
||||||
try:
|
try:
|
||||||
@@ -681,6 +686,14 @@ async def stream_results(
|
|||||||
if result_dict.get("pdf") is not None:
|
if result_dict.get("pdf") is not None:
|
||||||
result_dict["pdf"] = b64encode(result_dict["pdf"]).decode("utf-8")
|
result_dict["pdf"] = b64encode(result_dict["pdf"]).decode("utf-8")
|
||||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||||
|
|
||||||
|
# Track each streamed result for monitoring
|
||||||
|
duration_ms = int((time.time() - start_time) * 1000)
|
||||||
|
url = result_dict.get('url', 'unknown')
|
||||||
|
success = result_dict.get('success', False)
|
||||||
|
bytes_processed = len(str(result_dict.get("markdown", ""))) + len(str(result_dict.get("html", "")))
|
||||||
|
track_crawl_end(url, success, duration_ms, bytes_processed)
|
||||||
|
|
||||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||||
yield data.encode("utf-8")
|
yield data.encode("utf-8")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -721,6 +734,9 @@ async def handle_crawl_request(
|
|||||||
dispatcher = None,
|
dispatcher = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Handle non-streaming crawl requests with optional hooks."""
|
"""Handle non-streaming crawl requests with optional hooks."""
|
||||||
|
# Track crawl start for monitoring
|
||||||
|
track_crawl_start()
|
||||||
|
|
||||||
start_mem_mb = _get_memory_mb() # <--- Get memory before
|
start_mem_mb = _get_memory_mb() # <--- Get memory before
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
mem_delta_mb = None
|
mem_delta_mb = None
|
||||||
@@ -872,6 +888,15 @@ async def handle_crawl_request(
|
|||||||
"server_peak_memory_mb": peak_mem_mb,
|
"server_peak_memory_mb": peak_mem_mb,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Track successful crawl completion for monitoring
|
||||||
|
duration_ms = int((end_time - start_time) * 1000)
|
||||||
|
for result in processed_results:
|
||||||
|
url = result.get("url", "unknown")
|
||||||
|
success = result.get("success", False)
|
||||||
|
# Estimate bytes processed (rough approximation based on content length)
|
||||||
|
bytes_processed = len(str(result.get("markdown", ""))) + len(str(result.get("html", "")))
|
||||||
|
track_crawl_end(url, success, duration_ms, bytes_processed)
|
||||||
|
|
||||||
# Add hooks information if hooks were used
|
# Add hooks information if hooks were used
|
||||||
if hooks_config and hook_manager:
|
if hooks_config and hook_manager:
|
||||||
from hook_manager import UserHookManager
|
from hook_manager import UserHookManager
|
||||||
@@ -918,6 +943,11 @@ async def handle_crawl_request(
|
|||||||
if start_mem_mb is not None and end_mem_mb_error is not None:
|
if start_mem_mb is not None and end_mem_mb_error is not None:
|
||||||
mem_delta_mb = end_mem_mb_error - start_mem_mb
|
mem_delta_mb = end_mem_mb_error - start_mem_mb
|
||||||
|
|
||||||
|
# Track failed crawl for monitoring
|
||||||
|
duration_ms = int((time.time() - start_time) * 1000)
|
||||||
|
for url in urls:
|
||||||
|
track_crawl_end(url, success=False, duration_ms=duration_ms, bytes_processed=0)
|
||||||
|
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
detail=json.dumps(
|
detail=json.dumps(
|
||||||
@@ -947,6 +977,9 @@ async def handle_stream_crawl_request(
|
|||||||
dispatcher = None,
|
dispatcher = None,
|
||||||
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
||||||
"""Handle streaming crawl requests with optional hooks."""
|
"""Handle streaming crawl requests with optional hooks."""
|
||||||
|
# Track crawl start for monitoring
|
||||||
|
track_crawl_start()
|
||||||
|
|
||||||
hooks_info = None
|
hooks_info = None
|
||||||
try:
|
try:
|
||||||
browser_config = BrowserConfig.load(browser_config)
|
browser_config = BrowserConfig.load(browser_config)
|
||||||
|
|||||||
746
deploy/docker/routers/monitoring.py
Normal file
746
deploy/docker/routers/monitoring.py
Normal file
@@ -0,0 +1,746 @@
|
|||||||
|
"""
|
||||||
|
Monitoring and Profiling Router
|
||||||
|
|
||||||
|
Provides endpoints for:
|
||||||
|
- Browser performance profiling
|
||||||
|
- Real-time crawler statistics
|
||||||
|
- System resource monitoring
|
||||||
|
- Session management
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from typing import Dict, List, Optional, Any, AsyncGenerator
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import uuid
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter(
|
||||||
|
prefix="/monitoring",
|
||||||
|
tags=["Monitoring & Profiling"],
|
||||||
|
responses={
|
||||||
|
404: {"description": "Session not found"},
|
||||||
|
500: {"description": "Internal server error"}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Data Structures
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# In-memory storage for profiling sessions
|
||||||
|
PROFILING_SESSIONS: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
|
# Real-time crawler statistics
|
||||||
|
CRAWLER_STATS = {
|
||||||
|
"active_crawls": 0,
|
||||||
|
"total_crawls": 0,
|
||||||
|
"successful_crawls": 0,
|
||||||
|
"failed_crawls": 0,
|
||||||
|
"total_bytes_processed": 0,
|
||||||
|
"average_response_time_ms": 0.0,
|
||||||
|
"last_updated": datetime.now().isoformat(),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Per-URL statistics
|
||||||
|
URL_STATS: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
|
||||||
|
"total_requests": 0,
|
||||||
|
"success_count": 0,
|
||||||
|
"failure_count": 0,
|
||||||
|
"average_time_ms": 0.0,
|
||||||
|
"last_accessed": None,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Pydantic Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class ProfilingStartRequest(BaseModel):
    """Request to start a profiling session."""
    url: str = Field(..., description="URL to profile")
    browser_config: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Browser configuration"
    )
    crawler_config: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Crawler configuration"
    )
    # Hard cap (seconds) on the background profiling crawl; enforced via
    # asyncio.wait_for in run_profiling_session.
    profile_duration: Optional[int] = Field(
        default=30,
        ge=5,
        le=300,
        description="Maximum profiling duration in seconds"
    )
    collect_network: bool = Field(
        default=True,
        description="Collect network performance data"
    )
    collect_memory: bool = Field(
        default=True,
        description="Collect memory usage data"
    )
    collect_cpu: bool = Field(
        default=True,
        description="Collect CPU usage data"
    )

    class Config:
        # NOTE(review): `schema_extra` is the Pydantic v1 spelling; v2 renamed
        # it to `json_schema_extra` — confirm the project's pydantic version.
        schema_extra = {
            "example": {
                "url": "https://example.com",
                "profile_duration": 30,
                "collect_network": True,
                "collect_memory": True,
                "collect_cpu": True
            }
        }
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilingSession(BaseModel):
    """Profiling session information.

    Mirrors the dict records stored in PROFILING_SESSIONS; extra keys in
    those records (e.g. "config") are not part of this response model.
    """
    session_id: str = Field(..., description="Unique session identifier")
    status: str = Field(..., description="Session status: running, completed, failed")
    url: str = Field(..., description="URL being profiled")
    start_time: str = Field(..., description="Session start time (ISO format)")
    end_time: Optional[str] = Field(None, description="Session end time (ISO format)")
    duration_seconds: Optional[float] = Field(None, description="Total duration in seconds")
    results: Optional[Dict[str, Any]] = Field(None, description="Profiling results")
    error: Optional[str] = Field(None, description="Error message if failed")

    class Config:
        # NOTE(review): Pydantic v1 spelling; v2 uses `json_schema_extra`.
        schema_extra = {
            "example": {
                "session_id": "abc123",
                "status": "completed",
                "url": "https://example.com",
                "start_time": "2025-10-16T10:30:00",
                "end_time": "2025-10-16T10:30:30",
                "duration_seconds": 30.5,
                "results": {
                    "performance": {
                        "page_load_time_ms": 1234,
                        "dom_content_loaded_ms": 890,
                        "first_paint_ms": 567
                    }
                }
            }
        }
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlerStats(BaseModel):
    """Current crawler statistics.

    Snapshot combining the in-memory CRAWLER_STATS counters with live
    system metrics from get_system_stats().
    """
    active_crawls: int = Field(..., description="Number of currently active crawls")
    total_crawls: int = Field(..., description="Total crawls since server start")
    successful_crawls: int = Field(..., description="Number of successful crawls")
    failed_crawls: int = Field(..., description="Number of failed crawls")
    success_rate: float = Field(..., description="Success rate percentage")
    total_bytes_processed: int = Field(..., description="Total bytes processed")
    average_response_time_ms: float = Field(..., description="Average response time")
    # Currently always 0.0 — see the placeholder in get_crawler_stats.
    uptime_seconds: float = Field(..., description="Server uptime in seconds")
    memory_usage_mb: float = Field(..., description="Current memory usage in MB")
    cpu_percent: float = Field(..., description="Current CPU usage percentage")
    last_updated: str = Field(..., description="Last update timestamp")
|
||||||
|
|
||||||
|
|
||||||
|
class URLStatistics(BaseModel):
    """Statistics for a specific URL pattern.

    One entry per key of the URL_STATS mapping; populated by
    track_crawl_end and served by /monitoring/stats/urls.
    """
    url_pattern: str
    total_requests: int
    success_count: int
    failure_count: int
    success_rate: float
    average_time_ms: float
    last_accessed: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
class SessionListResponse(BaseModel):
    """List of profiling sessions returned by GET /monitoring/profile."""
    # Number of sessions matching the query (see list_profiling_sessions).
    total: int
    sessions: List[ProfilingSession]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Helper Functions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def get_system_stats() -> Dict[str, Any]:
    """Return a snapshot of this process's resource usage.

    Returns a dict with memory (MB), CPU percent, thread count, open-file
    count, and socket-connection count.  On any psutil failure a zeroed
    dict with the same keys is returned so callers never raise.
    """
    try:
        process = psutil.Process()

        # psutil >= 5.9 deprecates Process.connections() in favour of
        # net_connections(); fall back for older psutil versions.
        connections_fn = getattr(
            process, "net_connections", process.connections
        )

        return {
            "memory_usage_mb": process.memory_info().rss / 1024 / 1024,
            # NOTE: interval=0.1 blocks the caller for ~100 ms per call.
            "cpu_percent": process.cpu_percent(interval=0.1),
            "num_threads": process.num_threads(),
            "open_files": len(process.open_files()),
            "connections": len(connections_fn()),
        }
    except Exception as e:
        logger.error(f"Error getting system stats: {e}")
        return {
            "memory_usage_mb": 0.0,
            "cpu_percent": 0.0,
            "num_threads": 0,
            "open_files": 0,
            "connections": 0,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_old_sessions(max_age_hours: int = 24):
    """Drop profiling sessions older than the cutoff to prevent memory leaks.

    Sessions with a missing or unparsable ``start_time`` are left in place.
    Returns the number of sessions removed.
    """
    cutoff = datetime.now() - timedelta(hours=max_age_hours)

    stale_ids = []
    for sid, info in PROFILING_SESSIONS.items():
        try:
            started = datetime.fromisoformat(info["start_time"])
        except (ValueError, KeyError):
            continue
        if started < cutoff:
            stale_ids.append(sid)

    # Delete outside the iteration so the dict is never mutated mid-loop.
    for sid in stale_ids:
        del PROFILING_SESSIONS[sid]
        logger.info(f"Cleaned up old session: {sid}")

    return len(stale_ids)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Profiling Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.post(
    "/profile/start",
    response_model=ProfilingSession,
    summary="Start profiling session",
    description="Start a new browser profiling session for performance analysis"
)
async def start_profiling_session(
    request: ProfilingStartRequest,
    background_tasks: BackgroundTasks
):
    """
    Start a new profiling session.

    Returns a session ID that can be used to retrieve results later.
    The profiling runs in the background and collects:
    - Page load performance metrics
    - Network requests and timing
    - Memory usage patterns
    - CPU utilization
    - Browser-specific metrics
    """
    session_id = str(uuid.uuid4())
    start_time = datetime.now()

    # Seed the session record in the "running" state; the background task
    # fills in end_time / duration / results (or error) when it finishes.
    session_data = {
        "session_id": session_id,
        "status": "running",
        "url": request.url,
        "start_time": start_time.isoformat(),
        "end_time": None,
        "duration_seconds": None,
        "results": None,
        "error": None,
        # Echo of the requested knobs, kept for later inspection; this key
        # is not part of the ProfilingSession response model.
        "config": {
            "profile_duration": request.profile_duration,
            "collect_network": request.collect_network,
            "collect_memory": request.collect_memory,
            "collect_cpu": request.collect_cpu,
        }
    }

    PROFILING_SESSIONS[session_id] = session_data

    # Add background task to run profiling; it mutates the stored record in
    # place, so this immediate response reports status "running".
    background_tasks.add_task(
        run_profiling_session,
        session_id,
        request
    )

    logger.info(f"Started profiling session {session_id} for {request.url}")

    return ProfilingSession(**session_data)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/profile/{session_id}",
    response_model=ProfilingSession,
    summary="Get profiling results",
    description="Retrieve results from a profiling session"
)
async def get_profiling_results(session_id: str):
    """
    Get profiling session results.

    Returns the current status and results of a profiling session.
    If the session is still running, results will be None.
    """
    record = PROFILING_SESSIONS.get(session_id)
    if record is None:
        raise HTTPException(
            status_code=404,
            detail=f"Profiling session '{session_id}' not found"
        )
    return ProfilingSession(**record)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/profile",
    response_model=SessionListResponse,
    summary="List profiling sessions",
    description="List all profiling sessions with optional filtering"
)
async def list_profiling_sessions(
    status: Optional[str] = Query(None, description="Filter by status: running, completed, failed"),
    limit: int = Query(50, ge=1, le=500, description="Maximum number of sessions to return")
):
    """
    List all profiling sessions.

    Can be filtered by status and limited in number.  ``total`` reports
    the number of sessions matching the filter, which may exceed the
    number of sessions actually returned when ``limit`` truncates.
    """
    sessions = list(PROFILING_SESSIONS.values())

    # Filter by status if provided
    if status:
        sessions = [s for s in sessions if s["status"] == status]

    # Sort by start time (newest first)
    sessions.sort(key=lambda x: x["start_time"], reverse=True)

    # FIX: count matches BEFORE truncating — previously `total` was taken
    # after the slice, so it always equalled the page size and carried no
    # pagination information.
    total = len(sessions)

    # Limit results
    sessions = sessions[:limit]

    return SessionListResponse(
        total=total,
        sessions=[ProfilingSession(**s) for s in sessions]
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/profile/{session_id}",
    summary="Delete profiling session",
    description="Delete a profiling session and its results"
)
async def delete_profiling_session(session_id: str):
    """
    Delete a profiling session.

    Removes the session and all associated data from memory.
    """
    removed = PROFILING_SESSIONS.pop(session_id, None)
    if removed is None:
        raise HTTPException(
            status_code=404,
            detail=f"Profiling session '{session_id}' not found"
        )

    logger.info(f"Deleted profiling session {session_id}")

    return {
        "success": True,
        "message": f"Session {session_id} deleted",
        "session": ProfilingSession(**removed)
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/profile/cleanup",
    summary="Cleanup old sessions",
    description="Remove old profiling sessions to free memory"
)
async def cleanup_sessions(
    max_age_hours: int = Query(24, ge=1, le=168, description="Maximum age in hours")
):
    """
    Cleanup old profiling sessions.

    Removes sessions older than the specified age.
    """
    removed_count = cleanup_old_sessions(max_age_hours)

    return {
        "success": True,
        "removed_count": removed_count,
        "remaining_count": len(PROFILING_SESSIONS),
        "message": f"Removed {removed_count} sessions older than {max_age_hours} hours"
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Statistics Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get(
    "/stats",
    response_model=CrawlerStats,
    summary="Get crawler statistics",
    description="Get current crawler statistics and system metrics"
)
async def get_crawler_stats():
    """
    Get current crawler statistics.

    Returns real-time metrics about:
    - Active and total crawls
    - Success/failure rates
    - Response times
    - System resource usage
    """
    system = get_system_stats()

    finished = CRAWLER_STATS["successful_crawls"] + CRAWLER_STATS["failed_crawls"]
    if finished > 0:
        rate = CRAWLER_STATS["successful_crawls"] / finished * 100
    else:
        rate = 0.0

    # Uptime tracking is not wired up yet; a real implementation would
    # record the server start time at import.
    uptime_seconds = 0.0  # Placeholder

    return CrawlerStats(
        active_crawls=CRAWLER_STATS["active_crawls"],
        total_crawls=CRAWLER_STATS["total_crawls"],
        successful_crawls=CRAWLER_STATS["successful_crawls"],
        failed_crawls=CRAWLER_STATS["failed_crawls"],
        success_rate=rate,
        total_bytes_processed=CRAWLER_STATS["total_bytes_processed"],
        average_response_time_ms=CRAWLER_STATS["average_response_time_ms"],
        uptime_seconds=uptime_seconds,
        memory_usage_mb=system["memory_usage_mb"],
        cpu_percent=system["cpu_percent"],
        last_updated=datetime.now().isoformat()
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/stats/stream",
    summary="Stream crawler statistics",
    description="Server-Sent Events stream of real-time crawler statistics"
)
async def stream_crawler_stats(
    interval: int = Query(2, ge=1, le=60, description="Update interval in seconds")
):
    """
    Stream real-time crawler statistics.

    Returns an SSE (Server-Sent Events) stream that pushes
    statistics updates at the specified interval.

    Example:
    ```javascript
    const eventSource = new EventSource('/monitoring/stats/stream?interval=2');
    eventSource.onmessage = (event) => {
        const stats = JSON.parse(event.data);
        console.log('Stats:', stats);
    };
    ```
    """

    async def generate_stats() -> AsyncGenerator[str, None]:
        """Generate stats stream."""
        try:
            while True:
                # Get current stats (reuses the /stats handler directly).
                stats = await get_crawler_stats()

                # Format as SSE ("data: ...\n\n" framing).
                # NOTE(review): .dict() is the Pydantic v1 API; v2 renamed it
                # to .model_dump() — confirm the project's pydantic version.
                data = json.dumps(stats.dict())
                yield f"data: {data}\n\n"

                # Wait for next interval
                await asyncio.sleep(interval)

        except asyncio.CancelledError:
            # Client disconnected; exit quietly.
            logger.info("Stats stream cancelled by client")
        except Exception as e:
            logger.error(f"Error in stats stream: {e}")
            yield f"event: error\ndata: {json.dumps({'error': str(e)})}\n\n"

    return StreamingResponse(
        generate_stats(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Disable reverse-proxy (nginx) buffering so events flush promptly.
            "X-Accel-Buffering": "no",
        }
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/stats/urls",
    response_model=List[URLStatistics],
    summary="Get URL statistics",
    description="Get statistics for crawled URLs"
)
async def get_url_statistics(
    limit: int = Query(100, ge=1, le=1000, description="Maximum number of URLs to return"),
    sort_by: str = Query("total_requests", description="Sort field: total_requests, success_rate, average_time_ms")
):
    """
    Get statistics for crawled URLs.

    Returns metrics for each URL that has been crawled,
    including request counts, success rates, and timing.
    """
    entries = []
    for pattern, data in URL_STATS.items():
        requests = data["total_requests"]
        rate = (data["success_count"] / requests * 100) if requests > 0 else 0.0

        entries.append(URLStatistics(
            url_pattern=pattern,
            total_requests=data["total_requests"],
            success_count=data["success_count"],
            failure_count=data["failure_count"],
            success_rate=rate,
            average_time_ms=data["average_time_ms"],
            last_accessed=data["last_accessed"]
        ))

    # Fastest first when sorting by timing; largest first otherwise.
    if sort_by == "success_rate":
        entries.sort(key=lambda s: s.success_rate, reverse=True)
    elif sort_by == "average_time_ms":
        entries.sort(key=lambda s: s.average_time_ms)
    else:  # total_requests (default)
        entries.sort(key=lambda s: s.total_requests, reverse=True)

    return entries[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/stats/reset",
    summary="Reset statistics",
    description="Reset all crawler statistics to zero"
)
async def reset_statistics():
    """
    Reset all statistics.

    Clears all accumulated statistics but keeps the server running.
    Useful for testing or starting fresh measurements.
    """
    # FIX: mutate CRAWLER_STATS in place instead of rebinding the module
    # global.  Rebinding would leave any module that imported the dict
    # directly (`from routers.monitoring import CRAWLER_STATS`) holding a
    # stale object that no longer receives updates.
    CRAWLER_STATS.update({
        "active_crawls": 0,
        "total_crawls": 0,
        "successful_crawls": 0,
        "failed_crawls": 0,
        "total_bytes_processed": 0,
        "average_response_time_ms": 0.0,
        "last_updated": datetime.now().isoformat(),
    })

    URL_STATS.clear()

    logger.info("All statistics reset")

    return {
        "success": True,
        "message": "All statistics have been reset",
        "timestamp": datetime.now().isoformat()
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Background Tasks
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def run_profiling_session(session_id: str, request: ProfilingStartRequest):
    """
    Background task to run profiling session.

    This performs the actual profiling work:
    1. Creates a crawler with profiling enabled
    2. Crawls the target URL
    3. Collects performance metrics
    4. Stores results in the session

    Mutates PROFILING_SESSIONS[session_id] in place with either the
    completed results or a "failed" status and error message.
    """
    start_time = time.time()

    try:
        # Imported lazily so the router can load even if crawl4ai is absent.
        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
        from crawl4ai.browser_profiler import BrowserProfiler

        logger.info(f"Starting profiling for session {session_id}")

        # Create profiler
        profiler = BrowserProfiler()

        # Configure browser and crawler from the request's raw dicts.
        browser_config = BrowserConfig.load(request.browser_config)
        crawler_config = CrawlerRunConfig.load(request.crawler_config)

        # Enable profiling options
        browser_config.profiling_enabled = True

        results = {}

        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Start profiling
            profiler.start()

            # Collect system stats before
            stats_before = get_system_stats()

            # Crawl with timeout: profile_duration is the hard cap.
            try:
                result = await asyncio.wait_for(
                    crawler.arun(request.url, config=crawler_config),
                    timeout=request.profile_duration
                )

                crawl_success = result.success

            except asyncio.TimeoutError:
                # A timeout is recorded as an unsuccessful crawl, not a
                # failed session — profiler data is still collected below.
                logger.warning(f"Profiling session {session_id} timed out")
                crawl_success = False
                result = None

            # Stop profiling
            profiler_results = profiler.stop()

            # Collect system stats after
            stats_after = get_system_stats()

            # Build results
            results = {
                "crawl_success": crawl_success,
                "url": request.url,
                "performance": profiler_results if profiler_results else {},
                "system": {
                    "before": stats_before,
                    "after": stats_after,
                    # NOTE(review): cpu_percent delta compares two instantaneous
                    # samples and can legitimately be negative.
                    "delta": {
                        "memory_mb": stats_after["memory_usage_mb"] - stats_before["memory_usage_mb"],
                        "cpu_percent": stats_after["cpu_percent"] - stats_before["cpu_percent"],
                    }
                }
            }

            # Content metrics only exist when the crawl returned a result.
            if result:
                results["content"] = {
                    "markdown_length": len(result.markdown) if result.markdown else 0,
                    "html_length": len(result.html) if result.html else 0,
                    "links_count": len(result.links["internal"]) + len(result.links["external"]),
                    "media_count": len(result.media["images"]) + len(result.media["videos"]),
                }

        # Update session with results
        end_time = time.time()
        duration = end_time - start_time

        PROFILING_SESSIONS[session_id].update({
            "status": "completed",
            "end_time": datetime.now().isoformat(),
            "duration_seconds": duration,
            "results": results
        })

        logger.info(f"Profiling session {session_id} completed in {duration:.2f}s")

    except Exception as e:
        # Any failure (import error, crawler crash, profiler error) marks the
        # whole session as failed; the client polls /profile/{session_id}.
        logger.error(f"Profiling session {session_id} failed: {str(e)}")

        PROFILING_SESSIONS[session_id].update({
            "status": "failed",
            "end_time": datetime.now().isoformat(),
            "duration_seconds": time.time() - start_time,
            "error": str(e)
        })
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Middleware Integration Points
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def track_crawl_start():
    """Record the start of a crawl: bump the active and total counters."""
    CRAWLER_STATS["active_crawls"] += 1
    CRAWLER_STATS["total_crawls"] += 1
    CRAWLER_STATS["last_updated"] = datetime.now().isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def track_crawl_end(url: str, success: bool, duration_ms: float, bytes_processed: int = 0):
|
||||||
|
"""Call this when a crawl ends."""
|
||||||
|
CRAWLER_STATS["active_crawls"] = max(0, CRAWLER_STATS["active_crawls"] - 1)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
CRAWLER_STATS["successful_crawls"] += 1
|
||||||
|
else:
|
||||||
|
CRAWLER_STATS["failed_crawls"] += 1
|
||||||
|
|
||||||
|
CRAWLER_STATS["total_bytes_processed"] += bytes_processed
|
||||||
|
|
||||||
|
# Update average response time (running average)
|
||||||
|
total = CRAWLER_STATS["successful_crawls"] + CRAWLER_STATS["failed_crawls"]
|
||||||
|
current_avg = CRAWLER_STATS["average_response_time_ms"]
|
||||||
|
CRAWLER_STATS["average_response_time_ms"] = (
|
||||||
|
(current_avg * (total - 1) + duration_ms) / total
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update URL stats
|
||||||
|
url_stat = URL_STATS[url]
|
||||||
|
url_stat["total_requests"] += 1
|
||||||
|
|
||||||
|
if success:
|
||||||
|
url_stat["success_count"] += 1
|
||||||
|
else:
|
||||||
|
url_stat["failure_count"] += 1
|
||||||
|
|
||||||
|
# Update average time for this URL
|
||||||
|
total_url = url_stat["total_requests"]
|
||||||
|
current_avg_url = url_stat["average_time_ms"]
|
||||||
|
url_stat["average_time_ms"] = (
|
||||||
|
(current_avg_url * (total_url - 1) + duration_ms) / total_url
|
||||||
|
)
|
||||||
|
url_stat["last_accessed"] = datetime.now().isoformat()
|
||||||
|
|
||||||
|
CRAWLER_STATS["last_updated"] = datetime.now().isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Health Check
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/health",
|
||||||
|
summary="Health check",
|
||||||
|
description="Check if monitoring system is operational"
|
||||||
|
)
|
||||||
|
async def health_check():
|
||||||
|
"""
|
||||||
|
Health check endpoint.
|
||||||
|
|
||||||
|
Returns status of the monitoring system.
|
||||||
|
"""
|
||||||
|
system_stats = get_system_stats()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "healthy",
|
||||||
|
"timestamp": datetime.now().isoformat(),
|
||||||
|
"active_sessions": len([s for s in PROFILING_SESSIONS.values() if s["status"] == "running"]),
|
||||||
|
"total_sessions": len(PROFILING_SESSIONS),
|
||||||
|
"system": system_stats
|
||||||
|
}
|
||||||
@@ -87,7 +87,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from rank_bm25 import BM25Okapi
|
from rank_bm25 import BM25Okapi
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
from routers import adaptive, dispatchers, scripts
|
from routers import adaptive, dispatchers, scripts, monitoring
|
||||||
from schemas import (
|
from schemas import (
|
||||||
CrawlRequest,
|
CrawlRequest,
|
||||||
CrawlRequestWithHooks,
|
CrawlRequestWithHooks,
|
||||||
@@ -297,6 +297,7 @@ app.include_router(init_job_router(redis, config, token_dep))
|
|||||||
app.include_router(adaptive.router)
|
app.include_router(adaptive.router)
|
||||||
app.include_router(dispatchers.router)
|
app.include_router(dispatchers.router)
|
||||||
app.include_router(scripts.router)
|
app.include_router(scripts.router)
|
||||||
|
app.include_router(monitoring.router)
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────── Endpoints ──────────────────────────
|
# ──────────────────────── Endpoints ──────────────────────────
|
||||||
|
|||||||
@@ -48,6 +48,18 @@ Visit `http://localhost:11235/docs` for interactive Swagger UI documentation.
|
|||||||
- [POST /adaptive/crawl](#post-adaptivecrawl) - Adaptive crawl with auto-discovery
|
- [POST /adaptive/crawl](#post-adaptivecrawl) - Adaptive crawl with auto-discovery
|
||||||
- [GET /adaptive/status/{task_id}](#get-adaptivestatustask_id) - Check adaptive crawl status
|
- [GET /adaptive/status/{task_id}](#get-adaptivestatustask_id) - Check adaptive crawl status
|
||||||
|
|
||||||
|
### Monitoring & Profiling
|
||||||
|
- [GET /monitoring/health](#get-monitoringhealth) - Health check endpoint
|
||||||
|
- [GET /monitoring/stats](#get-monitoringstats) - Get current statistics
|
||||||
|
- [GET /monitoring/stats/stream](#get-monitoringsstatsstream) - Real-time statistics stream (SSE)
|
||||||
|
- [GET /monitoring/stats/urls](#get-monitoringstatssurls) - URL-specific statistics
|
||||||
|
- [POST /monitoring/stats/reset](#post-monitoringsstatsreset) - Reset statistics
|
||||||
|
- [POST /monitoring/profile/start](#post-monitoringprofilestart) - Start profiling session
|
||||||
|
- [GET /monitoring/profile/{session_id}](#get-monitoringprofilesession_id) - Get profiling results
|
||||||
|
- [GET /monitoring/profile](#get-monitoringprofile) - List profiling sessions
|
||||||
|
- [DELETE /monitoring/profile/{session_id}](#delete-monitoringprofilesession_id) - Delete session
|
||||||
|
- [POST /monitoring/profile/cleanup](#post-monitoringprofilecleanup) - Cleanup old sessions
|
||||||
|
|
||||||
### Utility Endpoints
|
### Utility Endpoints
|
||||||
- [POST /token](#post-token) - Get authentication token
|
- [POST /token](#post-token) - Get authentication token
|
||||||
- [GET /health](#get-health) - Health check
|
- [GET /health](#get-health) - Health check
|
||||||
@@ -1013,6 +1025,487 @@ Check status of adaptive crawl task.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Monitoring & Profiling
|
||||||
|
|
||||||
|
The monitoring endpoints provide real-time statistics, profiling capabilities, and health monitoring for your Crawl4AI instance.
|
||||||
|
|
||||||
|
### GET /monitoring/health
|
||||||
|
|
||||||
|
Health check endpoint for monitoring integration.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "healthy",
|
||||||
|
"uptime_seconds": 3600,
|
||||||
|
"timestamp": "2025-01-07T12:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.get("http://localhost:11235/monitoring/health")
|
||||||
|
health = response.json()
|
||||||
|
print(f"Status: {health['status']}")
|
||||||
|
print(f"Uptime: {health['uptime_seconds']}s")
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "cURL"
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/monitoring/health
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### GET /monitoring/stats
|
||||||
|
|
||||||
|
Get current crawler statistics and system metrics.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"active_crawls": 2,
|
||||||
|
"total_crawls": 150,
|
||||||
|
"successful_crawls": 142,
|
||||||
|
"failed_crawls": 8,
|
||||||
|
"success_rate": 94.67,
|
||||||
|
"avg_duration_ms": 1250.5,
|
||||||
|
"total_bytes_processed": 15728640,
|
||||||
|
"system_stats": {
|
||||||
|
"cpu_percent": 45.2,
|
||||||
|
"memory_percent": 62.8,
|
||||||
|
"memory_used_mb": 2048,
|
||||||
|
"memory_available_mb": 8192,
|
||||||
|
"disk_usage_percent": 55.3,
|
||||||
|
"active_processes": 127
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.get("http://localhost:11235/monitoring/stats")
|
||||||
|
stats = response.json()
|
||||||
|
|
||||||
|
print(f"Active crawls: {stats['active_crawls']}")
|
||||||
|
print(f"Success rate: {stats['success_rate']:.2f}%")
|
||||||
|
print(f"CPU usage: {stats['system_stats']['cpu_percent']:.1f}%")
|
||||||
|
print(f"Memory usage: {stats['system_stats']['memory_percent']:.1f}%")
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "cURL"
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/monitoring/stats
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### GET /monitoring/stats/stream
|
||||||
|
|
||||||
|
Server-Sent Events (SSE) stream of real-time statistics. Updates every 2 seconds.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```
|
||||||
|
data: {"active_crawls": 2, "total_crawls": 150, ...}
|
||||||
|
|
||||||
|
data: {"active_crawls": 3, "total_crawls": 151, ...}
|
||||||
|
|
||||||
|
data: {"active_crawls": 2, "total_crawls": 151, ...}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Stream real-time stats
|
||||||
|
response = requests.get(
|
||||||
|
"http://localhost:11235/monitoring/stats/stream",
|
||||||
|
stream=True
|
||||||
|
)
|
||||||
|
|
||||||
|
for line in response.iter_lines():
|
||||||
|
if line.startswith(b"data: "):
|
||||||
|
data = json.loads(line[6:]) # Remove "data: " prefix
|
||||||
|
print(f"Active: {data['active_crawls']}, "
|
||||||
|
f"Total: {data['total_crawls']}, "
|
||||||
|
f"CPU: {data['system_stats']['cpu_percent']:.1f}%")
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "JavaScript"
|
||||||
|
```javascript
|
||||||
|
const eventSource = new EventSource('http://localhost:11235/monitoring/stats/stream');
|
||||||
|
|
||||||
|
eventSource.onmessage = (event) => {
|
||||||
|
const stats = JSON.parse(event.data);
|
||||||
|
console.log('Active crawls:', stats.active_crawls);
|
||||||
|
console.log('CPU:', stats.system_stats.cpu_percent);
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### GET /monitoring/stats/urls
|
||||||
|
|
||||||
|
Get URL-specific statistics showing per-URL performance metrics.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"total_requests": 45,
|
||||||
|
"successful_requests": 42,
|
||||||
|
"failed_requests": 3,
|
||||||
|
"avg_duration_ms": 850.3,
|
||||||
|
"total_bytes_processed": 2621440,
|
||||||
|
"last_request_time": "2025-01-07T12:00:00Z"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://python.org",
|
||||||
|
"total_requests": 32,
|
||||||
|
"successful_requests": 32,
|
||||||
|
"failed_requests": 0,
|
||||||
|
"avg_duration_ms": 1120.7,
|
||||||
|
"total_bytes_processed": 1835008,
|
||||||
|
"last_request_time": "2025-01-07T11:55:00Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.get("http://localhost:11235/monitoring/stats/urls")
|
||||||
|
url_stats = response.json()
|
||||||
|
|
||||||
|
for stat in url_stats:
|
||||||
|
success_rate = (stat['successful_requests'] / stat['total_requests']) * 100
|
||||||
|
print(f"\nURL: {stat['url']}")
|
||||||
|
print(f" Requests: {stat['total_requests']}")
|
||||||
|
print(f" Success rate: {success_rate:.1f}%")
|
||||||
|
print(f" Avg time: {stat['avg_duration_ms']:.1f}ms")
|
||||||
|
print(f" Data processed: {stat['total_bytes_processed'] / 1024:.1f}KB")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### POST /monitoring/stats/reset
|
||||||
|
|
||||||
|
Reset all statistics counters. Useful for testing or starting fresh monitoring sessions.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "reset",
|
||||||
|
"previous_stats": {
|
||||||
|
"total_crawls": 150,
|
||||||
|
"successful_crawls": 142,
|
||||||
|
"failed_crawls": 8
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/monitoring/stats/reset")
|
||||||
|
result = response.json()
|
||||||
|
print(f"Stats reset. Previous total: {result['previous_stats']['total_crawls']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "cURL"
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/monitoring/stats/reset
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### POST /monitoring/profile/start
|
||||||
|
|
||||||
|
Start a profiling session to monitor crawler performance over time.
|
||||||
|
|
||||||
|
#### Request
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"urls": [
|
||||||
|
"https://example.com",
|
||||||
|
"https://python.org"
|
||||||
|
],
|
||||||
|
"duration_seconds": 60,
|
||||||
|
"browser_config": {
|
||||||
|
"headless": true
|
||||||
|
},
|
||||||
|
"crawler_config": {
|
||||||
|
"word_count_threshold": 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"session_id": "prof_abc123xyz",
|
||||||
|
"status": "running",
|
||||||
|
"started_at": "2025-01-07T12:00:00Z",
|
||||||
|
"urls": [
|
||||||
|
"https://example.com",
|
||||||
|
"https://python.org"
|
||||||
|
],
|
||||||
|
"duration_seconds": 60
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
# Start a profiling session
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/monitoring/profile/start",
|
||||||
|
json={
|
||||||
|
"urls": ["https://example.com", "https://python.org"],
|
||||||
|
"duration_seconds": 60,
|
||||||
|
"crawler_config": {
|
||||||
|
"word_count_threshold": 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
session = response.json()
|
||||||
|
session_id = session["session_id"]
|
||||||
|
print(f"Profiling session started: {session_id}")
|
||||||
|
print(f"Status: {session['status']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### GET /monitoring/profile/{session_id}
|
||||||
|
|
||||||
|
Get profiling session details and results.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"session_id": "prof_abc123xyz",
|
||||||
|
"status": "completed",
|
||||||
|
"started_at": "2025-01-07T12:00:00Z",
|
||||||
|
"completed_at": "2025-01-07T12:01:00Z",
|
||||||
|
"duration_seconds": 60,
|
||||||
|
"urls": ["https://example.com", "https://python.org"],
|
||||||
|
"results": {
|
||||||
|
"total_requests": 120,
|
||||||
|
"successful_requests": 115,
|
||||||
|
"failed_requests": 5,
|
||||||
|
"avg_response_time_ms": 950.3,
|
||||||
|
"system_metrics": {
|
||||||
|
"avg_cpu_percent": 48.5,
|
||||||
|
"peak_cpu_percent": 72.3,
|
||||||
|
"avg_memory_percent": 55.2,
|
||||||
|
"peak_memory_percent": 68.9,
|
||||||
|
"total_bytes_processed": 5242880
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Start session
|
||||||
|
start_response = requests.post(
|
||||||
|
"http://localhost:11235/monitoring/profile/start",
|
||||||
|
json={
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"duration_seconds": 30
|
||||||
|
}
|
||||||
|
)
|
||||||
|
session_id = start_response.json()["session_id"]
|
||||||
|
|
||||||
|
# Wait for completion
|
||||||
|
time.sleep(32)
|
||||||
|
|
||||||
|
# Get results
|
||||||
|
result_response = requests.get(
|
||||||
|
f"http://localhost:11235/monitoring/profile/{session_id}"
|
||||||
|
)
|
||||||
|
session = result_response.json()
|
||||||
|
|
||||||
|
print(f"Session: {session_id}")
|
||||||
|
print(f"Status: {session['status']}")
|
||||||
|
|
||||||
|
if session['status'] == 'completed':
|
||||||
|
results = session['results']
|
||||||
|
print(f"\nResults:")
|
||||||
|
print(f" Total requests: {results['total_requests']}")
|
||||||
|
print(f" Success rate: {results['successful_requests'] / results['total_requests'] * 100:.1f}%")
|
||||||
|
print(f" Avg response time: {results['avg_response_time_ms']:.1f}ms")
|
||||||
|
print(f"\nSystem Metrics:")
|
||||||
|
print(f" Avg CPU: {results['system_metrics']['avg_cpu_percent']:.1f}%")
|
||||||
|
print(f" Peak CPU: {results['system_metrics']['peak_cpu_percent']:.1f}%")
|
||||||
|
print(f" Avg Memory: {results['system_metrics']['avg_memory_percent']:.1f}%")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### GET /monitoring/profile
|
||||||
|
|
||||||
|
List all profiling sessions.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sessions": [
|
||||||
|
{
|
||||||
|
"session_id": "prof_abc123xyz",
|
||||||
|
"status": "completed",
|
||||||
|
"started_at": "2025-01-07T12:00:00Z",
|
||||||
|
"completed_at": "2025-01-07T12:01:00Z",
|
||||||
|
"duration_seconds": 60,
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"session_id": "prof_def456uvw",
|
||||||
|
"status": "running",
|
||||||
|
"started_at": "2025-01-07T12:05:00Z",
|
||||||
|
"duration_seconds": 120,
|
||||||
|
"urls": ["https://python.org", "https://github.com"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.get("http://localhost:11235/monitoring/profile")
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
print(f"Total sessions: {len(data['sessions'])}")
|
||||||
|
|
||||||
|
for session in data['sessions']:
|
||||||
|
print(f"\n{session['session_id']}")
|
||||||
|
print(f" Status: {session['status']}")
|
||||||
|
print(f" URLs: {', '.join(session['urls'])}")
|
||||||
|
print(f" Duration: {session['duration_seconds']}s")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### DELETE /monitoring/profile/{session_id}
|
||||||
|
|
||||||
|
Delete a profiling session.
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "deleted",
|
||||||
|
"session_id": "prof_abc123xyz"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
response = requests.delete(
|
||||||
|
f"http://localhost:11235/monitoring/profile/{session_id}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
print(f"Session {session_id} deleted")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### POST /monitoring/profile/cleanup
|
||||||
|
|
||||||
|
Clean up old profiling sessions.
|
||||||
|
|
||||||
|
#### Request
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"max_age_seconds": 3600
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"deleted_count": 5,
|
||||||
|
"remaining_count": 3
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
```python
|
||||||
|
# Delete sessions older than 1 hour
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/monitoring/profile/cleanup",
|
||||||
|
json={"max_age_seconds": 3600}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
print(f"Deleted {result['deleted_count']} old sessions")
|
||||||
|
print(f"Remaining: {result['remaining_count']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Monitoring Dashboard Demo
|
||||||
|
|
||||||
|
We provide an interactive terminal-based dashboard for monitoring. Run it with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python tests/docker/extended_features/demo_monitoring_dashboard.py --url http://localhost:11235
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Real-time statistics with auto-refresh
|
||||||
|
- System resource monitoring (CPU, Memory, Disk)
|
||||||
|
- URL-specific performance metrics
|
||||||
|
- Profiling session management
|
||||||
|
- Interactive commands (view, create, delete sessions)
|
||||||
|
- Color-coded status indicators
|
||||||
|
|
||||||
|
**Dashboard Commands:**
|
||||||
|
- `[D]` - Dashboard view (default)
|
||||||
|
- `[S]` - Profiling sessions view
|
||||||
|
- `[U]` - URL statistics view
|
||||||
|
- `[R]` - Reset statistics
|
||||||
|
- `[N]` - Create new profiling session (from sessions view)
|
||||||
|
- `[V]` - View session details (from sessions view)
|
||||||
|
- `[X]` - Delete session (from sessions view)
|
||||||
|
- `[Q]` - Quit
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Utility Endpoints
|
## Utility Endpoints
|
||||||
|
|
||||||
### POST /token
|
### POST /token
|
||||||
|
|||||||
479
tests/docker/extended_features/demo_monitoring_dashboard.py
Normal file
479
tests/docker/extended_features/demo_monitoring_dashboard.py
Normal file
@@ -0,0 +1,479 @@
|
|||||||
|
"""
|
||||||
|
Interactive Monitoring Dashboard Demo
|
||||||
|
|
||||||
|
This demo showcases the monitoring and profiling capabilities of Crawl4AI's Docker server.
|
||||||
|
It provides:
|
||||||
|
- Real-time statistics dashboard with auto-refresh
|
||||||
|
- Profiling session management
|
||||||
|
- System resource monitoring
|
||||||
|
- URL-specific statistics
|
||||||
|
- Interactive terminal UI
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python demo_monitoring_dashboard.py [--url BASE_URL]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
class Colors:
|
||||||
|
"""ANSI color codes for terminal output."""
|
||||||
|
HEADER = '\033[95m'
|
||||||
|
OKBLUE = '\033[94m'
|
||||||
|
OKCYAN = '\033[96m'
|
||||||
|
OKGREEN = '\033[92m'
|
||||||
|
WARNING = '\033[93m'
|
||||||
|
FAIL = '\033[91m'
|
||||||
|
ENDC = '\033[0m'
|
||||||
|
BOLD = '\033[1m'
|
||||||
|
UNDERLINE = '\033[4m'
|
||||||
|
|
||||||
|
|
||||||
|
class MonitoringDashboard:
|
||||||
|
"""Interactive monitoring dashboard for Crawl4AI."""
|
||||||
|
|
||||||
|
def __init__(self, base_url: str = "http://localhost:11234"):
|
||||||
|
self.base_url = base_url
|
||||||
|
self.client = httpx.AsyncClient(base_url=base_url, timeout=60.0)
|
||||||
|
self.running = True
|
||||||
|
self.current_view = "dashboard" # dashboard, sessions, urls
|
||||||
|
self.profiling_sessions: List[Dict] = []
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
"""Close the HTTP client."""
|
||||||
|
await self.client.aclose()
|
||||||
|
|
||||||
|
def clear_screen(self):
|
||||||
|
"""Clear the terminal screen."""
|
||||||
|
print("\033[2J\033[H", end="")
|
||||||
|
|
||||||
|
def print_header(self, title: str):
|
||||||
|
"""Print a formatted header."""
|
||||||
|
width = 80
|
||||||
|
print(f"\n{Colors.HEADER}{Colors.BOLD}")
|
||||||
|
print("=" * width)
|
||||||
|
print(f"{title.center(width)}")
|
||||||
|
print("=" * width)
|
||||||
|
print(f"{Colors.ENDC}")
|
||||||
|
|
||||||
|
def print_section(self, title: str):
|
||||||
|
"""Print a section header."""
|
||||||
|
print(f"\n{Colors.OKBLUE}{Colors.BOLD}▶ {title}{Colors.ENDC}")
|
||||||
|
print("-" * 80)
|
||||||
|
|
||||||
|
async def check_health(self) -> Dict:
|
||||||
|
"""Check server health."""
|
||||||
|
try:
|
||||||
|
response = await self.client.get("/monitoring/health")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"status": "error", "error": str(e)}
|
||||||
|
|
||||||
|
async def get_stats(self) -> Dict:
|
||||||
|
"""Get current statistics."""
|
||||||
|
try:
|
||||||
|
response = await self.client.get("/monitoring/stats")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def get_url_stats(self) -> List[Dict]:
|
||||||
|
"""Get URL-specific statistics."""
|
||||||
|
try:
|
||||||
|
response = await self.client.get("/monitoring/stats/urls")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return []
|
||||||
|
|
||||||
|
async def list_profiling_sessions(self) -> List[Dict]:
|
||||||
|
"""List all profiling sessions."""
|
||||||
|
try:
|
||||||
|
response = await self.client.get("/monitoring/profile")
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
return data.get("sessions", [])
|
||||||
|
except Exception as e:
|
||||||
|
return []
|
||||||
|
|
||||||
|
async def start_profiling_session(self, urls: List[str], duration: int = 30) -> Dict:
|
||||||
|
"""Start a new profiling session."""
|
||||||
|
try:
|
||||||
|
request_data = {
|
||||||
|
"urls": urls,
|
||||||
|
"duration_seconds": duration,
|
||||||
|
"crawler_config": {
|
||||||
|
"word_count_threshold": 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = await self.client.post("/monitoring/profile/start", json=request_data)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def get_profiling_session(self, session_id: str) -> Dict:
|
||||||
|
"""Get profiling session details."""
|
||||||
|
try:
|
||||||
|
response = await self.client.get(f"/monitoring/profile/{session_id}")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def delete_profiling_session(self, session_id: str) -> Dict:
|
||||||
|
"""Delete a profiling session."""
|
||||||
|
try:
|
||||||
|
response = await self.client.delete(f"/monitoring/profile/{session_id}")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
async def reset_stats(self) -> Dict:
|
||||||
|
"""Reset all statistics."""
|
||||||
|
try:
|
||||||
|
response = await self.client.post("/monitoring/stats/reset")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
|
||||||
|
def display_dashboard(self, stats: Dict):
|
||||||
|
"""Display the main statistics dashboard."""
|
||||||
|
self.clear_screen()
|
||||||
|
self.print_header("Crawl4AI Monitoring Dashboard")
|
||||||
|
|
||||||
|
# Health Status
|
||||||
|
print(f"\n{Colors.OKGREEN}● Server Status: ONLINE{Colors.ENDC}")
|
||||||
|
print(f"Base URL: {self.base_url}")
|
||||||
|
print(f"Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
|
||||||
|
# Crawler Statistics
|
||||||
|
self.print_section("Crawler Statistics")
|
||||||
|
if "error" in stats:
|
||||||
|
print(f"{Colors.FAIL}Error fetching stats: {stats['error']}{Colors.ENDC}")
|
||||||
|
else:
|
||||||
|
print(f"Active Crawls: {Colors.BOLD}{stats.get('active_crawls', 0)}{Colors.ENDC}")
|
||||||
|
print(f"Total Crawls: {stats.get('total_crawls', 0)}")
|
||||||
|
print(f"Successful: {Colors.OKGREEN}{stats.get('successful_crawls', 0)}{Colors.ENDC}")
|
||||||
|
print(f"Failed: {Colors.FAIL}{stats.get('failed_crawls', 0)}{Colors.ENDC}")
|
||||||
|
print(f"Success Rate: {stats.get('success_rate', 0):.2f}%")
|
||||||
|
print(f"Avg Duration: {stats.get('avg_duration_ms', 0):.2f} ms")
|
||||||
|
|
||||||
|
# Format bytes
|
||||||
|
total_bytes = stats.get('total_bytes_processed', 0)
|
||||||
|
if total_bytes > 1024 * 1024:
|
||||||
|
bytes_str = f"{total_bytes / (1024 * 1024):.2f} MB"
|
||||||
|
elif total_bytes > 1024:
|
||||||
|
bytes_str = f"{total_bytes / 1024:.2f} KB"
|
||||||
|
else:
|
||||||
|
bytes_str = f"{total_bytes} bytes"
|
||||||
|
print(f"Total Data Processed: {bytes_str}")
|
||||||
|
|
||||||
|
# System Statistics
|
||||||
|
if "system_stats" in stats:
|
||||||
|
self.print_section("System Resources")
|
||||||
|
sys_stats = stats["system_stats"]
|
||||||
|
|
||||||
|
cpu = sys_stats.get("cpu_percent", 0)
|
||||||
|
cpu_color = Colors.OKGREEN if cpu < 50 else Colors.WARNING if cpu < 80 else Colors.FAIL
|
||||||
|
print(f"CPU Usage: {cpu_color}{cpu:.1f}%{Colors.ENDC}")
|
||||||
|
|
||||||
|
mem = sys_stats.get("memory_percent", 0)
|
||||||
|
mem_color = Colors.OKGREEN if mem < 50 else Colors.WARNING if mem < 80 else Colors.FAIL
|
||||||
|
print(f"Memory Usage: {mem_color}{mem:.1f}%{Colors.ENDC}")
|
||||||
|
|
||||||
|
mem_used = sys_stats.get("memory_used_mb", 0)
|
||||||
|
mem_available = sys_stats.get("memory_available_mb", 0)
|
||||||
|
print(f"Memory Used: {mem_used:.0f} MB / {mem_available:.0f} MB")
|
||||||
|
|
||||||
|
disk = sys_stats.get("disk_usage_percent", 0)
|
||||||
|
disk_color = Colors.OKGREEN if disk < 70 else Colors.WARNING if disk < 90 else Colors.FAIL
|
||||||
|
print(f"Disk Usage: {disk_color}{disk:.1f}%{Colors.ENDC}")
|
||||||
|
|
||||||
|
print(f"Active Processes: {sys_stats.get('active_processes', 0)}")
|
||||||
|
|
||||||
|
# Navigation
|
||||||
|
self.print_section("Navigation")
|
||||||
|
print(f"[D] Dashboard [S] Profiling Sessions [U] URL Stats [R] Reset Stats [Q] Quit")
|
||||||
|
|
||||||
|
def display_url_stats(self, url_stats: List[Dict]):
|
||||||
|
"""Display URL-specific statistics."""
|
||||||
|
self.clear_screen()
|
||||||
|
self.print_header("URL Statistics")
|
||||||
|
|
||||||
|
if not url_stats:
|
||||||
|
print(f"\n{Colors.WARNING}No URL statistics available yet.{Colors.ENDC}")
|
||||||
|
else:
|
||||||
|
print(f"\nTotal URLs tracked: {len(url_stats)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Table header
|
||||||
|
print(f"{Colors.BOLD}{'URL':<50} {'Requests':<10} {'Success':<10} {'Avg Time':<12} {'Data':<12}{Colors.ENDC}")
|
||||||
|
print("-" * 94)
|
||||||
|
|
||||||
|
# Sort by total requests
|
||||||
|
sorted_stats = sorted(url_stats, key=lambda x: x.get('total_requests', 0), reverse=True)
|
||||||
|
|
||||||
|
for stat in sorted_stats[:20]: # Show top 20
|
||||||
|
url = stat.get('url', 'unknown')
|
||||||
|
if len(url) > 47:
|
||||||
|
url = url[:44] + "..."
|
||||||
|
|
||||||
|
total = stat.get('total_requests', 0)
|
||||||
|
success = stat.get('successful_requests', 0)
|
||||||
|
success_pct = f"{(success/total*100):.0f}%" if total > 0 else "N/A"
|
||||||
|
|
||||||
|
avg_time = stat.get('avg_duration_ms', 0)
|
||||||
|
time_str = f"{avg_time:.0f} ms"
|
||||||
|
|
||||||
|
bytes_processed = stat.get('total_bytes_processed', 0)
|
||||||
|
if bytes_processed > 1024 * 1024:
|
||||||
|
data_str = f"{bytes_processed / (1024 * 1024):.2f} MB"
|
||||||
|
elif bytes_processed > 1024:
|
||||||
|
data_str = f"{bytes_processed / 1024:.2f} KB"
|
||||||
|
else:
|
||||||
|
data_str = f"{bytes_processed} B"
|
||||||
|
|
||||||
|
print(f"{url:<50} {total:<10} {success_pct:<10} {time_str:<12} {data_str:<12}")
|
||||||
|
|
||||||
|
# Navigation
|
||||||
|
self.print_section("Navigation")
|
||||||
|
print(f"[D] Dashboard [S] Profiling Sessions [U] URL Stats [R] Reset Stats [Q] Quit")
|
||||||
|
|
||||||
|
def display_profiling_sessions(self, sessions: List[Dict]):
|
||||||
|
"""Display profiling sessions."""
|
||||||
|
self.clear_screen()
|
||||||
|
self.print_header("Profiling Sessions")
|
||||||
|
|
||||||
|
if not sessions:
|
||||||
|
print(f"\n{Colors.WARNING}No profiling sessions found.{Colors.ENDC}")
|
||||||
|
else:
|
||||||
|
print(f"\nTotal sessions: {len(sessions)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Table header
|
||||||
|
print(f"{Colors.BOLD}{'ID':<25} {'Status':<12} {'URLs':<6} {'Duration':<12} {'Started':<20}{Colors.ENDC}")
|
||||||
|
print("-" * 85)
|
||||||
|
|
||||||
|
# Sort by started time (newest first)
|
||||||
|
sorted_sessions = sorted(sessions, key=lambda x: x.get('started_at', ''), reverse=True)
|
||||||
|
|
||||||
|
for session in sorted_sessions[:15]: # Show top 15
|
||||||
|
session_id = session.get('session_id', 'unknown')
|
||||||
|
if len(session_id) > 22:
|
||||||
|
session_id = session_id[:19] + "..."
|
||||||
|
|
||||||
|
status = session.get('status', 'unknown')
|
||||||
|
status_color = Colors.OKGREEN if status == 'completed' else Colors.WARNING if status == 'running' else Colors.FAIL
|
||||||
|
|
||||||
|
url_count = len(session.get('urls', []))
|
||||||
|
|
||||||
|
duration = session.get('duration_seconds', 0)
|
||||||
|
duration_str = f"{duration}s" if duration else "N/A"
|
||||||
|
|
||||||
|
started = session.get('started_at', 'N/A')
|
||||||
|
if started != 'N/A':
|
||||||
|
try:
|
||||||
|
dt = datetime.fromisoformat(started.replace('Z', '+00:00'))
|
||||||
|
started = dt.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
print(f"{session_id:<25} {status_color}{status:<12}{Colors.ENDC} {url_count:<6} {duration_str:<12} {started:<20}")
|
||||||
|
|
||||||
|
# Navigation
|
||||||
|
self.print_section("Navigation & Actions")
|
||||||
|
print(f"[D] Dashboard [S] Profiling Sessions [U] URL Stats")
|
||||||
|
print(f"[N] New Session [V] View Session [X] Delete Session")
|
||||||
|
print(f"[R] Reset Stats [Q] Quit")
|
||||||
|
|
||||||
|
    async def interactive_session_view(self, session_id: str):
        """Display detailed view of a profiling session.

        Fetches the session from the monitoring API and prints its status,
        URLs, timing, and — when present — the profiling results and system
        metrics. Blocks on ``input()`` until the user presses Enter.

        Args:
            session_id: Identifier of the profiling session to display.
        """
        session = await self.get_profiling_session(session_id)

        self.clear_screen()
        self.print_header(f"Profiling Session: {session_id}")

        if "error" in session:
            # API-level failure (e.g. unknown session id) is shown in-place.
            print(f"\n{Colors.FAIL}Error: {session['error']}{Colors.ENDC}")
        else:
            print(f"\n{Colors.BOLD}Session ID:{Colors.ENDC} {session.get('session_id', 'N/A')}")

            status = session.get('status', 'unknown')
            # Green for completed; amber for every other state (running, etc.).
            status_color = Colors.OKGREEN if status == 'completed' else Colors.WARNING
            print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_color}{status}{Colors.ENDC}")

            print(f"{Colors.BOLD}URLs:{Colors.ENDC}")
            for url in session.get('urls', []):
                print(f"  - {url}")

            started = session.get('started_at', 'N/A')
            print(f"{Colors.BOLD}Started:{Colors.ENDC} {started}")

            if 'completed_at' in session:
                print(f"{Colors.BOLD}Completed:{Colors.ENDC} {session['completed_at']}")

            # Results are only present once the session has finished.
            if 'results' in session:
                self.print_section("Profiling Results")
                results = session['results']

                print(f"Total Requests: {results.get('total_requests', 0)}")
                print(f"Successful: {Colors.OKGREEN}{results.get('successful_requests', 0)}{Colors.ENDC}")
                print(f"Failed: {Colors.FAIL}{results.get('failed_requests', 0)}{Colors.ENDC}")
                print(f"Avg Response Time: {results.get('avg_response_time_ms', 0):.2f} ms")

                if 'system_metrics' in results:
                    self.print_section("System Metrics During Profiling")
                    metrics = results['system_metrics']
                    print(f"Avg CPU: {metrics.get('avg_cpu_percent', 0):.1f}%")
                    print(f"Peak CPU: {metrics.get('peak_cpu_percent', 0):.1f}%")
                    print(f"Avg Memory: {metrics.get('avg_memory_percent', 0):.1f}%")
                    print(f"Peak Memory: {metrics.get('peak_memory_percent', 0):.1f}%")

        print(f"\n{Colors.OKCYAN}Press any key to return...{Colors.ENDC}")
        input()
|
||||||
|
|
||||||
|
async def create_new_session(self):
|
||||||
|
"""Interactive session creation."""
|
||||||
|
self.clear_screen()
|
||||||
|
self.print_header("Create New Profiling Session")
|
||||||
|
|
||||||
|
print(f"\n{Colors.BOLD}Enter URLs to profile (one per line, empty line to finish):{Colors.ENDC}")
|
||||||
|
urls = []
|
||||||
|
while True:
|
||||||
|
url = input(f"{Colors.OKCYAN}URL {len(urls) + 1}:{Colors.ENDC} ").strip()
|
||||||
|
if not url:
|
||||||
|
break
|
||||||
|
urls.append(url)
|
||||||
|
|
||||||
|
if not urls:
|
||||||
|
print(f"{Colors.FAIL}No URLs provided. Cancelled.{Colors.ENDC}")
|
||||||
|
time.sleep(2)
|
||||||
|
return
|
||||||
|
|
||||||
|
duration = input(f"{Colors.OKCYAN}Duration (seconds, default 30):{Colors.ENDC} ").strip()
|
||||||
|
try:
|
||||||
|
duration = int(duration) if duration else 30
|
||||||
|
except:
|
||||||
|
duration = 30
|
||||||
|
|
||||||
|
print(f"\n{Colors.WARNING}Starting profiling session for {len(urls)} URL(s), {duration}s...{Colors.ENDC}")
|
||||||
|
result = await self.start_profiling_session(urls, duration)
|
||||||
|
|
||||||
|
if "error" in result:
|
||||||
|
print(f"{Colors.FAIL}Error: {result['error']}{Colors.ENDC}")
|
||||||
|
else:
|
||||||
|
print(f"{Colors.OKGREEN}✓ Session started successfully!{Colors.ENDC}")
|
||||||
|
print(f"Session ID: {result.get('session_id', 'N/A')}")
|
||||||
|
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
    async def run_dashboard(self):
        """Run the interactive dashboard.

        Connects to the server, then loops: render the currently selected
        view, wait up to 5s for a keyboard command, and dispatch it. Exits
        when the user enters 'q' or the initial health check fails.
        """
        print(f"{Colors.OKGREEN}Starting Crawl4AI Monitoring Dashboard...{Colors.ENDC}")
        print(f"Connecting to {self.base_url}...")

        # Check health before entering the loop; bail out early if the
        # server is unreachable or reports anything but "healthy".
        health = await self.check_health()
        if health.get("status") != "healthy":
            print(f"{Colors.FAIL}Error: Server not responding or unhealthy{Colors.ENDC}")
            print(f"Health check result: {health}")
            return

        print(f"{Colors.OKGREEN}✓ Connected successfully!{Colors.ENDC}")
        time.sleep(1)

        # Main loop
        while self.running:
            # Render whichever view is currently selected.
            if self.current_view == "dashboard":
                stats = await self.get_stats()
                self.display_dashboard(stats)
            elif self.current_view == "urls":
                url_stats = await self.get_url_stats()
                self.display_url_stats(url_stats)
            elif self.current_view == "sessions":
                sessions = await self.list_profiling_sessions()
                self.display_profiling_sessions(sessions)

            # Get user input (non-blocking with timeout)
            print(f"\n{Colors.OKCYAN}Enter command (or wait 5s for auto-refresh):{Colors.ENDC} ", end="", flush=True)

            try:
                # Simple input with timeout simulation
                import select
                if sys.platform != 'win32':
                    # POSIX: select() on stdin gives a 5s timeout so the view
                    # auto-refreshes when the user types nothing.
                    i, _, _ = select.select([sys.stdin], [], [], 5.0)
                    if i:
                        command = sys.stdin.readline().strip().lower()
                    else:
                        command = ""
                else:
                    # Windows doesn't support select on stdin
                    # NOTE(review): input() blocks indefinitely here, so there
                    # is no auto-refresh on Windows; the result is also not
                    # strip()/lower()ed as in the POSIX branch — confirm.
                    command = input()
            except:
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # consider narrowing to (EOFError, OSError).
                command = ""

            # Process command — view switches first, then actions that are
            # only meaningful from the sessions view.
            if command == 'q':
                self.running = False
            elif command == 'd':
                self.current_view = "dashboard"
            elif command == 's':
                self.current_view = "sessions"
            elif command == 'u':
                self.current_view = "urls"
            elif command == 'r':
                print(f"\n{Colors.WARNING}Resetting statistics...{Colors.ENDC}")
                await self.reset_stats()
                time.sleep(1)
            elif command == 'n' and self.current_view == "sessions":
                await self.create_new_session()
            elif command == 'v' and self.current_view == "sessions":
                session_id = input(f"{Colors.OKCYAN}Enter session ID:{Colors.ENDC} ").strip()
                if session_id:
                    await self.interactive_session_view(session_id)
            elif command == 'x' and self.current_view == "sessions":
                session_id = input(f"{Colors.OKCYAN}Enter session ID to delete:{Colors.ENDC} ").strip()
                if session_id:
                    result = await self.delete_profiling_session(session_id)
                    if "error" in result:
                        print(f"{Colors.FAIL}Error: {result['error']}{Colors.ENDC}")
                    else:
                        print(f"{Colors.OKGREEN}✓ Session deleted{Colors.ENDC}")
                    time.sleep(2)

        self.clear_screen()
        print(f"\n{Colors.OKGREEN}Dashboard closed. Goodbye!{Colors.ENDC}\n")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Parse CLI arguments, run the dashboard, and always release resources."""
    arg_parser = argparse.ArgumentParser(description="Crawl4AI Monitoring Dashboard")
    arg_parser.add_argument(
        "--url",
        default="http://localhost:11234",
        help="Base URL of the Crawl4AI Docker server (default: http://localhost:11234)",
    )
    options = arg_parser.parse_args()

    board = MonitoringDashboard(base_url=options.url)
    try:
        await board.run_dashboard()
    finally:
        # Close the dashboard's HTTP client even if the UI loop raised.
        await board.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: start the async dashboard event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
88
tests/docker/extended_features/test_monitoring_quick.py
Normal file
88
tests/docker/extended_features/test_monitoring_quick.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick test to verify monitoring endpoints are working
|
||||||
|
"""
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:11234"
|
||||||
|
|
||||||
|
def test_health():
    """Probe /monitoring/health and report the outcome; return True on success."""
    try:
        reply = requests.get(f"{BASE_URL}/monitoring/health", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Health check: FAILED (status {reply.status_code})")
            return False
        print("✅ Health check: PASSED")
        print(f"   Response: {reply.json()}")
        return True
    except Exception as exc:
        # Network errors, timeouts, or bad JSON all count as a failed check.
        print(f"❌ Health check: ERROR - {exc}")
        return False
|
||||||
|
|
||||||
|
def test_stats():
    """Fetch /monitoring/stats and print key counters; return True on success."""
    try:
        reply = requests.get(f"{BASE_URL}/monitoring/stats", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Stats endpoint: FAILED (status {reply.status_code})")
            return False
        payload = reply.json()
        print("✅ Stats endpoint: PASSED")
        print(f"   Active crawls: {payload.get('active_crawls', 'N/A')}")
        print(f"   Total crawls: {payload.get('total_crawls', 'N/A')}")
        return True
    except Exception as exc:
        # Any connection/parse problem is reported as an error, not raised.
        print(f"❌ Stats endpoint: ERROR - {exc}")
        return False
|
||||||
|
|
||||||
|
def test_url_stats():
    """Fetch /monitoring/stats/urls and report how many URLs are tracked."""
    try:
        reply = requests.get(f"{BASE_URL}/monitoring/stats/urls", timeout=5)
        if reply.status_code != 200:
            print(f"❌ URL stats endpoint: FAILED (status {reply.status_code})")
            return False
        print("✅ URL stats endpoint: PASSED")
        tracked = reply.json()
        print(f"   URLs tracked: {len(tracked)}")
        return True
    except Exception as exc:
        # Treat connectivity/parse issues as a failed check rather than a crash.
        print(f"❌ URL stats endpoint: ERROR - {exc}")
        return False
|
||||||
|
|
||||||
|
def main():
    """Run all quick checks; return a process exit code (0 = all passed)."""
    banner = "=" * 60
    print(banner)
    print("Monitoring Endpoints Quick Test")
    print(banner)
    print(f"\nTesting server at: {BASE_URL}")
    print("\nMake sure the server is running:")
    print("  cd deploy/docker && python server.py")
    print("\n" + "-" * 60 + "\n")

    outcomes = []
    # Run the three checks in order, separated by a blank line.
    for check in (test_health, test_stats, test_url_stats):
        if outcomes:
            print()
        outcomes.append(check())

    print("\n" + banner)
    passed = sum(outcomes)
    total = len(outcomes)

    if passed != total:
        print(f"❌ Some tests failed ({passed}/{total} passed)")
        print("\nPlease check the server logs for errors.")
        return 1
    print(f"✅ All tests passed! ({passed}/{total})")
    print("\nMonitoring endpoints are working correctly! 🎉")
    return 0
|
||||||
|
|
||||||
|
# Propagate the quick-test result to the shell as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
522
tests/docker/test_monitoring_endpoints.py
Normal file
522
tests/docker/test_monitoring_endpoints.py
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
"""
|
||||||
|
Integration tests for monitoring and profiling endpoints.
|
||||||
|
|
||||||
|
Tests all monitoring endpoints including profiling sessions, statistics,
|
||||||
|
health checks, and real-time streaming.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from httpx import AsyncClient
|
||||||
|
|
||||||
|
# Base URL for the Docker API server
|
||||||
|
BASE_URL = "http://localhost:11235"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def event_loop():
    """Create event loop for async tests.

    Module-scoped so all tests in this file share one loop.

    NOTE(review): overriding the ``event_loop`` fixture is deprecated in
    recent pytest-asyncio releases (use loop-scope configuration instead) —
    confirm against the pinned pytest-asyncio version.
    """
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    # Close the loop once the module finishes to avoid resource warnings.
    loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
async def client():
    """Yield a module-scoped async HTTP client pointed at the Docker API server."""
    # The context manager guarantees the connection pool is closed on teardown.
    async with AsyncClient(base_url=BASE_URL, timeout=60.0) as http_client:
        yield http_client
|
||||||
|
|
||||||
|
|
||||||
|
class TestHealthEndpoint:
    """Tests for the /monitoring/health endpoint."""

    @pytest.mark.asyncio
    async def test_health_check(self, client: AsyncClient):
        """The health endpoint responds 200 with a healthy status and uptime."""
        response = await client.get("/monitoring/health")
        assert response.status_code == 200

        body = response.json()
        assert body["status"] == "healthy"
        assert "uptime_seconds" in body
        # Uptime is measured from server start, so it can never be negative.
        assert body["uptime_seconds"] >= 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestStatsEndpoints:
    """Tests for /monitoring/stats/* endpoints.

    Each test resets the counters first so assertions are not affected by
    crawls performed by earlier tests in the session.
    """

    @pytest.mark.asyncio
    async def test_get_stats_empty(self, client: AsyncClient):
        """Test getting stats when no crawls have been performed."""
        # Reset stats first
        await client.post("/monitoring/stats/reset")

        response = await client.get("/monitoring/stats")
        assert response.status_code == 200
        data = response.json()

        # Verify all expected fields
        assert "active_crawls" in data
        assert "total_crawls" in data
        assert "successful_crawls" in data
        assert "failed_crawls" in data
        assert "success_rate" in data
        assert "avg_duration_ms" in data
        assert "total_bytes_processed" in data
        assert "system_stats" in data

        # Verify system stats
        system = data["system_stats"]
        assert "cpu_percent" in system
        assert "memory_percent" in system
        assert "memory_used_mb" in system
        assert "memory_available_mb" in system
        assert "disk_usage_percent" in system
        assert "active_processes" in system

    @pytest.mark.asyncio
    async def test_stats_after_crawl(self, client: AsyncClient):
        """Test stats are updated after performing a crawl."""
        # Reset stats
        await client.post("/monitoring/stats/reset")

        # Perform a simple crawl
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {
                "word_count_threshold": 10
            }
        }
        crawl_response = await client.post("/crawl", json=crawl_request)
        assert crawl_response.status_code == 200

        # Get stats
        response = await client.get("/monitoring/stats")
        assert response.status_code == 200
        data = response.json()

        # Verify stats are updated; success/failure split must account for
        # every tracked crawl.
        assert data["total_crawls"] >= 1
        assert data["successful_crawls"] >= 0
        assert data["failed_crawls"] >= 0
        assert data["total_crawls"] == data["successful_crawls"] + data["failed_crawls"]

        # Verify success rate calculation (percentage; tolerant of float rounding)
        if data["total_crawls"] > 0:
            expected_rate = (data["successful_crawls"] / data["total_crawls"]) * 100
            assert abs(data["success_rate"] - expected_rate) < 0.01

    @pytest.mark.asyncio
    async def test_stats_reset(self, client: AsyncClient):
        """Test resetting stats clears all counters."""
        # Ensure we have some stats
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10}
        }
        await client.post("/crawl", json=crawl_request)

        # Reset stats — the reply echoes the pre-reset counters.
        reset_response = await client.post("/monitoring/stats/reset")
        assert reset_response.status_code == 200
        data = reset_response.json()
        assert data["status"] == "reset"
        assert "previous_stats" in data

        # Verify stats are cleared
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()
        assert stats["total_crawls"] == 0
        assert stats["successful_crawls"] == 0
        assert stats["failed_crawls"] == 0
        assert stats["active_crawls"] == 0

    @pytest.mark.asyncio
    async def test_url_specific_stats(self, client: AsyncClient):
        """Test getting URL-specific statistics."""
        # Reset and crawl
        await client.post("/monitoring/stats/reset")
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10}
        }
        await client.post("/crawl", json=crawl_request)

        # Get URL stats
        response = await client.get("/monitoring/stats/urls")
        assert response.status_code == 200
        data = response.json()

        # The list may be empty if per-URL tracking lags the crawl, so the
        # field checks are conditional on at least one entry existing.
        assert isinstance(data, list)
        if len(data) > 0:
            url_stat = data[0]
            assert "url" in url_stat
            assert "total_requests" in url_stat
            assert "successful_requests" in url_stat
            assert "failed_requests" in url_stat
            assert "avg_duration_ms" in url_stat
            assert "total_bytes_processed" in url_stat
            assert "last_request_time" in url_stat
|
||||||
|
|
||||||
|
|
||||||
|
class TestStatsStreaming:
    """Tests for /monitoring/stats/stream SSE endpoint."""

    @pytest.mark.asyncio
    async def test_stats_stream_basic(self, client: AsyncClient):
        """Test SSE streaming of statistics."""
        # Start streaming (collect a few events then stop)
        events = []
        async with client.stream("GET", "/monitoring/stats/stream") as response:
            assert response.status_code == 200
            assert "text/event-stream" in response.headers.get("content-type", "")

            # Collect first 3 events
            count = 0
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data_str = line[6:]  # Remove "data: " prefix
                    data = json.loads(data_str)
                    events.append(data)
                    count += 1
                    if count >= 3:
                        break

        # Verify we got events
        assert len(events) >= 3

        # Verify event structure — each SSE payload is a stats snapshot.
        for event in events:
            assert "active_crawls" in event
            assert "total_crawls" in event
            assert "successful_crawls" in event
            assert "system_stats" in event

    @pytest.mark.asyncio
    async def test_stats_stream_during_crawl(self, client: AsyncClient):
        """Test streaming updates during active crawl."""
        # Start streaming in background
        stream_task = None
        events = []

        async def collect_stream():
            # Reads SSE lines until 5 events have been captured.
            async with client.stream("GET", "/monitoring/stats/stream") as response:
                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        data_str = line[6:]
                        data = json.loads(data_str)
                        events.append(data)
                        if len(events) >= 5:
                            break

        # Start stream collection
        stream_task = asyncio.create_task(collect_stream())

        # Wait a bit then start crawl
        await asyncio.sleep(1)
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10}
        }
        # NOTE(review): this crawl task is fire-and-forget — never awaited or
        # cancelled — so it may outlive the test and emit a "task was
        # destroyed" warning; consider holding a reference and awaiting it.
        asyncio.create_task(client.post("/crawl", json=crawl_request))

        # Wait for events; cancel the collector if it never reaches 5 events.
        try:
            await asyncio.wait_for(stream_task, timeout=15.0)
        except asyncio.TimeoutError:
            stream_task.cancel()

        # Should have collected some events
        assert len(events) > 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestProfilingEndpoints:
    """Tests for /monitoring/profile/* endpoints.

    Covers the full session lifecycle: listing, starting, fetching,
    completion, deletion, and bulk cleanup of profiling sessions.
    """

    @pytest.mark.asyncio
    async def test_list_profiling_sessions_empty(self, client: AsyncClient):
        """Test listing profiling sessions when none exist."""
        response = await client.get("/monitoring/profile")
        assert response.status_code == 200
        data = response.json()
        assert "sessions" in data
        assert isinstance(data["sessions"], list)

    @pytest.mark.asyncio
    async def test_start_profiling_session(self, client: AsyncClient):
        """Test starting a new profiling session."""
        request_data = {
            "urls": ["https://www.example.com", "https://www.python.org"],
            "duration_seconds": 2,
            "crawler_config": {
                "word_count_threshold": 10
            }
        }

        response = await client.post("/monitoring/profile/start", json=request_data)
        assert response.status_code == 200
        data = response.json()

        assert "session_id" in data
        assert "status" in data
        assert data["status"] == "running"
        assert "started_at" in data
        assert "urls" in data
        assert len(data["urls"]) == 2
        # Fix: removed `return data["session_id"]` — a non-None return from a
        # pytest test triggers PytestReturnNotNoneWarning and is never used.

    @pytest.mark.asyncio
    async def test_get_profiling_session(self, client: AsyncClient):
        """Test retrieving a profiling session by ID."""
        # Start a session
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Get session immediately (should be running)
        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["session_id"] == session_id
        # A 2s session may already have finished, so accept either state.
        assert data["status"] in ["running", "completed"]
        assert "started_at" in data
        assert "urls" in data

    @pytest.mark.asyncio
    async def test_profiling_session_completion(self, client: AsyncClient):
        """Test profiling session completes and produces results."""
        # Start a short session
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 3,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Wait past the configured duration so the session can complete.
        await asyncio.sleep(5)

        # Get completed session
        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["status"] == "completed"
        assert "completed_at" in data
        assert "duration_seconds" in data
        assert "results" in data

        # Verify results structure
        results = data["results"]
        assert "total_requests" in results
        assert "successful_requests" in results
        assert "failed_requests" in results
        assert "avg_response_time_ms" in results
        assert "system_metrics" in results

    @pytest.mark.asyncio
    async def test_profiling_session_not_found(self, client: AsyncClient):
        """Test retrieving non-existent session returns 404."""
        response = await client.get("/monitoring/profile/nonexistent-id-12345")
        assert response.status_code == 404
        data = response.json()
        assert "detail" in data

    @pytest.mark.asyncio
    async def test_delete_profiling_session(self, client: AsyncClient):
        """Test deleting a profiling session."""
        # Start a session
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 1,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Wait for completion
        await asyncio.sleep(2)

        # Delete session
        delete_response = await client.delete(f"/monitoring/profile/{session_id}")
        assert delete_response.status_code == 200
        data = delete_response.json()
        assert data["status"] == "deleted"
        assert data["session_id"] == session_id

        # Verify it's gone
        get_response = await client.get(f"/monitoring/profile/{session_id}")
        assert get_response.status_code == 404

    @pytest.mark.asyncio
    async def test_cleanup_old_sessions(self, client: AsyncClient):
        """Test cleaning up old profiling sessions."""
        # Start a few sessions (loop variable intentionally unused).
        for _ in range(3):
            request_data = {
                "urls": ["https://www.example.com"],
                "duration_seconds": 1,
                "crawler_config": {"word_count_threshold": 10}
            }
            await client.post("/monitoring/profile/start", json=request_data)

        # Wait for completion
        await asyncio.sleep(2)

        # Cleanup sessions older than 0 seconds (all completed ones)
        cleanup_response = await client.post(
            "/monitoring/profile/cleanup",
            json={"max_age_seconds": 0}
        )
        assert cleanup_response.status_code == 200
        data = cleanup_response.json()
        assert "deleted_count" in data
        assert data["deleted_count"] >= 0

    @pytest.mark.asyncio
    async def test_list_sessions_after_operations(self, client: AsyncClient):
        """Test listing sessions shows correct state after various operations."""
        # Start a session
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 5,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # List sessions
        list_response = await client.get("/monitoring/profile")
        assert list_response.status_code == 200
        data = list_response.json()

        # Should have at least one session
        sessions = data["sessions"]
        assert len(sessions) >= 1

        # Find our session in the listing and confirm a sane state.
        our_session = next((s for s in sessions if s["session_id"] == session_id), None)
        assert our_session is not None
        assert our_session["status"] in ["running", "completed"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestProfilingWithCrawlConfig:
    """Tests for profiling with various crawler configurations."""

    @pytest.mark.asyncio
    async def test_profiling_with_extraction_strategy(self, client: AsyncClient):
        """Starting a session with an extraction strategy configured succeeds."""
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "crawler_config": {
                "word_count_threshold": 10,
                "extraction_strategy": "NoExtractionStrategy",
            },
        }

        reply = await client.post("/monitoring/profile/start", json=payload)
        assert reply.status_code == 200
        body = reply.json()
        assert body["status"] == "running"

    @pytest.mark.asyncio
    async def test_profiling_with_browser_config(self, client: AsyncClient):
        """Starting a session with a custom browser configuration succeeds."""
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "browser_config": {
                "headless": True,
                "verbose": False,
            },
            "crawler_config": {
                "word_count_threshold": 10,
            },
        }

        reply = await client.post("/monitoring/profile/start", json=payload)
        assert reply.status_code == 200
        body = reply.json()
        assert body["status"] == "running"
|
||||||
|
|
||||||
|
|
||||||
|
class TestIntegrationScenarios:
    """Integration tests for real-world monitoring scenarios."""

    @pytest.mark.asyncio
    async def test_concurrent_crawls_and_monitoring(self, client: AsyncClient):
        """Test monitoring multiple concurrent crawls."""
        # Reset stats
        await client.post("/monitoring/stats/reset")

        # Start multiple crawls concurrently
        crawl_tasks = []
        urls = [
            "https://www.example.com",
            "https://www.python.org",
            "https://www.github.com"
        ]

        for url in urls:
            crawl_request = {
                "urls": [url],
                "crawler_config": {"word_count_threshold": 10}
            }
            task = client.post("/crawl", json=crawl_request)
            crawl_tasks.append(task)

        # Execute concurrently.
        # NOTE(review): `responses` is never inspected — individual crawl
        # failures are tolerated (return_exceptions=True) and only the
        # aggregate counter below is asserted.
        responses = await asyncio.gather(*crawl_tasks, return_exceptions=True)

        # Get stats
        await asyncio.sleep(1)  # Give tracking time to update
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # Should have tracked multiple crawls
        assert stats["total_crawls"] >= len(urls)

    @pytest.mark.asyncio
    async def test_profiling_and_stats_correlation(self, client: AsyncClient):
        """Test that profiling data correlates with statistics."""
        # Reset stats
        await client.post("/monitoring/stats/reset")

        # Start profiling session
        profile_request = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 3,
            "crawler_config": {"word_count_threshold": 10}
        }
        profile_response = await client.post("/monitoring/profile/start", json=profile_request)
        session_id = profile_response.json()["session_id"]

        # Wait past the 3s session duration for completion.
        await asyncio.sleep(5)

        # Get profiling results
        profile_data_response = await client.get(f"/monitoring/profile/{session_id}")
        profile_data = profile_data_response.json()

        # Get stats
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # Stats should reflect profiling activity: the global crawl counter
        # must include every request the profiling session issued.
        assert stats["total_crawls"] >= profile_data["results"]["total_requests"]
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly (verbose, with output capture disabled).
if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])
|
||||||
Reference in New Issue
Block a user