Profiling/monitoring: Add interactive monitoring dashboard and integration tests for monitoring endpoints
- Implemented an interactive monitoring dashboard in `demo_monitoring_dashboard.py` for real-time statistics, profiling session management, and system resource monitoring. - Created a quick test script `test_monitoring_quick.py` to verify the functionality of monitoring endpoints. - Developed comprehensive integration tests in `test_monitoring_endpoints.py` covering health checks, statistics, profiling sessions, and real-time streaming. - Added error handling and user-friendly output for better usability in the dashboard.
This commit is contained in:
522
tests/docker/test_monitoring_endpoints.py
Normal file
522
tests/docker/test_monitoring_endpoints.py
Normal file
@@ -0,0 +1,522 @@
|
||||
"""
|
||||
Integration tests for monitoring and profiling endpoints.
|
||||
|
||||
Tests all monitoring endpoints including profiling sessions, statistics,
|
||||
health checks, and real-time streaming.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, List
|
||||
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
|
||||
# Base URL for the Docker API server
|
||||
BASE_URL = "http://localhost:11235"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def event_loop():
    """Provide a module-scoped event loop shared by all async tests.

    Yields:
        A fresh event loop created from the current loop policy.

    The loop is closed in a ``finally`` clause so it is released even if
    teardown of a dependent fixture raises (the original only closed it on
    the happy path).
    """
    loop = asyncio.get_event_loop_policy().new_event_loop()
    try:
        yield loop
    finally:
        loop.close()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
async def client():
    """Yield a shared AsyncClient pointed at the Docker API server.

    The generous 60s timeout accommodates slow crawl endpoints; the
    context manager guarantees the underlying connection pool is closed
    when the module's tests finish.
    """
    async with AsyncClient(base_url=BASE_URL, timeout=60.0) as http_client:
        yield http_client
|
||||
|
||||
|
||||
class TestHealthEndpoint:
    """Tests for /monitoring/health endpoint."""

    @pytest.mark.asyncio
    async def test_health_check(self, client: AsyncClient):
        """The health endpoint responds 200 with a healthy status and uptime."""
        response = await client.get("/monitoring/health")
        assert response.status_code == 200

        payload = response.json()
        # Service must self-report healthy and expose a non-negative uptime.
        assert payload["status"] == "healthy"
        assert "uptime_seconds" in payload
        assert payload["uptime_seconds"] >= 0
|
||||
|
||||
|
||||
class TestStatsEndpoints:
    """Tests for /monitoring/stats/* endpoints."""

    @pytest.mark.asyncio
    async def test_get_stats_empty(self, client: AsyncClient):
        """After a reset, the stats payload still carries its full schema."""
        # Reset stats first so we inspect a known-clean snapshot.
        await client.post("/monitoring/stats/reset")

        response = await client.get("/monitoring/stats")
        assert response.status_code == 200
        data = response.json()

        # Top-level schema expected from the stats endpoint.
        top_level_fields = (
            "active_crawls",
            "total_crawls",
            "successful_crawls",
            "failed_crawls",
            "success_rate",
            "avg_duration_ms",
            "total_bytes_processed",
            "system_stats",
        )
        for field in top_level_fields:
            assert field in data

        # Nested system-resource schema.
        system = data["system_stats"]
        system_fields = (
            "cpu_percent",
            "memory_percent",
            "memory_used_mb",
            "memory_available_mb",
            "disk_usage_percent",
            "active_processes",
        )
        for field in system_fields:
            assert field in system

    @pytest.mark.asyncio
    async def test_stats_after_crawl(self, client: AsyncClient):
        """Counters move after a crawl and success_rate matches the counts."""
        # Start from a clean slate.
        await client.post("/monitoring/stats/reset")

        # Perform one simple crawl so there is something to count.
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10},
        }
        crawl_response = await client.post("/crawl", json=crawl_request)
        assert crawl_response.status_code == 200

        response = await client.get("/monitoring/stats")
        assert response.status_code == 200
        data = response.json()

        # Totals must be consistent: total == successes + failures.
        assert data["total_crawls"] >= 1
        assert data["successful_crawls"] >= 0
        assert data["failed_crawls"] >= 0
        assert data["total_crawls"] == data["successful_crawls"] + data["failed_crawls"]

        # success_rate is a percentage derived from the counters above.
        if data["total_crawls"] > 0:
            expected_rate = data["successful_crawls"] / data["total_crawls"] * 100
            assert abs(data["success_rate"] - expected_rate) < 0.01

    @pytest.mark.asyncio
    async def test_stats_reset(self, client: AsyncClient):
        """Resetting stats acknowledges the reset and zeroes every counter."""
        # Ensure there is at least one recorded crawl to clear.
        await client.post(
            "/crawl",
            json={
                "urls": ["https://www.example.com"],
                "crawler_config": {"word_count_threshold": 10},
            },
        )

        reset_response = await client.post("/monitoring/stats/reset")
        assert reset_response.status_code == 200
        reset_payload = reset_response.json()
        assert reset_payload["status"] == "reset"
        # The endpoint echoes what was cleared for auditability.
        assert "previous_stats" in reset_payload

        # Every counter must now read zero.
        stats = (await client.get("/monitoring/stats")).json()
        for counter in ("total_crawls", "successful_crawls", "failed_crawls", "active_crawls"):
            assert stats[counter] == 0

    @pytest.mark.asyncio
    async def test_url_specific_stats(self, client: AsyncClient):
        """Per-URL stats come back as a list of fully-populated records."""
        # Reset, then crawl once so at least one URL record can exist.
        await client.post("/monitoring/stats/reset")
        await client.post(
            "/crawl",
            json={
                "urls": ["https://www.example.com"],
                "crawler_config": {"word_count_threshold": 10},
            },
        )

        response = await client.get("/monitoring/stats/urls")
        assert response.status_code == 200
        data = response.json()

        assert isinstance(data, list)
        if data:
            record = data[0]
            expected_keys = (
                "url",
                "total_requests",
                "successful_requests",
                "failed_requests",
                "avg_duration_ms",
                "total_bytes_processed",
                "last_request_time",
            )
            for key in expected_keys:
                assert key in record
|
||||
|
||||
|
||||
class TestStatsStreaming:
    """Tests for /monitoring/stats/stream SSE endpoint."""

    @pytest.mark.asyncio
    async def test_stats_stream_basic(self, client: AsyncClient):
        """The SSE stream sends well-formed stat events continuously.

        Collects the first three ``data:`` events and checks each carries
        the core counters plus system stats.
        """
        events = []
        async with client.stream("GET", "/monitoring/stats/stream") as response:
            assert response.status_code == 200
            # SSE responses must advertise the event-stream media type.
            assert "text/event-stream" in response.headers.get("content-type", "")

            # Collect the first 3 events, then stop reading the stream.
            count = 0
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data_str = line[6:]  # strip the "data: " SSE prefix
                    events.append(json.loads(data_str))
                    count += 1
                    if count >= 3:
                        break

        assert len(events) >= 3

        # Every event should look like a stats snapshot.
        for event in events:
            assert "active_crawls" in event
            assert "total_crawls" in event
            assert "successful_crawls" in event
            assert "system_stats" in event

    @pytest.mark.asyncio
    async def test_stats_stream_during_crawl(self, client: AsyncClient):
        """The stream keeps emitting events while a crawl is in flight."""
        events = []

        async def collect_stream():
            # Background consumer: gather up to 5 stat events from the SSE feed.
            async with client.stream("GET", "/monitoring/stats/stream") as response:
                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        events.append(json.loads(line[6:]))
                        if len(events) >= 5:
                            break

        stream_task = asyncio.create_task(collect_stream())

        # Let the stream establish, then kick off a crawl concurrently.
        await asyncio.sleep(1)
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10},
        }
        # Keep a reference to the crawl task: a bare create_task() result may
        # be garbage-collected before completion and would swallow errors.
        crawl_task = asyncio.create_task(client.post("/crawl", json=crawl_request))

        try:
            await asyncio.wait_for(stream_task, timeout=15.0)
        except asyncio.TimeoutError:
            stream_task.cancel()
        finally:
            # Let the background crawl finish; return_exceptions keeps any
            # crawl failure from masking the streaming assertion below.
            await asyncio.gather(crawl_task, return_exceptions=True)

        # Streaming must have produced at least one event during the crawl.
        assert len(events) > 0
|
||||
|
||||
|
||||
class TestProfilingEndpoints:
    """Tests for /monitoring/profile/* endpoints."""

    @pytest.mark.asyncio
    async def test_list_profiling_sessions_empty(self, client: AsyncClient):
        """Listing sessions always returns a (possibly empty) list."""
        response = await client.get("/monitoring/profile")
        assert response.status_code == 200
        data = response.json()
        assert "sessions" in data
        assert isinstance(data["sessions"], list)

    @pytest.mark.asyncio
    async def test_start_profiling_session(self, client: AsyncClient):
        """Starting a session returns an id and a 'running' status."""
        request_data = {
            "urls": ["https://www.example.com", "https://www.python.org"],
            "duration_seconds": 2,
            "crawler_config": {
                "word_count_threshold": 10
            }
        }

        response = await client.post("/monitoring/profile/start", json=request_data)
        assert response.status_code == 200
        data = response.json()

        assert "session_id" in data
        assert "status" in data
        assert data["status"] == "running"
        assert "started_at" in data
        assert "urls" in data
        # Both submitted URLs must be echoed back.
        assert len(data["urls"]) == 2
        # NOTE: the original returned session_id here; test functions must
        # return None (pytest's PytestReturnNotNoneWarning) and no caller
        # consumed the value, so the return was removed.

    @pytest.mark.asyncio
    async def test_get_profiling_session(self, client: AsyncClient):
        """A freshly started session can be fetched by its id."""
        # Start a session to look up.
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Fetch immediately; the session may already have finished on a
        # fast box, so either state is acceptable.
        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["session_id"] == session_id
        assert data["status"] in ["running", "completed"]
        assert "started_at" in data
        assert "urls" in data

    @pytest.mark.asyncio
    async def test_profiling_session_completion(self, client: AsyncClient):
        """A short session completes and exposes a structured results block."""
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 3,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Sleep past the configured duration so the session can finish.
        await asyncio.sleep(5)

        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["status"] == "completed"
        assert "completed_at" in data
        assert "duration_seconds" in data
        assert "results" in data

        # Completed sessions must report aggregate request metrics.
        results = data["results"]
        assert "total_requests" in results
        assert "successful_requests" in results
        assert "failed_requests" in results
        assert "avg_response_time_ms" in results
        assert "system_metrics" in results

    @pytest.mark.asyncio
    async def test_profiling_session_not_found(self, client: AsyncClient):
        """Fetching an unknown session id yields a 404 with a detail message."""
        response = await client.get("/monitoring/profile/nonexistent-id-12345")
        assert response.status_code == 404
        data = response.json()
        assert "detail" in data

    @pytest.mark.asyncio
    async def test_delete_profiling_session(self, client: AsyncClient):
        """Deleting a completed session removes it from the server."""
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 1,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        # Let the 1-second session finish before deleting.
        await asyncio.sleep(2)

        delete_response = await client.delete(f"/monitoring/profile/{session_id}")
        assert delete_response.status_code == 200
        data = delete_response.json()
        assert data["status"] == "deleted"
        assert data["session_id"] == session_id

        # The deleted session must no longer be retrievable.
        get_response = await client.get(f"/monitoring/profile/{session_id}")
        assert get_response.status_code == 404

    @pytest.mark.asyncio
    async def test_cleanup_old_sessions(self, client: AsyncClient):
        """Cleanup removes aged-out sessions and reports how many."""
        # Start a few quick sessions to give cleanup something to consider.
        for i in range(3):
            request_data = {
                "urls": ["https://www.example.com"],
                "duration_seconds": 1,
                "crawler_config": {"word_count_threshold": 10}
            }
            await client.post("/monitoring/profile/start", json=request_data)

        # Let them complete.
        await asyncio.sleep(2)

        # max_age_seconds=0 targets every already-completed session.
        cleanup_response = await client.post(
            "/monitoring/profile/cleanup",
            json={"max_age_seconds": 0}
        )
        assert cleanup_response.status_code == 200
        data = cleanup_response.json()
        assert "deleted_count" in data
        assert data["deleted_count"] >= 0

    @pytest.mark.asyncio
    async def test_list_sessions_after_operations(self, client: AsyncClient):
        """The session list includes a just-started session with valid state."""
        request_data = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 5,
            "crawler_config": {"word_count_threshold": 10}
        }
        start_response = await client.post("/monitoring/profile/start", json=request_data)
        session_id = start_response.json()["session_id"]

        list_response = await client.get("/monitoring/profile")
        assert list_response.status_code == 200
        data = list_response.json()

        sessions = data["sessions"]
        assert len(sessions) >= 1

        # Our session must be present and in a sane state.
        our_session = next((s for s in sessions if s["session_id"] == session_id), None)
        assert our_session is not None
        assert our_session["status"] in ["running", "completed"]
|
||||
|
||||
|
||||
class TestProfilingWithCrawlConfig:
    """Tests for profiling with various crawler configurations."""

    @pytest.mark.asyncio
    async def test_profiling_with_extraction_strategy(self, client: AsyncClient):
        """A session configured with an extraction strategy starts normally."""
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "crawler_config": {
                "word_count_threshold": 10,
                "extraction_strategy": "NoExtractionStrategy",
            },
        }

        response = await client.post("/monitoring/profile/start", json=payload)
        assert response.status_code == 200
        assert response.json()["status"] == "running"

    @pytest.mark.asyncio
    async def test_profiling_with_browser_config(self, client: AsyncClient):
        """A session with a custom browser configuration starts normally."""
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "browser_config": {
                "headless": True,
                "verbose": False,
            },
            "crawler_config": {
                "word_count_threshold": 10,
            },
        }

        response = await client.post("/monitoring/profile/start", json=payload)
        assert response.status_code == 200
        assert response.json()["status"] == "running"
|
||||
|
||||
|
||||
class TestIntegrationScenarios:
    """Integration tests for real-world monitoring scenarios."""

    @pytest.mark.asyncio
    async def test_concurrent_crawls_and_monitoring(self, client: AsyncClient):
        """Stats track every crawl when several run concurrently."""
        # Start from clean counters.
        await client.post("/monitoring/stats/reset")

        # Fire one crawl per URL, all at once.
        crawl_tasks = []
        urls = [
            "https://www.example.com",
            "https://www.python.org",
            "https://www.github.com"
        ]

        for url in urls:
            crawl_request = {
                "urls": [url],
                "crawler_config": {"word_count_threshold": 10}
            }
            task = client.post("/crawl", json=crawl_request)
            crawl_tasks.append(task)

        # Execute concurrently; exceptions are collected instead of raised
        # because individual crawl failures still count in the stats.
        responses = await asyncio.gather(*crawl_tasks, return_exceptions=True)
        # Every submitted crawl produced either a response or an exception
        # (the original computed `responses` but never used it).
        assert len(responses) == len(urls)

        # Give the tracker a moment to absorb the activity.
        await asyncio.sleep(1)
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # All concurrent crawls must have been counted.
        assert stats["total_crawls"] >= len(urls)

    @pytest.mark.asyncio
    async def test_profiling_and_stats_correlation(self, client: AsyncClient):
        """Requests made by a profiling session show up in the global stats."""
        # Start from clean counters.
        await client.post("/monitoring/stats/reset")

        # Run a short profiling session.
        profile_request = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 3,
            "crawler_config": {"word_count_threshold": 10}
        }
        profile_response = await client.post("/monitoring/profile/start", json=profile_request)
        session_id = profile_response.json()["session_id"]

        # Sleep past the session duration so results are available.
        await asyncio.sleep(5)

        profile_data_response = await client.get(f"/monitoring/profile/{session_id}")
        profile_data = profile_data_response.json()

        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # Global crawl count must cover at least the session's requests.
        assert stats["total_crawls"] >= profile_data["results"]["total_requests"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code to the shell; the original discarded it,
    # so a failing run still exited 0 (misleading for CI scripts).
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))
|
||||
Reference in New Issue
Block a user