# crawl4ai/tests/docker/test_monitoring_endpoints.py

"""
Integration tests for monitoring and profiling endpoints.

Tests all monitoring endpoints including profiling sessions, statistics,
health checks, and real-time streaming.
"""

import asyncio
import json
import time
from typing import Dict, List

import pytest
from httpx import AsyncClient

# Base URL of the Docker API server under test; the `client` fixture below
# builds its AsyncClient with this value as `base_url`.
BASE_URL = "http://localhost:11235"


@pytest.fixture(scope="module")
def event_loop():
    """Provide a fresh asyncio event loop with module scope, closed on teardown."""
    policy = asyncio.get_event_loop_policy()
    module_loop = policy.new_event_loop()
    yield module_loop
    # Teardown: everything after the yield runs when the fixture is finalized.
    module_loop.close()


@pytest.fixture(scope="module")
async def client():
    """Yield a module-scoped ``AsyncClient`` for ``BASE_URL`` (60 s timeout).

    The client is closed when the surrounding ``async with`` block exits at
    fixture teardown.
    """
    async with AsyncClient(base_url=BASE_URL, timeout=60.0) as http_client:
        yield http_client


class TestHealthEndpoint:
    """Tests for /monitoring/health endpoint."""

    @pytest.mark.asyncio
    async def test_health_check(self, client: AsyncClient):
        """The health endpoint answers 200 with a healthy status and an uptime."""
        health_response = await client.get("/monitoring/health")
        assert health_response.status_code == 200

        payload = health_response.json()
        assert payload["status"] == "healthy"

        # Uptime must be present and non-negative.
        assert "uptime_seconds" in payload
        uptime = payload["uptime_seconds"]
        assert uptime >= 0


class TestStatsEndpoints:
    """Tests for /monitoring/stats/* endpoints."""

    @pytest.mark.asyncio
    async def test_get_stats_empty(self, client: AsyncClient):
        """Stats fetched right after a reset expose every expected field."""
        # Clear the counters before reading them.
        await client.post("/monitoring/stats/reset")

        stats_response = await client.get("/monitoring/stats")
        assert stats_response.status_code == 200
        payload = stats_response.json()

        # Top-level fields of the stats document.
        for field in (
            "active_crawls",
            "total_crawls",
            "successful_crawls",
            "failed_crawls",
            "success_rate",
            "avg_duration_ms",
            "total_bytes_processed",
            "system_stats",
        ):
            assert field in payload

        # Fields nested under "system_stats".
        system_stats = payload["system_stats"]
        for field in (
            "cpu_percent",
            "memory_percent",
            "memory_used_mb",
            "memory_available_mb",
            "disk_usage_percent",
            "active_processes",
        ):
            assert field in system_stats

    @pytest.mark.asyncio
    async def test_stats_after_crawl(self, client: AsyncClient):
        """Counters and success rate are consistent after one crawl request."""
        await client.post("/monitoring/stats/reset")

        # One simple crawl so there is something to count.
        crawl_payload = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10},
        }
        crawl_response = await client.post("/crawl", json=crawl_payload)
        assert crawl_response.status_code == 200

        stats_response = await client.get("/monitoring/stats")
        assert stats_response.status_code == 200
        stats = stats_response.json()

        # Counters: at least one crawl, and total = successes + failures.
        total = stats["total_crawls"]
        assert total >= 1
        succeeded = stats["successful_crawls"]
        assert succeeded >= 0
        failed = stats["failed_crawls"]
        assert failed >= 0
        assert total == succeeded + failed

        # Reported success rate must match the one derived from the counters.
        if total > 0:
            derived_rate = (succeeded / total) * 100
            assert abs(stats["success_rate"] - derived_rate) < 0.01

    @pytest.mark.asyncio
    async def test_stats_reset(self, client: AsyncClient):
        """A reset reports the previous stats and zeroes every counter."""
        # Issue a crawl first so the reset has something to clear.
        await client.post(
            "/crawl",
            json={
                "urls": ["https://www.example.com"],
                "crawler_config": {"word_count_threshold": 10},
            },
        )

        reset_response = await client.post("/monitoring/stats/reset")
        assert reset_response.status_code == 200
        reset_payload = reset_response.json()
        assert reset_payload["status"] == "reset"
        assert "previous_stats" in reset_payload

        # Every counter must read zero afterwards.
        stats = (await client.get("/monitoring/stats")).json()
        for counter in (
            "total_crawls",
            "successful_crawls",
            "failed_crawls",
            "active_crawls",
        ):
            assert stats[counter] == 0

    @pytest.mark.asyncio
    async def test_url_specific_stats(self, client: AsyncClient):
        """The per-URL stats endpoint returns a list of well-formed entries."""
        await client.post("/monitoring/stats/reset")
        await client.post(
            "/crawl",
            json={
                "urls": ["https://www.example.com"],
                "crawler_config": {"word_count_threshold": 10},
            },
        )

        urls_response = await client.get("/monitoring/stats/urls")
        assert urls_response.status_code == 200
        url_stats = urls_response.json()

        assert isinstance(url_stats, list)
        # Only the first entry is inspected, and only when the list is non-empty.
        if url_stats:
            first_entry = url_stats[0]
            for field in (
                "url",
                "total_requests",
                "successful_requests",
                "failed_requests",
                "avg_duration_ms",
                "total_bytes_processed",
                "last_request_time",
            ):
                assert field in first_entry


class TestStatsStreaming:
    """Tests for /monitoring/stats/stream SSE endpoint."""

    @pytest.mark.asyncio
    async def test_stats_stream_basic(self, client: AsyncClient):
        """Test SSE streaming of statistics.

        Opens the stream, reads until three ``data:`` events have been parsed,
        then checks that each event carries the expected top-level fields.
        """
        events = []
        async with client.stream("GET", "/monitoring/stats/stream") as response:
            assert response.status_code == 200
            assert "text/event-stream" in response.headers.get("content-type", "")

            # Collect the first 3 events; lines without the SSE "data: "
            # prefix are skipped.
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue
                events.append(json.loads(line[len("data: "):]))
                if len(events) >= 3:
                    break

        # Verify we got events
        assert len(events) >= 3

        # Verify event structure
        for event in events:
            assert "active_crawls" in event
            assert "total_crawls" in event
            assert "successful_crawls" in event
            assert "system_stats" in event

    @pytest.mark.asyncio
    async def test_stats_stream_during_crawl(self, client: AsyncClient):
        """Test streaming updates during active crawl.

        A background task collects up to five stream events while a crawl
        request is issued concurrently. The test passes as long as at least
        one event arrived within the 15-second window.
        """
        events = []

        async def collect_stream():
            """Append parsed ``data:`` events to ``events`` until five are held."""
            async with client.stream("GET", "/monitoring/stats/stream") as response:
                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        events.append(json.loads(line[len("data: "):]))
                        if len(events) >= 5:
                            break

        # Start stream collection
        stream_task = asyncio.create_task(collect_stream())

        # Wait a bit then start crawl
        await asyncio.sleep(1)
        crawl_request = {
            "urls": ["https://www.example.com"],
            "crawler_config": {"word_count_threshold": 10}
        }
        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage-collected before
        # it finishes.
        crawl_task = asyncio.create_task(client.post("/crawl", json=crawl_request))

        try:
            await asyncio.wait_for(stream_task, timeout=15.0)
        except asyncio.TimeoutError:
            # wait_for() has already cancelled stream_task on timeout; fewer
            # than five events is acceptable, the assertion below decides.
            pass
        finally:
            # Let the crawl request finish inside this test so it does not
            # leak into later tests on the shared client. Its outcome is
            # deliberately not asserted: this test is about the stream.
            await asyncio.gather(crawl_task, return_exceptions=True)

        # Should have collected some events
        assert events


class TestProfilingEndpoints:
    """Tests for /monitoring/profile/* endpoints."""

    @staticmethod
    def _session_request(duration_seconds: int) -> dict:
        """Build the single-URL profiling request body used by most tests here."""
        return {
            "urls": ["https://www.example.com"],
            "duration_seconds": duration_seconds,
            "crawler_config": {"word_count_threshold": 10},
        }

    @classmethod
    async def _start_session(cls, client: AsyncClient, duration_seconds: int) -> str:
        """Start a single-URL profiling session and return its ``session_id``.

        Asserts that the start request returned 200 so that a rejected request
        fails with the response body in the message instead of a ``KeyError``
        on the missing ``session_id``.
        """
        response = await client.post(
            "/monitoring/profile/start",
            json=cls._session_request(duration_seconds),
        )
        assert response.status_code == 200, response.text
        return response.json()["session_id"]

    @pytest.mark.asyncio
    async def test_list_profiling_sessions_empty(self, client: AsyncClient):
        """Test that listing profiling sessions returns a ``sessions`` list."""
        response = await client.get("/monitoring/profile")
        assert response.status_code == 200
        data = response.json()
        assert "sessions" in data
        assert isinstance(data["sessions"], list)

    @pytest.mark.asyncio
    async def test_start_profiling_session(self, client: AsyncClient):
        """Test starting a new profiling session."""
        request_data = {
            "urls": ["https://www.example.com", "https://www.python.org"],
            "duration_seconds": 2,
            "crawler_config": {
                "word_count_threshold": 10
            }
        }

        response = await client.post("/monitoring/profile/start", json=request_data)
        assert response.status_code == 200
        data = response.json()

        assert "session_id" in data
        assert "status" in data
        assert data["status"] == "running"
        assert "started_at" in data
        assert "urls" in data
        assert len(data["urls"]) == 2
        # No return value: pytest warns when a test function returns
        # anything other than None.

    @pytest.mark.asyncio
    async def test_get_profiling_session(self, client: AsyncClient):
        """Test retrieving a profiling session by ID."""
        session_id = await self._start_session(client, duration_seconds=2)

        # Fetch immediately; the session may still be running or already done.
        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["session_id"] == session_id
        assert data["status"] in ["running", "completed"]
        assert "started_at" in data
        assert "urls" in data

    @pytest.mark.asyncio
    async def test_profiling_session_completion(self, client: AsyncClient):
        """Test profiling session completes and produces results."""
        # Start a short (3 s) session and wait 5 s for it to complete.
        session_id = await self._start_session(client, duration_seconds=3)
        await asyncio.sleep(5)

        # Get completed session
        response = await client.get(f"/monitoring/profile/{session_id}")
        assert response.status_code == 200
        data = response.json()

        assert data["status"] == "completed"
        assert "completed_at" in data
        assert "duration_seconds" in data
        assert "results" in data

        # Verify results structure
        results = data["results"]
        for field in (
            "total_requests",
            "successful_requests",
            "failed_requests",
            "avg_response_time_ms",
            "system_metrics",
        ):
            assert field in results

    @pytest.mark.asyncio
    async def test_profiling_session_not_found(self, client: AsyncClient):
        """Test retrieving non-existent session returns 404."""
        response = await client.get("/monitoring/profile/nonexistent-id-12345")
        assert response.status_code == 404
        data = response.json()
        assert "detail" in data

    @pytest.mark.asyncio
    async def test_delete_profiling_session(self, client: AsyncClient):
        """Test deleting a profiling session."""
        # Start a 1 s session and wait 2 s for it to complete.
        session_id = await self._start_session(client, duration_seconds=1)
        await asyncio.sleep(2)

        # Delete session
        delete_response = await client.delete(f"/monitoring/profile/{session_id}")
        assert delete_response.status_code == 200
        data = delete_response.json()
        assert data["status"] == "deleted"
        assert data["session_id"] == session_id

        # Verify it's gone
        get_response = await client.get(f"/monitoring/profile/{session_id}")
        assert get_response.status_code == 404

    @pytest.mark.asyncio
    async def test_cleanup_old_sessions(self, client: AsyncClient):
        """Test cleaning up old profiling sessions."""
        # Start a few sessions. Their responses are intentionally not
        # asserted: this test only checks the cleanup endpoint's reply.
        for _ in range(3):
            await client.post(
                "/monitoring/profile/start",
                json=self._session_request(1),
            )

        # Wait for completion
        await asyncio.sleep(2)

        # Cleanup sessions older than 0 seconds (all completed ones)
        cleanup_response = await client.post(
            "/monitoring/profile/cleanup",
            json={"max_age_seconds": 0}
        )
        assert cleanup_response.status_code == 200
        data = cleanup_response.json()
        assert "deleted_count" in data
        assert data["deleted_count"] >= 0

    @pytest.mark.asyncio
    async def test_list_sessions_after_operations(self, client: AsyncClient):
        """Test listing sessions shows correct state after various operations."""
        session_id = await self._start_session(client, duration_seconds=5)

        # List sessions
        list_response = await client.get("/monitoring/profile")
        assert list_response.status_code == 200
        data = list_response.json()

        # Should have at least one session
        sessions = data["sessions"]
        assert len(sessions) >= 1

        # Find our session
        our_session = next((s for s in sessions if s["session_id"] == session_id), None)
        assert our_session is not None
        assert our_session["status"] in ["running", "completed"]


class TestProfilingWithCrawlConfig:
    """Tests for profiling with various crawler configurations."""

    @pytest.mark.asyncio
    async def test_profiling_with_extraction_strategy(self, client: AsyncClient):
        """A session whose crawler config names an extraction strategy starts."""
        crawler_config = {
            "word_count_threshold": 10,
            "extraction_strategy": "NoExtractionStrategy",
        }
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "crawler_config": crawler_config,
        }

        start_response = await client.post("/monitoring/profile/start", json=payload)
        assert start_response.status_code == 200
        assert start_response.json()["status"] == "running"

    @pytest.mark.asyncio
    async def test_profiling_with_browser_config(self, client: AsyncClient):
        """A session carrying an explicit browser config starts."""
        browser_config = {"headless": True, "verbose": False}
        payload = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 2,
            "browser_config": browser_config,
            "crawler_config": {"word_count_threshold": 10},
        }

        start_response = await client.post("/monitoring/profile/start", json=payload)
        assert start_response.status_code == 200
        assert start_response.json()["status"] == "running"


class TestIntegrationScenarios:
    """Integration tests for real-world monitoring scenarios."""

    @pytest.mark.asyncio
    async def test_concurrent_crawls_and_monitoring(self, client: AsyncClient):
        """Test monitoring multiple concurrent crawls.

        Issues one crawl request per URL concurrently, then checks that the
        stats endpoint counted at least that many crawls.
        """
        # Reset stats
        await client.post("/monitoring/stats/reset")

        urls = [
            "https://www.example.com",
            "https://www.python.org",
            "https://www.github.com"
        ]

        # One un-awaited request coroutine per URL; gather() runs them
        # concurrently.
        crawl_requests = [
            client.post(
                "/crawl",
                json={"urls": [url], "crawler_config": {"word_count_threshold": 10}},
            )
            for url in urls
        ]
        responses = await asyncio.gather(*crawl_requests, return_exceptions=True)

        # Individual crawl failures are tolerated (return_exceptions=True), but
        # their outcomes are kept so a failing count assertion explains itself.
        outcomes = [
            repr(result) if isinstance(result, BaseException) else result.status_code
            for result in responses
        ]

        # Get stats
        await asyncio.sleep(1)  # Give tracking time to update
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # Should have tracked multiple crawls
        assert stats["total_crawls"] >= len(urls), (
            f"expected at least {len(urls)} tracked crawls, "
            f"got {stats['total_crawls']}; crawl outcomes: {outcomes}"
        )

    @pytest.mark.asyncio
    async def test_profiling_and_stats_correlation(self, client: AsyncClient):
        """Test that profiling data correlates with statistics.

        After a 3-second profiling session, the global crawl counter must be
        at least the number of requests the session reports.
        """
        # Reset stats
        await client.post("/monitoring/stats/reset")

        # Start profiling session
        profile_request = {
            "urls": ["https://www.example.com"],
            "duration_seconds": 3,
            "crawler_config": {"word_count_threshold": 10}
        }
        profile_response = await client.post("/monitoring/profile/start", json=profile_request)
        session_id = profile_response.json()["session_id"]

        # Wait for completion
        await asyncio.sleep(5)

        # Get profiling results
        profile_data_response = await client.get(f"/monitoring/profile/{session_id}")
        profile_data = profile_data_response.json()

        # Get stats
        stats_response = await client.get("/monitoring/stats")
        stats = stats_response.json()

        # Stats should reflect profiling activity
        assert stats["total_crawls"] >= profile_data["results"]["total_requests"]


if __name__ == "__main__":
    # Direct execution: hand this file to pytest with the -v and -s flags.
    pytest_args = [__file__, "-v", "-s"]
    pytest.main(pytest_args)