feat(monitor): add real-time crawler monitoring system with memory management

Implements a comprehensive monitoring and visualization system for tracking web crawler operations in real-time. The system includes: - Terminal-based dashboard with rich UI for displaying task statuses - Memory pressure monitoring and adaptive dispatch control - Queue statistics and performance metrics tracking - Detailed task progress visualization - Stress testing framework for memory management This addition helps operators track crawler performance and manage memory usage more effectively.
2025-03-12 19:05:24 +08:00
parent 9547bada3a
commit 1630fbdafe
8 changed files with 1956 additions and 321 deletions
--- a/tests/memory/test_crawler_monitor.py
+++ b/tests/memory/test_crawler_monitor.py
@@ -0,0 +1,168 @@
+"""
+Test script for the CrawlerMonitor component.
+This script simulates a crawler with multiple tasks to demonstrate the real-time monitoring capabilities.
+"""
+
+import time
+import uuid
+import random
+import threading
+import sys
+import os
+
+# Add the parent directory to the path to import crawl4ai
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from crawl4ai.components.crawler_monitor import CrawlerMonitor
+from crawl4ai.models import CrawlStatus
+
+def simulate_crawler_task(monitor, task_id, url, simulate_failure=False):
+    """Simulate a crawler task with different states."""
+    # Task starts in the QUEUED state
+    wait_time = random.uniform(0.5, 3.0)
+    time.sleep(wait_time)
+    
+    # Update to IN_PROGRESS state
+    monitor.update_task(
+        task_id=task_id,
+        status=CrawlStatus.IN_PROGRESS,
+        start_time=time.time(),
+        wait_time=wait_time
+    )
+    
+    # Simulate task running
+    process_time = random.uniform(1.0, 5.0)
+    for i in range(int(process_time * 2)):
+        # Simulate memory usage changes
+        memory_usage = random.uniform(5.0, 25.0)
+        monitor.update_task(
+            task_id=task_id,
+            memory_usage=memory_usage,
+            peak_memory=max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
+        )
+        time.sleep(0.5)
+    
+    # Update to COMPLETED or FAILED state
+    if simulate_failure and random.random() < 0.8:  # 80% chance of failure if simulate_failure is True
+        monitor.update_task(
+            task_id=task_id,
+            status=CrawlStatus.FAILED,
+            end_time=time.time(),
+            error_message="Simulated failure: Connection timeout",
+            memory_usage=0.0
+        )
+    else:
+        monitor.update_task(
+            task_id=task_id,
+            status=CrawlStatus.COMPLETED,
+            end_time=time.time(),
+            memory_usage=0.0
+        )
+
+def update_queue_stats(monitor, num_queued_tasks):
+    """Update queue statistics periodically."""
+    while monitor.is_running:
+        queued_tasks = [
+            task for task_id, task in monitor.get_all_task_stats().items()
+            if task["status"] == CrawlStatus.QUEUED.name
+        ]
+        
+        total_queued = len(queued_tasks)
+        
+        if total_queued > 0:
+            current_time = time.time()
+            wait_times = [
+                current_time - task.get("enqueue_time", current_time)
+                for task in queued_tasks
+            ]
+            highest_wait_time = max(wait_times) if wait_times else 0.0
+            avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
+        else:
+            highest_wait_time = 0.0
+            avg_wait_time = 0.0
+        
+        monitor.update_queue_statistics(
+            total_queued=total_queued,
+            highest_wait_time=highest_wait_time,
+            avg_wait_time=avg_wait_time
+        )
+        
+        # Simulate memory pressure based on number of active tasks
+        active_tasks = len([
+            task for task_id, task in monitor.get_all_task_stats().items()
+            if task["status"] == CrawlStatus.IN_PROGRESS.name
+        ])
+        
+        if active_tasks > 8:
+            monitor.update_memory_status("CRITICAL")
+        elif active_tasks > 4:
+            monitor.update_memory_status("PRESSURE")
+        else:
+            monitor.update_memory_status("NORMAL")
+            
+        time.sleep(1.0)
+
+def test_crawler_monitor():
+    """Test the CrawlerMonitor with simulated crawler tasks."""
+    # Total number of URLs to crawl
+    total_urls = 50
+    
+    # Initialize the monitor
+    monitor = CrawlerMonitor(urls_total=total_urls, refresh_rate=0.5)
+    
+    # Start the monitor
+    monitor.start()
+    
+    # Start thread to update queue statistics
+    queue_stats_thread = threading.Thread(target=update_queue_stats, args=(monitor, total_urls))
+    queue_stats_thread.daemon = True
+    queue_stats_thread.start()
+    
+    try:
+        # Create task threads
+        threads = []
+        for i in range(total_urls):
+            task_id = str(uuid.uuid4())
+            url = f"https://example.com/page{i}"
+            
+            # Add task to monitor
+            monitor.add_task(task_id, url)
+            
+            # Determine if this task should simulate failure
+            simulate_failure = (i % 10 == 0)  # Every 10th task
+            
+            # Create and start thread for this task
+            thread = threading.Thread(
+                target=simulate_crawler_task,
+                args=(monitor, task_id, url, simulate_failure)
+            )
+            thread.daemon = True
+            threads.append(thread)
+        
+        # Start threads with delay to simulate tasks being added over time
+        batch_size = 5
+        for i in range(0, len(threads), batch_size):
+            batch = threads[i:i+batch_size]
+            for thread in batch:
+                thread.start()
+                time.sleep(0.5)  # Small delay between starting threads
+            
+            # Wait a bit before starting the next batch
+            time.sleep(2.0)
+        
+        # Wait for all threads to complete
+        for thread in threads:
+            thread.join()
+            
+        # Keep monitor running a bit longer to see the final state
+        time.sleep(5.0)
+        
+    except KeyboardInterrupt:
+        print("\nTest interrupted by user")
+    finally:
+        # Stop the monitor
+        monitor.stop()
+        print("\nCrawler monitor test completed")
+
+if __name__ == "__main__":
+    test_crawler_monitor()