feat(monitor): add real-time crawler monitoring system with memory management

Implements a comprehensive monitoring and visualization system for tracking web crawler operations in real-time. The system includes:
- Terminal-based dashboard with rich UI for displaying task statuses
- Memory pressure monitoring and adaptive dispatch control
- Queue statistics and performance metrics tracking
- Detailed task progress visualization
- Stress testing framework for memory management

This addition helps operators track crawler performance and manage memory usage more effectively.
This commit is contained in:
UncleCode
2025-03-12 19:05:24 +08:00
parent 9547bada3a
commit 1630fbdafe
8 changed files with 1956 additions and 321 deletions

View File

@@ -33,13 +33,12 @@ from .content_filter_strategy import (
LLMContentFilter,
RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
RateLimiter,
CrawlerMonitor,
DisplayMode,
BaseDispatcher,
)
from .docker_client import Crawl4aiDockerClient

View File

@@ -4,17 +4,15 @@ from .models import (
CrawlResult,
CrawlerTaskResult,
CrawlStatus,
DisplayMode,
CrawlStats,
DomainState,
)
from rich.live import Live
from rich.table import Table
from rich.console import Console
from rich import box
from datetime import timedelta, datetime
from .components.crawler_monitor import CrawlerMonitor
from .types import AsyncWebCrawler
from collections.abc import AsyncGenerator
import time
import psutil
import asyncio
@@ -24,8 +22,6 @@ from urllib.parse import urlparse
import random
from abc import ABC, abstractmethod
from math import inf as infinity
class RateLimiter:
def __init__(
@@ -87,201 +83,6 @@ class RateLimiter:
return True
class CrawlerMonitor:
def __init__(
self,
max_visible_rows: int = 15,
display_mode: DisplayMode = DisplayMode.DETAILED,
):
self.console = Console()
self.max_visible_rows = max_visible_rows
self.display_mode = display_mode
self.stats: Dict[str, CrawlStats] = {}
self.process = psutil.Process()
self.start_time = time.time()
self.live = Live(self._create_table(), refresh_per_second=2)
def start(self):
self.live.start()
def stop(self):
self.live.stop()
def add_task(self, task_id: str, url: str):
self.stats[task_id] = CrawlStats(
task_id=task_id, url=url, status=CrawlStatus.QUEUED
)
self.live.update(self._create_table())
def update_task(self, task_id: str, **kwargs):
if task_id in self.stats:
for key, value in kwargs.items():
setattr(self.stats[task_id], key, value)
self.live.update(self._create_table())
def _create_aggregated_table(self) -> Table:
"""Creates a compact table showing only aggregated statistics"""
table = Table(
box=box.ROUNDED,
title="Crawler Status Overview",
title_style="bold magenta",
header_style="bold blue",
show_lines=True,
)
# Calculate statistics
total_tasks = len(self.stats)
queued = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
)
in_progress = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
)
completed = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
)
failed = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
)
# Memory statistics
current_memory = self.process.memory_info().rss / (1024 * 1024)
total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
peak_memory = max(
(stat.peak_memory for stat in self.stats.values()), default=0.0
)
# Duration
duration = time.time() - self.start_time
# Create status row
table.add_column("Status", style="bold cyan")
table.add_column("Count", justify="right")
table.add_column("Percentage", justify="right")
table.add_row("Total Tasks", str(total_tasks), "100%")
table.add_row(
"[yellow]In Queue[/yellow]",
str(queued),
f"{(queued / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[blue]In Progress[/blue]",
str(in_progress),
f"{(in_progress / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[green]Completed[/green]",
str(completed),
f"{(completed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[red]Failed[/red]",
str(failed),
f"{(failed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
)
# Add memory information
table.add_section()
table.add_row(
"[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
)
table.add_row(
"[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
)
table.add_row(
"[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
)
table.add_row(
"[yellow]Runtime[/yellow]",
str(timedelta(seconds=int(duration))),
"",
)
return table
def _create_detailed_table(self) -> Table:
table = Table(
box=box.ROUNDED,
title="Crawler Performance Monitor",
title_style="bold magenta",
header_style="bold blue",
)
# Add columns
table.add_column("Task ID", style="cyan", no_wrap=True)
table.add_column("URL", style="cyan", no_wrap=True)
table.add_column("Status", style="bold")
table.add_column("Memory (MB)", justify="right")
table.add_column("Peak (MB)", justify="right")
table.add_column("Duration", justify="right")
table.add_column("Info", style="italic")
# Add summary row
total_memory = sum(stat.memory_usage for stat in self.stats.values())
active_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
)
completed_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
)
failed_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
)
table.add_row(
"[bold yellow]SUMMARY",
f"Total: {len(self.stats)}",
f"Active: {active_count}",
f"{total_memory:.1f}",
f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
str(
timedelta(
seconds=int(time.time() - self.start_time)
)
),
f"{completed_count}{failed_count}",
style="bold",
)
table.add_section()
# Add rows for each task
visible_stats = sorted(
self.stats.values(),
key=lambda x: (
x.status != CrawlStatus.IN_PROGRESS,
x.status != CrawlStatus.QUEUED,
x.end_time or infinity,
),
)[: self.max_visible_rows]
for stat in visible_stats:
status_style = {
CrawlStatus.QUEUED: "white",
CrawlStatus.IN_PROGRESS: "yellow",
CrawlStatus.COMPLETED: "green",
CrawlStatus.FAILED: "red",
}[stat.status]
table.add_row(
stat.task_id[:8], # Show first 8 chars of task ID
stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
f"[{status_style}]{stat.status.value}[/{status_style}]",
f"{stat.memory_usage:.1f}",
f"{stat.peak_memory:.1f}",
stat.duration,
stat.error_message[:40] if stat.error_message else "",
)
return table
def _create_table(self) -> Table:
"""Creates the appropriate table based on display mode"""
if self.display_mode == DisplayMode.AGGREGATED:
return self._create_aggregated_table()
return self._create_detailed_table()
class BaseDispatcher(ABC):
def __init__(
@@ -309,7 +110,7 @@ class BaseDispatcher(ABC):
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler, # noqa: F821
config: CrawlerRunConfig,
monitor: Optional[CrawlerMonitor] = None,
) -> List[CrawlerTaskResult]:
@@ -320,71 +121,144 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
def __init__(
self,
memory_threshold_percent: float = 90.0,
critical_threshold_percent: float = 95.0, # New critical threshold
recovery_threshold_percent: float = 85.0, # New recovery threshold
check_interval: float = 1.0,
max_session_permit: int = 20,
memory_wait_timeout: float = 300.0, # 5 minutes default timeout
fairness_timeout: float = 600.0, # 10 minutes before prioritizing long-waiting URLs
rate_limiter: Optional[RateLimiter] = None,
monitor: Optional[CrawlerMonitor] = None,
):
super().__init__(rate_limiter, monitor)
self.memory_threshold_percent = memory_threshold_percent
self.critical_threshold_percent = critical_threshold_percent
self.recovery_threshold_percent = recovery_threshold_percent
self.check_interval = check_interval
self.max_session_permit = max_session_permit
self.memory_wait_timeout = memory_wait_timeout
self.result_queue = asyncio.Queue() # Queue for storing results
self.fairness_timeout = fairness_timeout
self.result_queue = asyncio.Queue()
self.task_queue = asyncio.PriorityQueue() # Priority queue for better management
self.memory_pressure_mode = False # Flag to indicate when we're in memory pressure mode
self.current_memory_percent = 0.0 # Track current memory usage
async def _memory_monitor_task(self):
"""Background task to continuously monitor memory usage and update state"""
while True:
self.current_memory_percent = psutil.virtual_memory().percent
# Enter memory pressure mode if we cross the threshold
if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent:
self.memory_pressure_mode = True
if self.monitor:
self.monitor.update_memory_status("PRESSURE")
# Exit memory pressure mode if we go below recovery threshold
elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent:
self.memory_pressure_mode = False
if self.monitor:
self.monitor.update_memory_status("NORMAL")
# In critical mode, we might need to take more drastic action
if self.current_memory_percent >= self.critical_threshold_percent:
if self.monitor:
self.monitor.update_memory_status("CRITICAL")
# We could implement additional memory-saving measures here
await asyncio.sleep(self.check_interval)
def _get_priority_score(self, wait_time: float, retry_count: int) -> float:
"""Calculate priority score (lower is higher priority)
- URLs waiting longer than fairness_timeout get higher priority
- More retry attempts decreases priority
"""
if wait_time > self.fairness_timeout:
# High priority for long-waiting URLs
return -wait_time
# Standard priority based on retries
return retry_count
async def crawl_url(
self,
url: str,
config: CrawlerRunConfig,
task_id: str,
retry_count: int = 0,
) -> CrawlerTaskResult:
start_time = time.time()
error_message = ""
memory_usage = peak_memory = 0.0
# Get starting memory for accurate measurement
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024)
try:
if self.monitor:
self.monitor.update_task(
task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=start_time,
retry_count=retry_count
)
self.concurrent_sessions += 1
if self.rate_limiter:
await self.rate_limiter.wait_if_needed(url)
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024)
# Check if we're in critical memory state
if self.current_memory_percent >= self.critical_threshold_percent:
# Requeue this task with increased priority and retry count
enqueue_time = time.time()
priority = self._get_priority_score(enqueue_time - start_time, retry_count + 1)
await self.task_queue.put((priority, (url, task_id, retry_count + 1, enqueue_time)))
# Update monitoring
if self.monitor:
self.monitor.update_task(
task_id,
status=CrawlStatus.QUEUED,
error_message="Requeued due to critical memory pressure"
)
# Return placeholder result with requeued status
return CrawlerTaskResult(
task_id=task_id,
url=url,
result=CrawlResult(
url=url, html="", metadata={"status": "requeued"},
success=False, error_message="Requeued due to critical memory pressure"
),
memory_usage=0,
peak_memory=0,
start_time=start_time,
end_time=time.time(),
error_message="Requeued due to critical memory pressure",
retry_count=retry_count + 1
)
# Execute the crawl
result = await self.crawler.arun(url, config=config, session_id=task_id)
# Measure memory usage
end_memory = process.memory_info().rss / (1024 * 1024)
memory_usage = peak_memory = end_memory - start_memory
# Handle rate limiting
if self.rate_limiter and result.status_code:
if not self.rate_limiter.update_delay(url, result.status_code):
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
result = CrawlerTaskResult(
task_id=task_id,
url=url,
result=result,
memory_usage=memory_usage,
peak_memory=peak_memory,
start_time=start_time,
end_time=time.time(),
error_message=error_message,
)
await self.result_queue.put(result)
return result
# Update status based on result
if not result.success:
error_message = result.error_message
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
elif self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
except Exception as e:
error_message = str(e)
if self.monitor:
@@ -392,7 +266,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
result = CrawlResult(
url=url, html="", metadata={}, success=False, error_message=str(e)
)
finally:
end_time = time.time()
if self.monitor:
@@ -402,9 +276,10 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
memory_usage=memory_usage,
peak_memory=peak_memory,
error_message=error_message,
retry_count=retry_count
)
self.concurrent_sessions -= 1
return CrawlerTaskResult(
task_id=task_id,
url=url,
@@ -414,116 +289,240 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
start_time=start_time,
end_time=end_time,
error_message=error_message,
retry_count=retry_count
)
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
self.crawler = crawler
# Start the memory monitor task
memory_monitor = asyncio.create_task(self._memory_monitor_task())
if self.monitor:
self.monitor.start()
results = []
try:
pending_tasks = []
active_tasks = []
task_queue = []
for url in urls:
task_id = str(uuid.uuid4())
if self.monitor:
self.monitor.add_task(task_id, url)
task_queue.append((url, task_id))
while task_queue or active_tasks:
wait_start_time = time.time()
while len(active_tasks) < self.max_session_permit and task_queue:
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
# Check if we've exceeded the timeout
if time.time() - wait_start_time > self.memory_wait_timeout:
raise MemoryError(
f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
)
await asyncio.sleep(self.check_interval)
continue
url, task_id = task_queue.pop(0)
task = asyncio.create_task(self.crawl_url(url, config, task_id))
active_tasks.append(task)
if not active_tasks:
await asyncio.sleep(self.check_interval)
continue
done, pending = await asyncio.wait(
active_tasks, return_when=asyncio.FIRST_COMPLETED
)
pending_tasks.extend(done)
active_tasks = list(pending)
return await asyncio.gather(*pending_tasks)
finally:
if self.monitor:
self.monitor.stop()
async def run_urls_stream(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlerTaskResult, None]:
self.crawler = crawler
if self.monitor:
self.monitor.start()
try:
active_tasks = []
task_queue = []
completed_count = 0
total_urls = len(urls)
# Initialize task queue
for url in urls:
task_id = str(uuid.uuid4())
if self.monitor:
self.monitor.add_task(task_id, url)
task_queue.append((url, task_id))
while completed_count < total_urls:
# Start new tasks if memory permits
while len(active_tasks) < self.max_session_permit and task_queue:
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
await asyncio.sleep(self.check_interval)
continue
url, task_id = task_queue.pop(0)
task = asyncio.create_task(self.crawl_url(url, config, task_id))
active_tasks.append(task)
if not active_tasks and not task_queue:
break
# Wait for any task to complete and yield results
# Add to queue with initial priority 0, retry count 0, and current time
await self.task_queue.put((0, (url, task_id, 0, time.time())))
active_tasks = []
# Process until both queues are empty
while not self.task_queue.empty() or active_tasks:
# If memory pressure is low, start new tasks
if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
try:
# Try to get a task with timeout to avoid blocking indefinitely
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
self.task_queue.get(), timeout=0.1
)
# Create and start the task
task = asyncio.create_task(
self.crawl_url(url, config, task_id, retry_count)
)
active_tasks.append(task)
# Update waiting time in monitor
if self.monitor:
wait_time = time.time() - enqueue_time
self.monitor.update_task(
task_id,
wait_time=wait_time,
status=CrawlStatus.IN_PROGRESS
)
except asyncio.TimeoutError:
# No tasks in queue, that's fine
pass
# Wait for completion even if queue is starved
if active_tasks:
done, pending = await asyncio.wait(
active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
)
# Process completed tasks
for completed_task in done:
result = await completed_task
completed_count += 1
yield result
results.append(result)
# Update active tasks list
active_tasks = list(pending)
else:
await asyncio.sleep(self.check_interval)
# If no active tasks but still waiting, sleep briefly
await asyncio.sleep(self.check_interval / 2)
# Update priorities for waiting tasks if needed
await self._update_queue_priorities()
return results
except Exception as e:
if self.monitor:
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
finally:
# Clean up
memory_monitor.cancel()
if self.monitor:
self.monitor.stop()
async def _update_queue_priorities(self):
"""Periodically update priorities of items in the queue to prevent starvation"""
# Skip if queue is empty
if self.task_queue.empty():
return
# Use a drain-and-refill approach to update all priorities
temp_items = []
# Drain the queue (with a safety timeout to prevent blocking)
try:
drain_start = time.time()
while not self.task_queue.empty() and time.time() - drain_start < 5.0: # 5 second safety timeout
try:
# Get item from queue with timeout
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
self.task_queue.get(), timeout=0.1
)
# Calculate new priority based on current wait time
current_time = time.time()
wait_time = current_time - enqueue_time
new_priority = self._get_priority_score(wait_time, retry_count)
# Store with updated priority
temp_items.append((new_priority, (url, task_id, retry_count, enqueue_time)))
# Update monitoring stats for this task
if self.monitor and task_id in self.monitor.stats:
self.monitor.update_task(task_id, wait_time=wait_time)
except asyncio.TimeoutError:
# Queue might be empty or very slow
break
except Exception as e:
# If anything goes wrong, make sure we refill the queue with what we've got
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
# Calculate queue statistics
if temp_items and self.monitor:
total_queued = len(temp_items)
wait_times = [item[1][3] for item in temp_items]
highest_wait_time = time.time() - min(wait_times) if wait_times else 0
avg_wait_time = sum(time.time() - t for t in wait_times) / len(wait_times) if wait_times else 0
# Update queue statistics in monitor
self.monitor.update_queue_statistics(
total_queued=total_queued,
highest_wait_time=highest_wait_time,
avg_wait_time=avg_wait_time
)
# Sort by priority (lowest number = highest priority)
temp_items.sort(key=lambda x: x[0])
# Refill the queue with updated priorities
for item in temp_items:
await self.task_queue.put(item)
async def run_urls_stream(
self,
urls: List[str],
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlerTaskResult, None]:
self.crawler = crawler
# Start the memory monitor task
memory_monitor = asyncio.create_task(self._memory_monitor_task())
if self.monitor:
self.monitor.start()
try:
# Initialize task queue
for url in urls:
task_id = str(uuid.uuid4())
if self.monitor:
self.monitor.add_task(task_id, url)
# Add to queue with initial priority 0, retry count 0, and current time
await self.task_queue.put((0, (url, task_id, 0, time.time())))
active_tasks = []
completed_count = 0
total_urls = len(urls)
while completed_count < total_urls:
# If memory pressure is low, start new tasks
if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
try:
# Try to get a task with timeout
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
self.task_queue.get(), timeout=0.1
)
# Create and start the task
task = asyncio.create_task(
self.crawl_url(url, config, task_id, retry_count)
)
active_tasks.append(task)
# Update waiting time in monitor
if self.monitor:
wait_time = time.time() - enqueue_time
self.monitor.update_task(
task_id,
wait_time=wait_time,
status=CrawlStatus.IN_PROGRESS
)
except asyncio.TimeoutError:
# No tasks in queue, that's fine
pass
# Process completed tasks and yield results
if active_tasks:
done, pending = await asyncio.wait(
active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
)
for completed_task in done:
result = await completed_task
# Only count as completed if it wasn't requeued
if "requeued" not in result.error_message:
completed_count += 1
yield result
# Update active tasks list
active_tasks = list(pending)
else:
# If no active tasks but still waiting, sleep briefly
await asyncio.sleep(self.check_interval / 2)
# Update priorities for waiting tasks if needed
await self._update_queue_priorities()
finally:
# Clean up
memory_monitor.cancel()
if self.monitor:
self.monitor.stop()
class SemaphoreDispatcher(BaseDispatcher):
def __init__(
@@ -620,7 +619,7 @@ class SemaphoreDispatcher(BaseDispatcher):
async def run_urls(
self,
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler, # noqa: F821
urls: List[str],
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
@@ -644,4 +643,4 @@ class SemaphoreDispatcher(BaseDispatcher):
return await asyncio.gather(*tasks, return_exceptions=True)
finally:
if self.monitor:
self.monitor.stop()
self.monitor.stop()

View File

@@ -0,0 +1,837 @@
import time
import uuid
import threading
import psutil
from datetime import datetime, timedelta
from typing import Dict, Optional, List
import threading
from rich.console import Console
from rich.layout import Layout
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from rich.live import Live
from rich import box
from ..models import CrawlStatus
class TerminalUI:
"""Terminal user interface for CrawlerMonitor using rich library."""
def __init__(self, refresh_rate: float = 1.0, max_width: int = 120):
"""
Initialize the terminal UI.
Args:
refresh_rate: How often to refresh the UI (in seconds)
max_width: Maximum width of the UI in characters
"""
self.console = Console(width=max_width)
self.layout = Layout()
self.refresh_rate = refresh_rate
self.stop_event = threading.Event()
self.ui_thread = None
self.monitor = None # Will be set by CrawlerMonitor
self.max_width = max_width
# Setup layout - vertical layout (top to bottom)
self.layout.split(
Layout(name="header", size=3),
Layout(name="pipeline_status", size=10),
Layout(name="task_details", ratio=1),
Layout(name="footer", size=3) # Increased footer size to fit all content
)
def start(self, monitor):
"""Start the UI thread."""
self.monitor = monitor
self.stop_event.clear()
self.ui_thread = threading.Thread(target=self._ui_loop)
self.ui_thread.daemon = True
self.ui_thread.start()
def stop(self):
"""Stop the UI thread."""
if self.ui_thread and self.ui_thread.is_alive():
self.stop_event.set()
# Only try to join if we're not in the UI thread
# This prevents "cannot join current thread" errors
if threading.current_thread() != self.ui_thread:
self.ui_thread.join(timeout=5.0)
def _ui_loop(self):
"""Main UI rendering loop."""
import sys
import select
import termios
import tty
# Setup terminal for non-blocking input
old_settings = termios.tcgetattr(sys.stdin)
try:
tty.setcbreak(sys.stdin.fileno())
# Use Live display to render the UI
with Live(self.layout, refresh_per_second=1/self.refresh_rate, screen=True) as live:
self.live = live # Store the live display for updates
# Main UI loop
while not self.stop_event.is_set():
self._update_display()
# Check for key press (non-blocking)
if select.select([sys.stdin], [], [], 0)[0]:
key = sys.stdin.read(1)
# Check for 'q' to quit
if key == 'q':
# Signal stop but don't call monitor.stop() from UI thread
# as it would cause the thread to try to join itself
self.stop_event.set()
self.monitor.is_running = False
break
time.sleep(self.refresh_rate)
# Just check if the monitor was stopped
if not self.monitor.is_running:
break
finally:
# Restore terminal settings
termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
def _update_display(self):
"""Update the terminal display with current statistics."""
if not self.monitor:
return
# Update crawler status panel
self.layout["header"].update(self._create_status_panel())
# Update pipeline status panel and task details panel
self.layout["pipeline_status"].update(self._create_pipeline_panel())
self.layout["task_details"].update(self._create_task_details_panel())
# Update footer
self.layout["footer"].update(self._create_footer())
def _create_status_panel(self) -> Panel:
"""Create the crawler status panel."""
summary = self.monitor.get_summary()
# Format memory status with icon
memory_status = self.monitor.get_memory_status()
memory_icon = "🟢" # Default NORMAL
if memory_status == "PRESSURE":
memory_icon = "🟠"
elif memory_status == "CRITICAL":
memory_icon = "🔴"
# Get current memory usage
current_memory = psutil.Process().memory_info().rss / (1024 * 1024) # MB
memory_percent = (current_memory / psutil.virtual_memory().total) * 100
# Format runtime
runtime = self.monitor._format_time(time.time() - self.monitor.start_time if self.monitor.start_time else 0)
# Create the status text
status_text = Text()
status_text.append(f"Web Crawler Dashboard | Runtime: {runtime} | Memory: {memory_percent:.1f}% {memory_icon}\n")
status_text.append(f"Status: {memory_status} | URLs: {summary['urls_completed']}/{summary['urls_total']} | ")
status_text.append(f"Peak Mem: {summary['peak_memory_percent']:.1f}% at {self.monitor._format_time(summary['peak_memory_time'])}")
return Panel(status_text, title="Crawler Status", border_style="blue")
def _create_pipeline_panel(self) -> Panel:
"""Create the pipeline status panel."""
summary = self.monitor.get_summary()
queue_stats = self.monitor.get_queue_stats()
# Create a table for status counts
table = Table(show_header=True, box=None)
table.add_column("Status", style="cyan")
table.add_column("Count", justify="right")
table.add_column("Percentage", justify="right")
table.add_column("Stat", style="cyan")
table.add_column("Value", justify="right")
# Calculate overall progress
progress = f"{summary['urls_completed']}/{summary['urls_total']}"
progress_percent = f"{summary['completion_percentage']:.1f}%"
# Add rows for each status
table.add_row(
"Overall Progress",
progress,
progress_percent,
"Est. Completion",
summary.get('estimated_completion_time', "N/A")
)
# Add rows for each status
status_counts = summary['status_counts']
total = summary['urls_total'] or 1 # Avoid division by zero
# Status rows
table.add_row(
"Completed",
str(status_counts.get(CrawlStatus.COMPLETED.name, 0)),
f"{status_counts.get(CrawlStatus.COMPLETED.name, 0) / total * 100:.1f}%",
"Avg. Time/URL",
f"{summary.get('avg_task_duration', 0):.2f}s"
)
table.add_row(
"Failed",
str(status_counts.get(CrawlStatus.FAILED.name, 0)),
f"{status_counts.get(CrawlStatus.FAILED.name, 0) / total * 100:.1f}%",
"Concurrent Tasks",
str(status_counts.get(CrawlStatus.IN_PROGRESS.name, 0))
)
table.add_row(
"In Progress",
str(status_counts.get(CrawlStatus.IN_PROGRESS.name, 0)),
f"{status_counts.get(CrawlStatus.IN_PROGRESS.name, 0) / total * 100:.1f}%",
"Queue Size",
str(queue_stats['total_queued'])
)
table.add_row(
"Queued",
str(status_counts.get(CrawlStatus.QUEUED.name, 0)),
f"{status_counts.get(CrawlStatus.QUEUED.name, 0) / total * 100:.1f}%",
"Max Wait Time",
f"{queue_stats['highest_wait_time']:.1f}s"
)
# Requeued is a special case as it's not a status
requeued_count = summary.get('requeued_count', 0)
table.add_row(
"Requeued",
str(requeued_count),
f"{summary.get('requeue_rate', 0):.1f}%",
"Avg Wait Time",
f"{queue_stats['avg_wait_time']:.1f}s"
)
# Add empty row for spacing
table.add_row(
"",
"",
"",
"Requeue Rate",
f"{summary.get('requeue_rate', 0):.1f}%"
)
return Panel(table, title="Pipeline Status", border_style="green")
def _create_task_details_panel(self) -> Panel:
"""Create the task details panel."""
# Create a table for task details
table = Table(show_header=True, expand=True)
table.add_column("Task ID", style="cyan", no_wrap=True, width=10)
table.add_column("URL", style="blue", ratio=3)
table.add_column("Status", style="green", width=15)
table.add_column("Memory", justify="right", width=8)
table.add_column("Peak", justify="right", width=8)
table.add_column("Duration", justify="right", width=10)
# Get all task stats
task_stats = self.monitor.get_all_task_stats()
# Add summary row
active_tasks = sum(1 for stats in task_stats.values()
if stats['status'] == CrawlStatus.IN_PROGRESS.name)
total_memory = sum(stats['memory_usage'] for stats in task_stats.values())
total_peak = sum(stats['peak_memory'] for stats in task_stats.values())
# Summary row with separators
table.add_row(
"SUMMARY",
f"Total: {len(task_stats)}",
f"Active: {active_tasks}",
f"{total_memory:.1f}",
f"{total_peak:.1f}",
"N/A"
)
# Add a separator
table.add_row("" * 10, "" * 20, "" * 10, "" * 8, "" * 8, "" * 10)
# Status icons
status_icons = {
CrawlStatus.QUEUED.name: "",
CrawlStatus.IN_PROGRESS.name: "🔄",
CrawlStatus.COMPLETED.name: "",
CrawlStatus.FAILED.name: ""
}
# Calculate how many rows we can display based on available space
# We can display more rows now that we have a dedicated panel
display_count = min(len(task_stats), 20) # Display up to 20 tasks
# Add rows for each task
for task_id, stats in sorted(
list(task_stats.items())[:display_count],
# Sort: 1. IN_PROGRESS first, 2. QUEUED, 3. COMPLETED/FAILED by recency
key=lambda x: (
0 if x[1]['status'] == CrawlStatus.IN_PROGRESS.name else
1 if x[1]['status'] == CrawlStatus.QUEUED.name else
2,
-1 * (x[1].get('end_time', 0) or 0) # Most recent first
)
):
# Truncate task_id and URL for display
short_id = task_id[:8]
url = stats['url']
if len(url) > 50: # Allow longer URLs in the dedicated panel
url = url[:47] + "..."
# Format status with icon
status = f"{status_icons.get(stats['status'], '?')} {stats['status']}"
# Add row
table.add_row(
short_id,
url,
status,
f"{stats['memory_usage']:.1f}",
f"{stats['peak_memory']:.1f}",
stats['duration'] if 'duration' in stats else "0:00"
)
return Panel(table, title="Task Details", border_style="yellow")
def _create_footer(self) -> Panel:
"""Create the footer panel."""
from rich.columns import Columns
from rich.align import Align
memory_status = self.monitor.get_memory_status()
memory_icon = "🟢" # Default NORMAL
if memory_status == "PRESSURE":
memory_icon = "🟠"
elif memory_status == "CRITICAL":
memory_icon = "🔴"
# Left section - memory status
left_text = Text()
left_text.append("Memory Status: ", style="bold")
status_style = "green" if memory_status == "NORMAL" else "yellow" if memory_status == "PRESSURE" else "red bold"
left_text.append(f"{memory_icon} {memory_status}", style=status_style)
# Center section - copyright
center_text = Text("© Crawl4AI 2025 | Made by UnclecCode", style="cyan italic")
# Right section - quit instruction
right_text = Text()
right_text.append("Press ", style="bold")
right_text.append("q", style="white on blue")
right_text.append(" to quit", style="bold")
# Create columns with the three sections
footer_content = Columns(
[
Align.left(left_text),
Align.center(center_text),
Align.right(right_text)
],
expand=True
)
# Create a more visible footer panel
return Panel(
footer_content,
border_style="white",
padding=(0, 1) # Add padding for better visibility
)
class CrawlerMonitor:
"""
Comprehensive monitoring and visualization system for tracking web crawler operations in real-time.
Provides a terminal-based dashboard that displays task statuses, memory usage, queue statistics,
and performance metrics.
"""
def __init__(
self,
urls_total: int = 0,
refresh_rate: float = 1.0,
enable_ui: bool = True,
max_width: int = 120
):
"""
Initialize the CrawlerMonitor.
Args:
urls_total: Total number of URLs to be crawled
refresh_rate: How often to refresh the UI (in seconds)
enable_ui: Whether to display the terminal UI
max_width: Maximum width of the UI in characters
"""
# Core monitoring attributes
self.stats = {} # Task ID -> stats dict
self.memory_status = "NORMAL"
self.start_time = None
self.end_time = None
self.is_running = False
self.queue_stats = {
"total_queued": 0,
"highest_wait_time": 0.0,
"avg_wait_time": 0.0
}
self.urls_total = urls_total
self.urls_completed = 0
self.peak_memory_percent = 0.0
self.peak_memory_time = 0.0
# Status counts
self.status_counts = {
CrawlStatus.QUEUED.name: 0,
CrawlStatus.IN_PROGRESS.name: 0,
CrawlStatus.COMPLETED.name: 0,
CrawlStatus.FAILED.name: 0
}
# Requeue tracking
self.requeued_count = 0
# Thread-safety
self._lock = threading.RLock()
# Terminal UI
self.enable_ui = enable_ui
self.terminal_ui = TerminalUI(
refresh_rate=refresh_rate,
max_width=max_width
) if enable_ui else None
def start(self):
"""
Start the monitoring session.
- Initializes the start_time
- Sets is_running to True
- Starts the terminal UI if enabled
"""
with self._lock:
self.start_time = time.time()
self.is_running = True
# Start the terminal UI
if self.enable_ui and self.terminal_ui:
self.terminal_ui.start(self)
def stop(self):
"""
Stop the monitoring session.
- Records end_time
- Sets is_running to False
- Stops the terminal UI
- Generates final summary statistics
"""
with self._lock:
self.end_time = time.time()
self.is_running = False
# Stop the terminal UI
if self.enable_ui and self.terminal_ui:
self.terminal_ui.stop()
def add_task(self, task_id: str, url: str):
"""
Register a new task with the monitor.
Args:
task_id: Unique identifier for the task
url: URL being crawled
The task is initialized with:
- status: QUEUED
- url: The URL to crawl
- enqueue_time: Current time
- memory_usage: 0
- peak_memory: 0
- wait_time: 0
- retry_count: 0
"""
with self._lock:
self.stats[task_id] = {
"task_id": task_id,
"url": url,
"status": CrawlStatus.QUEUED.name,
"enqueue_time": time.time(),
"start_time": None,
"end_time": None,
"memory_usage": 0.0,
"peak_memory": 0.0,
"error_message": "",
"wait_time": 0.0,
"retry_count": 0,
"duration": "0:00",
"counted_requeue": False
}
# Update status counts
self.status_counts[CrawlStatus.QUEUED.name] += 1
def update_task(
self,
task_id: str,
status: Optional[CrawlStatus] = None,
start_time: Optional[float] = None,
end_time: Optional[float] = None,
memory_usage: Optional[float] = None,
peak_memory: Optional[float] = None,
error_message: Optional[str] = None,
retry_count: Optional[int] = None,
wait_time: Optional[float] = None
):
"""
Update statistics for a specific task.
Args:
task_id: Unique identifier for the task
status: New status (QUEUED, IN_PROGRESS, COMPLETED, FAILED)
start_time: When task execution started
end_time: When task execution ended
memory_usage: Current memory usage in MB
peak_memory: Maximum memory usage in MB
error_message: Error description if failed
retry_count: Number of retry attempts
wait_time: Time spent in queue
Updates task statistics and updates status counts.
If status changes, decrements old status count and
increments new status count.
"""
with self._lock:
# Check if task exists
if task_id not in self.stats:
return
task_stats = self.stats[task_id]
# Update status counts if status is changing
old_status = task_stats["status"]
if status and status.name != old_status:
self.status_counts[old_status] -= 1
self.status_counts[status.name] += 1
# Track completion
if status == CrawlStatus.COMPLETED:
self.urls_completed += 1
# Track requeues
if old_status in [CrawlStatus.COMPLETED.name, CrawlStatus.FAILED.name] and not task_stats.get("counted_requeue", False):
self.requeued_count += 1
task_stats["counted_requeue"] = True
# Update task statistics
if status:
task_stats["status"] = status.name
if start_time is not None:
task_stats["start_time"] = start_time
if end_time is not None:
task_stats["end_time"] = end_time
if memory_usage is not None:
task_stats["memory_usage"] = memory_usage
# Update peak memory if necessary
current_percent = (memory_usage / psutil.virtual_memory().total) * 100
if current_percent > self.peak_memory_percent:
self.peak_memory_percent = current_percent
self.peak_memory_time = time.time()
if peak_memory is not None:
task_stats["peak_memory"] = peak_memory
if error_message is not None:
task_stats["error_message"] = error_message
if retry_count is not None:
task_stats["retry_count"] = retry_count
if wait_time is not None:
task_stats["wait_time"] = wait_time
# Calculate duration
if task_stats["start_time"]:
end = task_stats["end_time"] or time.time()
duration = end - task_stats["start_time"]
task_stats["duration"] = self._format_time(duration)
def update_memory_status(self, status: str):
"""
Update the current memory status.
Args:
status: Memory status (NORMAL, PRESSURE, CRITICAL, or custom)
Also updates the UI to reflect the new status.
"""
with self._lock:
self.memory_status = status
def update_queue_statistics(
self,
total_queued: int,
highest_wait_time: float,
avg_wait_time: float
):
"""
Update statistics related to the task queue.
Args:
total_queued: Number of tasks currently in queue
highest_wait_time: Longest wait time of any queued task
avg_wait_time: Average wait time across all queued tasks
"""
with self._lock:
self.queue_stats = {
"total_queued": total_queued,
"highest_wait_time": highest_wait_time,
"avg_wait_time": avg_wait_time
}
def get_task_stats(self, task_id: str) -> Dict:
"""
Get statistics for a specific task.
Args:
task_id: Unique identifier for the task
Returns:
Dictionary containing all task statistics
"""
with self._lock:
return self.stats.get(task_id, {}).copy()
def get_all_task_stats(self) -> Dict[str, Dict]:
"""
Get statistics for all tasks.
Returns:
Dictionary mapping task_ids to their statistics
"""
with self._lock:
return self.stats.copy()
def get_memory_status(self) -> str:
"""
Get the current memory status.
Returns:
Current memory status string
"""
with self._lock:
return self.memory_status
def get_queue_stats(self) -> Dict:
"""
Get current queue statistics.
Returns:
Dictionary with queue statistics including:
- total_queued: Number of tasks in queue
- highest_wait_time: Longest wait time
- avg_wait_time: Average wait time
"""
with self._lock:
return self.queue_stats.copy()
def get_summary(self) -> Dict:
"""
Get a summary of all crawler statistics.
Returns:
Dictionary containing:
- runtime: Total runtime in seconds
- urls_total: Total URLs to process
- urls_completed: Number of completed URLs
- completion_percentage: Percentage complete
- status_counts: Count of tasks in each status
- memory_status: Current memory status
- peak_memory_percent: Highest memory usage
- peak_memory_time: When peak memory occurred
- avg_task_duration: Average task processing time
- estimated_completion_time: Projected finish time
- requeue_rate: Percentage of tasks requeued
"""
with self._lock:
# Calculate runtime
current_time = time.time()
runtime = current_time - (self.start_time or current_time)
# Calculate completion percentage
completion_percentage = 0
if self.urls_total > 0:
completion_percentage = (self.urls_completed / self.urls_total) * 100
# Calculate average task duration for completed tasks
completed_tasks = [
task for task in self.stats.values()
if task["status"] == CrawlStatus.COMPLETED.name and task.get("start_time") and task.get("end_time")
]
avg_task_duration = 0
if completed_tasks:
total_duration = sum(task["end_time"] - task["start_time"] for task in completed_tasks)
avg_task_duration = total_duration / len(completed_tasks)
# Calculate requeue rate
requeue_rate = 0
if len(self.stats) > 0:
requeue_rate = (self.requeued_count / len(self.stats)) * 100
# Calculate estimated completion time
estimated_completion_time = "N/A"
if avg_task_duration > 0 and self.urls_total > 0 and self.urls_completed > 0:
remaining_tasks = self.urls_total - self.urls_completed
estimated_seconds = remaining_tasks * avg_task_duration
estimated_completion_time = self._format_time(estimated_seconds)
return {
"runtime": runtime,
"urls_total": self.urls_total,
"urls_completed": self.urls_completed,
"completion_percentage": completion_percentage,
"status_counts": self.status_counts.copy(),
"memory_status": self.memory_status,
"peak_memory_percent": self.peak_memory_percent,
"peak_memory_time": self.peak_memory_time,
"avg_task_duration": avg_task_duration,
"estimated_completion_time": estimated_completion_time,
"requeue_rate": requeue_rate,
"requeued_count": self.requeued_count
}
def render(self):
"""
Render the terminal UI.
This is the main UI rendering loop that:
1. Updates all statistics
2. Formats the display
3. Renders the ASCII interface
4. Handles keyboard input
Note: The actual rendering is handled by the TerminalUI class
which uses the rich library's Live display.
"""
if self.enable_ui and self.terminal_ui:
# Force an update of the UI
if hasattr(self.terminal_ui, '_update_display'):
self.terminal_ui._update_display()
def _format_time(self, seconds: float) -> str:
"""
Format time in hours:minutes:seconds.
Args:
seconds: Time in seconds
Returns:
Formatted time string (e.g., "1:23:45")
"""
delta = timedelta(seconds=int(seconds))
hours, remainder = divmod(delta.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
if hours > 0:
return f"{hours}:{minutes:02}:{seconds:02}"
else:
return f"{minutes}:{seconds:02}"
def _calculate_estimated_completion(self) -> str:
"""
Calculate estimated completion time based on current progress.
Returns:
Formatted time string
"""
summary = self.get_summary()
return summary.get("estimated_completion_time", "N/A")
# Example code for testing
if __name__ == "__main__":
# Initialize the monitor
monitor = CrawlerMonitor(urls_total=100)
# Start monitoring
monitor.start()
try:
# Simulate some tasks
for i in range(20):
task_id = str(uuid.uuid4())
url = f"https://example.com/page{i}"
monitor.add_task(task_id, url)
# Simulate 20% of tasks are already running
if i < 4:
monitor.update_task(
task_id=task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=time.time() - 30, # Started 30 seconds ago
memory_usage=10.5
)
# Simulate 10% of tasks are completed
if i >= 4 and i < 6:
start_time = time.time() - 60
end_time = time.time() - 15
monitor.update_task(
task_id=task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=start_time,
memory_usage=8.2
)
monitor.update_task(
task_id=task_id,
status=CrawlStatus.COMPLETED,
end_time=end_time,
memory_usage=0,
peak_memory=15.7
)
# Simulate 5% of tasks fail
if i >= 6 and i < 7:
start_time = time.time() - 45
end_time = time.time() - 20
monitor.update_task(
task_id=task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=start_time,
memory_usage=12.3
)
monitor.update_task(
task_id=task_id,
status=CrawlStatus.FAILED,
end_time=end_time,
memory_usage=0,
peak_memory=18.2,
error_message="Connection timeout"
)
# Simulate memory pressure
monitor.update_memory_status("PRESSURE")
# Simulate queue statistics
monitor.update_queue_statistics(
total_queued=16, # 20 - 4 (in progress)
highest_wait_time=120.5,
avg_wait_time=60.2
)
# Keep the monitor running for a demonstration
print("Crawler Monitor is running. Press 'q' to exit.")
while monitor.is_running:
time.sleep(0.1)
except KeyboardInterrupt:
print("\nExiting crawler monitor...")
finally:
# Stop the monitor
monitor.stop()
print("Crawler monitor exited successfully.")

View File

@@ -28,6 +28,12 @@ class CrawlerTaskResult:
start_time: Union[datetime, float]
end_time: Union[datetime, float]
error_message: str = ""
retry_count: int = 0
wait_time: float = 0.0
@property
def success(self) -> bool:
return self.result.success
class CrawlStatus(Enum):
@@ -67,6 +73,9 @@ class CrawlStats:
memory_usage: float = 0.0
peak_memory: float = 0.0
error_message: str = ""
wait_time: float = 0.0
retry_count: int = 0
counted_requeue: bool = False
@property
def duration(self) -> str:
@@ -87,6 +96,7 @@ class CrawlStats:
duration = end - start
return str(timedelta(seconds=int(duration.total_seconds())))
class DisplayMode(Enum):
DETAILED = "DETAILED"
AGGREGATED = "AGGREGATED"