feat(monitor): add real-time crawler monitoring system with memory management

Implements a comprehensive monitoring and visualization system for tracking web crawler operations in real-time. The system includes:
- Terminal-based dashboard with rich UI for displaying task statuses
- Memory pressure monitoring and adaptive dispatch control
- Queue statistics and performance metrics tracking
- Detailed task progress visualization
- Stress testing framework for memory management

This addition helps operators track crawler performance and manage memory usage more effectively.
This commit is contained in:
UncleCode
2025-03-12 19:05:24 +08:00
parent 9547bada3a
commit 1630fbdafe
8 changed files with 1956 additions and 321 deletions

View File

@@ -4,17 +4,15 @@ from .models import (
CrawlResult,
CrawlerTaskResult,
CrawlStatus,
DisplayMode,
CrawlStats,
DomainState,
)
from rich.live import Live
from rich.table import Table
from rich.console import Console
from rich import box
from datetime import timedelta, datetime
from .components.crawler_monitor import CrawlerMonitor
from .types import AsyncWebCrawler
from collections.abc import AsyncGenerator
import time
import psutil
import asyncio
@@ -24,8 +22,6 @@ from urllib.parse import urlparse
import random
from abc import ABC, abstractmethod
from math import inf as infinity
class RateLimiter:
def __init__(
@@ -87,201 +83,6 @@ class RateLimiter:
return True
class CrawlerMonitor:
    """Live terminal dashboard for crawl tasks, rendered with ``rich``.

    Per-task statistics live in ``self.stats`` keyed by task id. Adding or
    updating a task re-renders the table shown by the ``rich`` ``Live``
    display. ``display_mode`` selects the compact aggregated view or the
    per-task detailed view.
    """

    def __init__(
        self,
        max_visible_rows: int = 15,
        display_mode: DisplayMode = DisplayMode.DETAILED,
    ):
        """Set up the console, statistics store and live display.

        Args:
            max_visible_rows: Maximum number of task rows in the detailed view.
            display_mode: ``DisplayMode.AGGREGATED`` for the summary-only
                table; any other value renders the detailed table.
        """
        self.console = Console()
        self.max_visible_rows = max_visible_rows
        self.display_mode = display_mode
        self.stats: Dict[str, CrawlStats] = {}
        self.process = psutil.Process()
        self.start_time = time.time()
        # Built last: rendering the first table reads the attributes above.
        self.live = Live(self._create_table(), refresh_per_second=2)

    def start(self):
        """Start the live display."""
        self.live.start()

    def stop(self):
        """Stop the live display."""
        self.live.stop()

    def add_task(self, task_id: str, url: str):
        """Register a new task in the QUEUED state and refresh the display."""
        self.stats[task_id] = CrawlStats(
            task_id=task_id, url=url, status=CrawlStatus.QUEUED
        )
        self.live.update(self._create_table())

    def update_task(self, task_id: str, **kwargs):
        """Set the given attributes on a known task and refresh the display.

        Unknown task ids are ignored.
        """
        if task_id not in self.stats:
            return
        stat = self.stats[task_id]
        for key, value in kwargs.items():
            setattr(stat, key, value)
        self.live.update(self._create_table())

    def _count_status(self, status) -> int:
        """Return how many tracked tasks currently have ``status``."""
        return sum(1 for stat in self.stats.values() if stat.status == status)

    def _create_aggregated_table(self) -> Table:
        """Creates a compact table showing only aggregated statistics"""
        table = Table(
            box=box.ROUNDED,
            title="Crawler Status Overview",
            title_style="bold magenta",
            header_style="bold blue",
            show_lines=True,
        )

        total_tasks = len(self.stats)

        def percentage(count: int) -> str:
            # Avoid dividing by zero before any task has been registered.
            if total_tasks > 0:
                return f"{(count / total_tasks * 100):.1f}%"
            return "0%"

        # Memory statistics (process RSS converted from bytes to MB).
        current_memory = self.process.memory_info().rss / (1024 * 1024)
        total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
        peak_memory = max(
            (stat.peak_memory for stat in self.stats.values()), default=0.0
        )

        duration = time.time() - self.start_time

        table.add_column("Status", style="bold cyan")
        table.add_column("Count", justify="right")
        table.add_column("Percentage", justify="right")

        table.add_row("Total Tasks", str(total_tasks), "100%")
        status_rows = (
            ("[yellow]In Queue[/yellow]", CrawlStatus.QUEUED),
            ("[blue]In Progress[/blue]", CrawlStatus.IN_PROGRESS),
            ("[green]Completed[/green]", CrawlStatus.COMPLETED),
            ("[red]Failed[/red]", CrawlStatus.FAILED),
        )
        for label, status in status_rows:
            count = self._count_status(status)
            table.add_row(label, str(count), percentage(count))

        # Memory and runtime go in their own section below the counts.
        table.add_section()
        table.add_row(
            "[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
        )
        table.add_row(
            "[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
        )
        table.add_row(
            "[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
        )
        table.add_row(
            "[yellow]Runtime[/yellow]",
            str(timedelta(seconds=int(duration))),
            "",
        )
        return table

    def _create_detailed_table(self) -> Table:
        """Create the per-task table: one summary row, then up to ``max_visible_rows`` tasks."""
        table = Table(
            box=box.ROUNDED,
            title="Crawler Performance Monitor",
            title_style="bold magenta",
            header_style="bold blue",
        )

        table.add_column("Task ID", style="cyan", no_wrap=True)
        table.add_column("URL", style="cyan", no_wrap=True)
        table.add_column("Status", style="bold")
        table.add_column("Memory (MB)", justify="right")
        table.add_column("Peak (MB)", justify="right")
        table.add_column("Duration", justify="right")
        table.add_column("Info", style="italic")

        # Summary row
        total_memory = sum(stat.memory_usage for stat in self.stats.values())
        active_count = self._count_status(CrawlStatus.IN_PROGRESS)
        completed_count = self._count_status(CrawlStatus.COMPLETED)
        failed_count = self._count_status(CrawlStatus.FAILED)

        table.add_row(
            "[bold yellow]SUMMARY",
            f"Total: {len(self.stats)}",
            f"Active: {active_count}",
            f"{total_memory:.1f}",
            f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
            str(timedelta(seconds=int(time.time() - self.start_time))),
            # Label and separate the two counts; printed back to back they
            # read as one number (3 completed + 12 failed looked like "312").
            f"done {completed_count} / failed {failed_count}",
            style="bold",
        )
        table.add_section()

        # In-progress tasks first, then queued ones, then the rest ordered by
        # end time; only the first ``max_visible_rows`` are shown.
        # NOTE(review): assumes ``end_time`` is comparable with float ``inf``
        # - confirm the type stored by callers.
        visible_stats = sorted(
            self.stats.values(),
            key=lambda x: (
                x.status != CrawlStatus.IN_PROGRESS,
                x.status != CrawlStatus.QUEUED,
                x.end_time or infinity,
            ),
        )[: self.max_visible_rows]

        # Built once instead of once per row.
        status_styles = {
            CrawlStatus.QUEUED: "white",
            CrawlStatus.IN_PROGRESS: "yellow",
            CrawlStatus.COMPLETED: "green",
            CrawlStatus.FAILED: "red",
        }

        for stat in visible_stats:
            status_style = status_styles[stat.status]
            table.add_row(
                stat.task_id[:8],  # Show first 8 chars of task ID
                stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
                f"[{status_style}]{stat.status.value}[/{status_style}]",
                f"{stat.memory_usage:.1f}",
                f"{stat.peak_memory:.1f}",
                stat.duration,
                stat.error_message[:40] if stat.error_message else "",
            )
        return table

    def _create_table(self) -> Table:
        """Creates the appropriate table based on display mode"""
        if self.display_mode == DisplayMode.AGGREGATED:
            return self._create_aggregated_table()
        return self._create_detailed_table()
class BaseDispatcher(ABC):
def __init__(
@@ -309,7 +110,7 @@ class BaseDispatcher(ABC):
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler, # noqa: F821
config: CrawlerRunConfig,
monitor: Optional[CrawlerMonitor] = None,
) -> List[CrawlerTaskResult]:
@@ -320,71 +121,144 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
def __init__(
    self,
    memory_threshold_percent: float = 90.0,
    critical_threshold_percent: float = 95.0,  # New critical threshold
    recovery_threshold_percent: float = 85.0,  # New recovery threshold
    check_interval: float = 1.0,
    max_session_permit: int = 20,
    memory_wait_timeout: float = 300.0,  # 5 minutes default timeout
    fairness_timeout: float = 600.0,  # 10 minutes before prioritizing long-waiting URLs
    rate_limiter: Optional[RateLimiter] = None,
    monitor: Optional[CrawlerMonitor] = None,
):
    """Configure the memory-adaptive dispatcher.

    Args:
        memory_threshold_percent: System memory usage (percent) at or above
            which the dispatcher enters memory-pressure mode.
        critical_threshold_percent: Usage at or above which the state is
            treated as critical.
        recovery_threshold_percent: Usage at or below which memory-pressure
            mode is left again.
        check_interval: Seconds between memory checks.
        max_session_permit: Maximum number of concurrently active crawl tasks.
        memory_wait_timeout: Seconds; stored on the instance.
        fairness_timeout: Seconds a URL may wait before its priority is
            raised above retry-based ordering.
        rate_limiter: Optional rate limiter passed to the base dispatcher.
        monitor: Optional monitor passed to the base dispatcher.
    """
    super().__init__(rate_limiter, monitor)
    self.memory_threshold_percent = memory_threshold_percent
    self.critical_threshold_percent = critical_threshold_percent
    self.recovery_threshold_percent = recovery_threshold_percent
    self.check_interval = check_interval
    self.max_session_permit = max_session_permit
    self.memory_wait_timeout = memory_wait_timeout
    self.fairness_timeout = fairness_timeout
    # Assigned once; the earlier duplicate assignment only created a queue
    # that was immediately discarded.
    self.result_queue = asyncio.Queue()  # Queue for storing results
    self.task_queue = asyncio.PriorityQueue()  # Priority queue for better management
    self.memory_pressure_mode = False  # Flag to indicate when we're in memory pressure mode
    self.current_memory_percent = 0.0  # Track current memory usage
async def _memory_monitor_task(self):
    """Background task to continuously monitor memory usage and update state.

    Every ``check_interval`` seconds the system memory percentage is stored
    in ``current_memory_percent`` and ``memory_pressure_mode`` is switched
    on at ``memory_threshold_percent`` and off again at
    ``recovery_threshold_percent``. The monitor, if any, is told whenever the
    resulting status ("NORMAL", "PRESSURE" or "CRITICAL") changes.

    Runs until cancelled.
    """
    # Status the monitor is assumed to be showing; starting from the current
    # flag means nothing is reported until something actually changes.
    reported_status = "PRESSURE" if self.memory_pressure_mode else "NORMAL"

    while True:
        usage = psutil.virtual_memory().percent
        self.current_memory_percent = usage

        # Enter pressure mode at the threshold and leave it only at the lower
        # recovery threshold, so the flag does not flap around a single value.
        if not self.memory_pressure_mode and usage >= self.memory_threshold_percent:
            self.memory_pressure_mode = True
        elif self.memory_pressure_mode and usage <= self.recovery_threshold_percent:
            self.memory_pressure_mode = False

        if usage >= self.critical_threshold_percent:
            # We could implement additional memory-saving measures here
            status = "CRITICAL"
        elif self.memory_pressure_mode:
            status = "PRESSURE"
        else:
            status = "NORMAL"

        # Report transitions, including CRITICAL -> PRESSURE. Previously the
        # monitor kept showing "CRITICAL" until memory recovered completely.
        if status != reported_status:
            if self.monitor:
                self.monitor.update_memory_status(status)
            reported_status = status

        await asyncio.sleep(self.check_interval)
def _get_priority_score(self, wait_time: float, retry_count: int) -> float:
"""Calculate priority score (lower is higher priority)
- URLs waiting longer than fairness_timeout get higher priority
- More retry attempts decreases priority
"""
if wait_time > self.fairness_timeout:
# High priority for long-waiting URLs
return -wait_time
# Standard priority based on retries
return retry_count
async def crawl_url(
self,
url: str,
config: CrawlerRunConfig,
task_id: str,
retry_count: int = 0,
) -> CrawlerTaskResult:
start_time = time.time()
error_message = ""
memory_usage = peak_memory = 0.0
# Get starting memory for accurate measurement
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024)
try:
if self.monitor:
self.monitor.update_task(
task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=start_time,
retry_count=retry_count
)
self.concurrent_sessions += 1
if self.rate_limiter:
await self.rate_limiter.wait_if_needed(url)
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024)
# Check if we're in critical memory state
if self.current_memory_percent >= self.critical_threshold_percent:
# Requeue this task with increased priority and retry count
enqueue_time = time.time()
priority = self._get_priority_score(enqueue_time - start_time, retry_count + 1)
await self.task_queue.put((priority, (url, task_id, retry_count + 1, enqueue_time)))
# Update monitoring
if self.monitor:
self.monitor.update_task(
task_id,
status=CrawlStatus.QUEUED,
error_message="Requeued due to critical memory pressure"
)
# Return placeholder result with requeued status
return CrawlerTaskResult(
task_id=task_id,
url=url,
result=CrawlResult(
url=url, html="", metadata={"status": "requeued"},
success=False, error_message="Requeued due to critical memory pressure"
),
memory_usage=0,
peak_memory=0,
start_time=start_time,
end_time=time.time(),
error_message="Requeued due to critical memory pressure",
retry_count=retry_count + 1
)
# Execute the crawl
result = await self.crawler.arun(url, config=config, session_id=task_id)
# Measure memory usage
end_memory = process.memory_info().rss / (1024 * 1024)
memory_usage = peak_memory = end_memory - start_memory
# Handle rate limiting
if self.rate_limiter and result.status_code:
if not self.rate_limiter.update_delay(url, result.status_code):
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
result = CrawlerTaskResult(
task_id=task_id,
url=url,
result=result,
memory_usage=memory_usage,
peak_memory=peak_memory,
start_time=start_time,
end_time=time.time(),
error_message=error_message,
)
await self.result_queue.put(result)
return result
# Update status based on result
if not result.success:
error_message = result.error_message
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
elif self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
except Exception as e:
error_message = str(e)
if self.monitor:
@@ -392,7 +266,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
result = CrawlResult(
url=url, html="", metadata={}, success=False, error_message=str(e)
)
finally:
end_time = time.time()
if self.monitor:
@@ -402,9 +276,10 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
memory_usage=memory_usage,
peak_memory=peak_memory,
error_message=error_message,
retry_count=retry_count
)
self.concurrent_sessions -= 1
return CrawlerTaskResult(
task_id=task_id,
url=url,
@@ -414,116 +289,240 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
start_time=start_time,
end_time=end_time,
error_message=error_message,
retry_count=retry_count
)
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
self.crawler = crawler
# Start the memory monitor task
memory_monitor = asyncio.create_task(self._memory_monitor_task())
if self.monitor:
self.monitor.start()
results = []
try:
pending_tasks = []
active_tasks = []
task_queue = []
for url in urls:
task_id = str(uuid.uuid4())
if self.monitor:
self.monitor.add_task(task_id, url)
task_queue.append((url, task_id))
while task_queue or active_tasks:
wait_start_time = time.time()
while len(active_tasks) < self.max_session_permit and task_queue:
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
# Check if we've exceeded the timeout
if time.time() - wait_start_time > self.memory_wait_timeout:
raise MemoryError(
f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
)
await asyncio.sleep(self.check_interval)
continue
url, task_id = task_queue.pop(0)
task = asyncio.create_task(self.crawl_url(url, config, task_id))
active_tasks.append(task)
if not active_tasks:
await asyncio.sleep(self.check_interval)
continue
done, pending = await asyncio.wait(
active_tasks, return_when=asyncio.FIRST_COMPLETED
)
pending_tasks.extend(done)
active_tasks = list(pending)
return await asyncio.gather(*pending_tasks)
finally:
if self.monitor:
self.monitor.stop()
async def run_urls_stream(
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlerTaskResult, None]:
self.crawler = crawler
if self.monitor:
self.monitor.start()
try:
active_tasks = []
task_queue = []
completed_count = 0
total_urls = len(urls)
# Initialize task queue
for url in urls:
task_id = str(uuid.uuid4())
if self.monitor:
self.monitor.add_task(task_id, url)
task_queue.append((url, task_id))
while completed_count < total_urls:
# Start new tasks if memory permits
while len(active_tasks) < self.max_session_permit and task_queue:
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
await asyncio.sleep(self.check_interval)
continue
url, task_id = task_queue.pop(0)
task = asyncio.create_task(self.crawl_url(url, config, task_id))
active_tasks.append(task)
if not active_tasks and not task_queue:
break
# Wait for any task to complete and yield results
# Add to queue with initial priority 0, retry count 0, and current time
await self.task_queue.put((0, (url, task_id, 0, time.time())))
active_tasks = []
# Process until both queues are empty
while not self.task_queue.empty() or active_tasks:
# If memory pressure is low, start new tasks
if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
try:
# Try to get a task with timeout to avoid blocking indefinitely
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
self.task_queue.get(), timeout=0.1
)
# Create and start the task
task = asyncio.create_task(
self.crawl_url(url, config, task_id, retry_count)
)
active_tasks.append(task)
# Update waiting time in monitor
if self.monitor:
wait_time = time.time() - enqueue_time
self.monitor.update_task(
task_id,
wait_time=wait_time,
status=CrawlStatus.IN_PROGRESS
)
except asyncio.TimeoutError:
# No tasks in queue, that's fine
pass
# Wait for completion even if queue is starved
if active_tasks:
done, pending = await asyncio.wait(
active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
)
# Process completed tasks
for completed_task in done:
result = await completed_task
completed_count += 1
yield result
results.append(result)
# Update active tasks list
active_tasks = list(pending)
else:
await asyncio.sleep(self.check_interval)
# If no active tasks but still waiting, sleep briefly
await asyncio.sleep(self.check_interval / 2)
# Update priorities for waiting tasks if needed
await self._update_queue_priorities()
return results
except Exception as e:
if self.monitor:
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
finally:
# Clean up
memory_monitor.cancel()
if self.monitor:
self.monitor.stop()
async def _update_queue_priorities(self):
"""Periodically update priorities of items in the queue to prevent starvation"""
# Skip if queue is empty
if self.task_queue.empty():
return
# Use a drain-and-refill approach to update all priorities
temp_items = []
# Drain the queue (with a safety timeout to prevent blocking)
try:
drain_start = time.time()
while not self.task_queue.empty() and time.time() - drain_start < 5.0: # 5 second safety timeout
try:
# Get item from queue with timeout
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
self.task_queue.get(), timeout=0.1
)
# Calculate new priority based on current wait time
current_time = time.time()
wait_time = current_time - enqueue_time
new_priority = self._get_priority_score(wait_time, retry_count)
# Store with updated priority
temp_items.append((new_priority, (url, task_id, retry_count, enqueue_time)))
# Update monitoring stats for this task
if self.monitor and task_id in self.monitor.stats:
self.monitor.update_task(task_id, wait_time=wait_time)
except asyncio.TimeoutError:
# Queue might be empty or very slow
break
except Exception as e:
# If anything goes wrong, make sure we refill the queue with what we've got
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
# Calculate queue statistics
if temp_items and self.monitor:
total_queued = len(temp_items)
wait_times = [item[1][3] for item in temp_items]
highest_wait_time = time.time() - min(wait_times) if wait_times else 0
avg_wait_time = sum(time.time() - t for t in wait_times) / len(wait_times) if wait_times else 0
# Update queue statistics in monitor
self.monitor.update_queue_statistics(
total_queued=total_queued,
highest_wait_time=highest_wait_time,
avg_wait_time=avg_wait_time
)
# Sort by priority (lowest number = highest priority)
temp_items.sort(key=lambda x: x[0])
# Refill the queue with updated priorities
for item in temp_items:
await self.task_queue.put(item)
async def run_urls_stream(
    self,
    urls: List[str],
    crawler: AsyncWebCrawler,
    config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlerTaskResult, None]:
    """Crawl ``urls`` and yield each final task result as soon as it is ready.

    All URLs are placed on the priority queue. New tasks are started only
    while ``memory_pressure_mode`` is off and fewer than
    ``max_session_permit`` tasks are active. A result whose error message
    marks it as requeued is a placeholder: it is neither yielded nor counted,
    because the task is back on the queue and will produce a real result
    later.

    Args:
        urls: URLs to crawl.
        crawler: Crawler stored on ``self.crawler`` for the crawl tasks.
        config: Run configuration passed to every crawl task.

    Yields:
        One result per URL, in completion order.
    """
    self.crawler = crawler

    # Start the memory monitor task
    memory_monitor = asyncio.create_task(self._memory_monitor_task())

    if self.monitor:
        self.monitor.start()

    try:
        # Initialize task queue
        for url in urls:
            task_id = str(uuid.uuid4())
            if self.monitor:
                self.monitor.add_task(task_id, url)
            # Add to queue with initial priority 0, retry count 0, and current time
            await self.task_queue.put((0, (url, task_id, 0, time.time())))

        active_tasks = []
        completed_count = 0
        total_urls = len(urls)

        while completed_count < total_urls:
            # If memory pressure is low, start new tasks
            if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                try:
                    # Short timeout so an empty queue does not block the loop
                    priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
                        self.task_queue.get(), timeout=0.1
                    )

                    # Create and start the task
                    task = asyncio.create_task(
                        self.crawl_url(url, config, task_id, retry_count)
                    )
                    active_tasks.append(task)

                    # Update waiting time in monitor
                    if self.monitor:
                        wait_time = time.time() - enqueue_time
                        self.monitor.update_task(
                            task_id,
                            wait_time=wait_time,
                            status=CrawlStatus.IN_PROGRESS
                        )
                except asyncio.TimeoutError:
                    # No tasks in queue, that's fine
                    pass

            # Process completed tasks and yield results
            if active_tasks:
                done, pending = await asyncio.wait(
                    active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
                )

                for completed_task in done:
                    result = await completed_task

                    # The placeholder message is capitalised ("Requeued ..."),
                    # so the check must ignore case. The old case-sensitive
                    # test counted placeholders as completed and could end
                    # the stream before the real result arrived.
                    message = result.error_message or ""
                    if "requeued" in message.lower():
                        continue

                    completed_count += 1
                    yield result

                # Update active tasks list
                active_tasks = list(pending)
            else:
                # If no active tasks but still waiting, sleep briefly
                await asyncio.sleep(self.check_interval / 2)

            # Update priorities for waiting tasks if needed
            await self._update_queue_priorities()

    finally:
        # Clean up
        memory_monitor.cancel()
        if self.monitor:
            self.monitor.stop()
class SemaphoreDispatcher(BaseDispatcher):
def __init__(
@@ -620,7 +619,7 @@ class SemaphoreDispatcher(BaseDispatcher):
async def run_urls(
self,
crawler: "AsyncWebCrawler", # noqa: F821
crawler: AsyncWebCrawler, # noqa: F821
urls: List[str],
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
@@ -644,4 +643,4 @@ class SemaphoreDispatcher(BaseDispatcher):
return await asyncio.gather(*tasks, return_exceptions=True)
finally:
if self.monitor:
self.monitor.stop()
self.monitor.stop()