feat(monitor): add real-time crawler monitoring system with memory management

Implements a comprehensive monitoring and visualization system for tracking web crawler operations in real time. The system includes:

- Terminal-based dashboard with rich UI for displaying task statuses
- Memory pressure monitoring and adaptive dispatch control
- Queue statistics and performance metrics tracking
- Detailed task progress visualization
- Stress testing framework for memory management

This addition helps operators track crawler performance and manage memory usage more effectively.
2025-03-12 19:05:24 +08:00
parent 9547bada3a
commit 1630fbdafe
8 changed files with 1956 additions and 321 deletions
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -4,17 +4,15 @@ from .models import (
    CrawlResult,
    CrawlerTaskResult,
    CrawlStatus,
-    DisplayMode,
-    CrawlStats,
    DomainState,
 )

-from rich.live import Live
-from rich.table import Table
-from rich.console import Console
-from rich import box
-from datetime import timedelta, datetime
+from .components.crawler_monitor import CrawlerMonitor
+
+from .types import AsyncWebCrawler
+
 from collections.abc import AsyncGenerator
+
 import time
 import psutil
 import asyncio
@@ -24,8 +22,6 @@ from urllib.parse import urlparse
 import random
 from abc import ABC, abstractmethod

-from math import inf as infinity
-

 class RateLimiter:
    def __init__(
@@ -87,201 +83,6 @@ class RateLimiter:
        return True


-class CrawlerMonitor:
-    def __init__(
-        self,
-        max_visible_rows: int = 15,
-        display_mode: DisplayMode = DisplayMode.DETAILED,
-    ):
-        self.console = Console()
-        self.max_visible_rows = max_visible_rows
-        self.display_mode = display_mode
-        self.stats: Dict[str, CrawlStats] = {}
-        self.process = psutil.Process()
-        self.start_time = time.time()
-        self.live = Live(self._create_table(), refresh_per_second=2)
-
-    def start(self):
-        self.live.start()
-
-    def stop(self):
-        self.live.stop()
-
-    def add_task(self, task_id: str, url: str):
-        self.stats[task_id] = CrawlStats(
-            task_id=task_id, url=url, status=CrawlStatus.QUEUED
-        )
-        self.live.update(self._create_table())
-
-    def update_task(self, task_id: str, **kwargs):
-        if task_id in self.stats:
-            for key, value in kwargs.items():
-                setattr(self.stats[task_id], key, value)
-            self.live.update(self._create_table())
-
-    def _create_aggregated_table(self) -> Table:
-        """Creates a compact table showing only aggregated statistics"""
-        table = Table(
-            box=box.ROUNDED,
-            title="Crawler Status Overview",
-            title_style="bold magenta",
-            header_style="bold blue",
-            show_lines=True,
-        )
-
-        # Calculate statistics
-        total_tasks = len(self.stats)
-        queued = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
-        )
-        in_progress = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
-        )
-        completed = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
-        )
-        failed = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
-        )
-
-        # Memory statistics
-        current_memory = self.process.memory_info().rss / (1024 * 1024)
-        total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
-        peak_memory = max(
-            (stat.peak_memory for stat in self.stats.values()), default=0.0
-        )
-
-        # Duration
-        duration = time.time() - self.start_time
-
-        # Create status row
-        table.add_column("Status", style="bold cyan")
-        table.add_column("Count", justify="right")
-        table.add_column("Percentage", justify="right")
-
-        table.add_row("Total Tasks", str(total_tasks), "100%")
-        table.add_row(
-            "[yellow]In Queue[/yellow]",
-            str(queued),
-            f"{(queued / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
-        )
-        table.add_row(
-            "[blue]In Progress[/blue]",
-            str(in_progress),
-            f"{(in_progress / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
-        )
-        table.add_row(
-            "[green]Completed[/green]",
-            str(completed),
-            f"{(completed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
-        )
-        table.add_row(
-            "[red]Failed[/red]",
-            str(failed),
-            f"{(failed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
-        )
-
-        # Add memory information
-        table.add_section()
-        table.add_row(
-            "[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
-        )
-        table.add_row(
-            "[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
-        )
-        table.add_row(
-            "[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
-        )
-        table.add_row(
-            "[yellow]Runtime[/yellow]",
-            str(timedelta(seconds=int(duration))),
-            "",
-        )
-
-        return table
-
-    def _create_detailed_table(self) -> Table:
-        table = Table(
-            box=box.ROUNDED,
-            title="Crawler Performance Monitor",
-            title_style="bold magenta",
-            header_style="bold blue",
-        )
-
-        # Add columns
-        table.add_column("Task ID", style="cyan", no_wrap=True)
-        table.add_column("URL", style="cyan", no_wrap=True)
-        table.add_column("Status", style="bold")
-        table.add_column("Memory (MB)", justify="right")
-        table.add_column("Peak (MB)", justify="right")
-        table.add_column("Duration", justify="right")
-        table.add_column("Info", style="italic")
-
-        # Add summary row
-        total_memory = sum(stat.memory_usage for stat in self.stats.values())
-        active_count = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
-        )
-        completed_count = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
-        )
-        failed_count = sum(
-            1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
-        )
-
-        table.add_row(
-            "[bold yellow]SUMMARY",
-            f"Total: {len(self.stats)}",
-            f"Active: {active_count}",
-            f"{total_memory:.1f}",
-            f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
-            str(
-                timedelta(
-                    seconds=int(time.time() - self.start_time)
-                )
-            ),
-            f"✓{completed_count} ✗{failed_count}",
-            style="bold",
-        )
-
-        table.add_section()
-
-        # Add rows for each task
-        visible_stats = sorted(
-            self.stats.values(),
-            key=lambda x: (
-                x.status != CrawlStatus.IN_PROGRESS,
-                x.status != CrawlStatus.QUEUED,
-                x.end_time or infinity,
-            ),
-        )[: self.max_visible_rows]
-
-        for stat in visible_stats:
-            status_style = {
-                CrawlStatus.QUEUED: "white",
-                CrawlStatus.IN_PROGRESS: "yellow",
-                CrawlStatus.COMPLETED: "green",
-                CrawlStatus.FAILED: "red",
-            }[stat.status]
-
-            table.add_row(
-                stat.task_id[:8],  # Show first 8 chars of task ID
-                stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
-                f"[{status_style}]{stat.status.value}[/{status_style}]",
-                f"{stat.memory_usage:.1f}",
-                f"{stat.peak_memory:.1f}",
-                stat.duration,
-                stat.error_message[:40] if stat.error_message else "",
-            )
-
-        return table
-
-    def _create_table(self) -> Table:
-        """Creates the appropriate table based on display mode"""
-        if self.display_mode == DisplayMode.AGGREGATED:
-            return self._create_aggregated_table()
-        return self._create_detailed_table()
-

 class BaseDispatcher(ABC):
    def __init__(
@@ -309,7 +110,7 @@ class BaseDispatcher(ABC):
    async def run_urls(
        self,
        urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821
+        crawler: AsyncWebCrawler,  # noqa: F821
        config: CrawlerRunConfig,
        monitor: Optional[CrawlerMonitor] = None,
    ) -> List[CrawlerTaskResult]:
@@ -320,71 +121,144 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    def __init__(
        self,
        memory_threshold_percent: float = 90.0,
+        critical_threshold_percent: float = 95.0,  # New critical threshold
+        recovery_threshold_percent: float = 85.0,  # New recovery threshold
        check_interval: float = 1.0,
        max_session_permit: int = 20,
-        memory_wait_timeout: float = 300.0,  # 5 minutes default timeout
+        fairness_timeout: float = 600.0,  # 10 minutes before prioritizing long-waiting URLs
        rate_limiter: Optional[RateLimiter] = None,
        monitor: Optional[CrawlerMonitor] = None,
    ):
        super().__init__(rate_limiter, monitor)
        self.memory_threshold_percent = memory_threshold_percent
+        self.critical_threshold_percent = critical_threshold_percent
+        self.recovery_threshold_percent = recovery_threshold_percent
        self.check_interval = check_interval
        self.max_session_permit = max_session_permit
-        self.memory_wait_timeout = memory_wait_timeout
-        self.result_queue = asyncio.Queue()  # Queue for storing results
-
+        self.fairness_timeout = fairness_timeout
+        self.result_queue = asyncio.Queue()
+        self.task_queue = asyncio.PriorityQueue()  # Priority queue for better management
+        self.memory_pressure_mode = False  # Flag to indicate when we're in memory pressure mode
+        self.current_memory_percent = 0.0  # Track current memory usage
+        
+    async def _memory_monitor_task(self):
+        """Background task to continuously monitor memory usage and update state"""
+        while True:
+            self.current_memory_percent = psutil.virtual_memory().percent
+            
+            # Enter memory pressure mode if we cross the threshold
+            if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent:
+                self.memory_pressure_mode = True
+                if self.monitor:
+                    self.monitor.update_memory_status("PRESSURE")
+            
+            # Exit memory pressure mode if we go below recovery threshold
+            elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent:
+                self.memory_pressure_mode = False
+                if self.monitor:
+                    self.monitor.update_memory_status("NORMAL")
+            
+            # In critical mode, we might need to take more drastic action
+            if self.current_memory_percent >= self.critical_threshold_percent:
+                if self.monitor:
+                    self.monitor.update_memory_status("CRITICAL")
+                # We could implement additional memory-saving measures here
+                
+            await asyncio.sleep(self.check_interval)
+    
+    def _get_priority_score(self, wait_time: float, retry_count: int) -> float:
+        """Calculate priority score (lower is higher priority)
+        - URLs waiting longer than fairness_timeout get higher priority
+        - More retry attempts decreases priority
+        """
+        if wait_time > self.fairness_timeout:
+            # High priority for long-waiting URLs
+            return -wait_time
+        # Standard priority based on retries
+        return retry_count
+    
    async def crawl_url(
        self,
        url: str,
        config: CrawlerRunConfig,
        task_id: str,
+        retry_count: int = 0,
    ) -> CrawlerTaskResult:
        start_time = time.time()
        error_message = ""
        memory_usage = peak_memory = 0.0
-
+        
+        # Get starting memory for accurate measurement
+        process = psutil.Process()
+        start_memory = process.memory_info().rss / (1024 * 1024)
+        
        try:
            if self.monitor:
                self.monitor.update_task(
-                    task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
+                    task_id, 
+                    status=CrawlStatus.IN_PROGRESS, 
+                    start_time=start_time,
+                    retry_count=retry_count
                )
+                
            self.concurrent_sessions += 1
-
+            
            if self.rate_limiter:
                await self.rate_limiter.wait_if_needed(url)
-
-            process = psutil.Process()
-            start_memory = process.memory_info().rss / (1024 * 1024)
+                
+            # Check if we're in critical memory state
+            if self.current_memory_percent >= self.critical_threshold_percent:
+                # Requeue this task with increased priority and retry count
+                enqueue_time = time.time()
+                priority = self._get_priority_score(enqueue_time - start_time, retry_count + 1)
+                await self.task_queue.put((priority, (url, task_id, retry_count + 1, enqueue_time)))
+                
+                # Update monitoring
+                if self.monitor:
+                    self.monitor.update_task(
+                        task_id,
+                        status=CrawlStatus.QUEUED,
+                        error_message="Requeued due to critical memory pressure"
+                    )
+                
+                # Return placeholder result with requeued status
+                return CrawlerTaskResult(
+                    task_id=task_id,
+                    url=url,
+                    result=CrawlResult(
+                        url=url, html="", metadata={"status": "requeued"}, 
+                        success=False, error_message="Requeued due to critical memory pressure"
+                    ),
+                    memory_usage=0,
+                    peak_memory=0,
+                    start_time=start_time,
+                    end_time=time.time(),
+                    error_message="Requeued due to critical memory pressure",
+                    retry_count=retry_count + 1
+                )
+            
+            # Execute the crawl
            result = await self.crawler.arun(url, config=config, session_id=task_id)
+            
+            # Measure memory usage
            end_memory = process.memory_info().rss / (1024 * 1024)
-
            memory_usage = peak_memory = end_memory - start_memory
-
+            
+            # Handle rate limiting
            if self.rate_limiter and result.status_code:
                if not self.rate_limiter.update_delay(url, result.status_code):
                    error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
                    if self.monitor:
                        self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
-                    result = CrawlerTaskResult(
-                        task_id=task_id,
-                        url=url,
-                        result=result,
-                        memory_usage=memory_usage,
-                        peak_memory=peak_memory,
-                        start_time=start_time,
-                        end_time=time.time(),
-                        error_message=error_message,
-                    )
-                    await self.result_queue.put(result)
-                    return result
-
+                        
+            # Update status based on result
            if not result.success:
                error_message = result.error_message
                if self.monitor:
                    self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
            elif self.monitor:
                self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
-
+                
        except Exception as e:
            error_message = str(e)
            if self.monitor:
@@ -392,7 +266,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
            result = CrawlResult(
                url=url, html="", metadata={}, success=False, error_message=str(e)
            )
-
+            
        finally:
            end_time = time.time()
            if self.monitor:
@@ -402,9 +276,10 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
                    memory_usage=memory_usage,
                    peak_memory=peak_memory,
                    error_message=error_message,
+                    retry_count=retry_count
                )
            self.concurrent_sessions -= 1
-
+            
        return CrawlerTaskResult(
            task_id=task_id,
            url=url,
@@ -414,116 +289,240 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
            start_time=start_time,
            end_time=end_time,
            error_message=error_message,
+            retry_count=retry_count
        )
-
+        
    async def run_urls(
        self,
        urls: List[str],
-        crawler: "AsyncWebCrawler",  # noqa: F821
+        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
    ) -> List[CrawlerTaskResult]:
        self.crawler = crawler
-
+        
+        # Start the memory monitor task
+        memory_monitor = asyncio.create_task(self._memory_monitor_task())
+        
        if self.monitor:
            self.monitor.start()
-
+            
+        results = []
+        
        try:
-            pending_tasks = []
-            active_tasks = []
-            task_queue = []
-
-            for url in urls:
-                task_id = str(uuid.uuid4())
-                if self.monitor:
-                    self.monitor.add_task(task_id, url)
-                task_queue.append((url, task_id))
-
-            while task_queue or active_tasks:
-                wait_start_time = time.time()
-                while len(active_tasks) < self.max_session_permit and task_queue:
-                    if psutil.virtual_memory().percent >= self.memory_threshold_percent:
-                        # Check if we've exceeded the timeout
-                        if time.time() - wait_start_time > self.memory_wait_timeout:
-                            raise MemoryError(
-                                f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
-                            )
-                        await asyncio.sleep(self.check_interval)
-                        continue
-
-                    url, task_id = task_queue.pop(0)
-                    task = asyncio.create_task(self.crawl_url(url, config, task_id))
-                    active_tasks.append(task)
-
-                if not active_tasks:
-                    await asyncio.sleep(self.check_interval)
-                    continue
-
-                done, pending = await asyncio.wait(
-                    active_tasks, return_when=asyncio.FIRST_COMPLETED
-                )
-
-                pending_tasks.extend(done)
-                active_tasks = list(pending)
-
-            return await asyncio.gather(*pending_tasks)
-        finally:
-            if self.monitor:
-                self.monitor.stop()
-
-    async def run_urls_stream(
-        self,
-        urls: List[str],
-        crawler: "AsyncWebCrawler", # noqa: F821
-        config: CrawlerRunConfig,
-    ) -> AsyncGenerator[CrawlerTaskResult, None]:
-        self.crawler = crawler
-        if self.monitor:
-            self.monitor.start()
-
-        try:
-            active_tasks = []
-            task_queue = []
-            completed_count = 0
-            total_urls = len(urls)
-
            # Initialize task queue
            for url in urls:
                task_id = str(uuid.uuid4())
                if self.monitor:
                    self.monitor.add_task(task_id, url)
-                task_queue.append((url, task_id))
-
-            while completed_count < total_urls:
-                # Start new tasks if memory permits
-                while len(active_tasks) < self.max_session_permit and task_queue:
-                    if psutil.virtual_memory().percent >= self.memory_threshold_percent:
-                        await asyncio.sleep(self.check_interval)
-                        continue
-
-                    url, task_id = task_queue.pop(0)
-                    task = asyncio.create_task(self.crawl_url(url, config, task_id))
-                    active_tasks.append(task)
-
-                if not active_tasks and not task_queue:
-                    break
-
-                # Wait for any task to complete and yield results
+                # Add to queue with initial priority 0, retry count 0, and current time
+                await self.task_queue.put((0, (url, task_id, 0, time.time())))
+                
+            active_tasks = []
+            
+            # Process until both queues are empty
+            while not self.task_queue.empty() or active_tasks:
+                # If memory pressure is low, start new tasks
+                if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
+                    try:
+                        # Try to get a task with timeout to avoid blocking indefinitely
+                        priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
+                            self.task_queue.get(), timeout=0.1
+                        )
+                        
+                        # Create and start the task
+                        task = asyncio.create_task(
+                            self.crawl_url(url, config, task_id, retry_count)
+                        )
+                        active_tasks.append(task)
+                        
+                        # Update waiting time in monitor
+                        if self.monitor:
+                            wait_time = time.time() - enqueue_time
+                            self.monitor.update_task(
+                                task_id, 
+                                wait_time=wait_time,
+                                status=CrawlStatus.IN_PROGRESS
+                            )
+                            
+                    except asyncio.TimeoutError:
+                        # No tasks in queue, that's fine
+                        pass
+                        
+                # Wait for completion even if queue is starved
                if active_tasks:
                    done, pending = await asyncio.wait(
                        active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
                    )
+                    
+                    # Process completed tasks
                    for completed_task in done:
                        result = await completed_task
-                        completed_count += 1
-                        yield result
+                        results.append(result)
+                        
+                    # Update active tasks list
                    active_tasks = list(pending)
                else:
-                    await asyncio.sleep(self.check_interval)
+                    # If no active tasks but still waiting, sleep briefly
+                    await asyncio.sleep(self.check_interval / 2)
+                    
+                # Update priorities for waiting tasks if needed
+                await self._update_queue_priorities()
+                
+            return results

+        except Exception as e:
+            if self.monitor:
+                self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")                
+        
        finally:
+            # Clean up
+            memory_monitor.cancel()
            if self.monitor:
                self.monitor.stop()
-
+                
+    async def _update_queue_priorities(self):
+        """Periodically update priorities of items in the queue to prevent starvation"""
+        # Skip if queue is empty
+        if self.task_queue.empty():
+            return
+            
+        # Use a drain-and-refill approach to update all priorities
+        temp_items = []
+        
+        # Drain the queue (with a safety timeout to prevent blocking)
+        try:
+            drain_start = time.time()
+            while not self.task_queue.empty() and time.time() - drain_start < 5.0:  # 5 second safety timeout
+                try:
+                    # Get item from queue with timeout
+                    priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
+                        self.task_queue.get(), timeout=0.1
+                    )
+                    
+                    # Calculate new priority based on current wait time
+                    current_time = time.time()
+                    wait_time = current_time - enqueue_time
+                    new_priority = self._get_priority_score(wait_time, retry_count)
+                    
+                    # Store with updated priority
+                    temp_items.append((new_priority, (url, task_id, retry_count, enqueue_time)))
+                    
+                    # Update monitoring stats for this task
+                    if self.monitor and task_id in self.monitor.stats:
+                        self.monitor.update_task(task_id, wait_time=wait_time)
+                        
+                except asyncio.TimeoutError:
+                    # Queue might be empty or very slow
+                    break
+        except Exception as e:
+            # If anything goes wrong, make sure we refill the queue with what we've got
+            self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
+        
+        # Calculate queue statistics
+        if temp_items and self.monitor:
+            total_queued = len(temp_items)
+            wait_times = [item[1][3] for item in temp_items]
+            highest_wait_time = time.time() - min(wait_times) if wait_times else 0
+            avg_wait_time = sum(time.time() - t for t in wait_times) / len(wait_times) if wait_times else 0
+            
+            # Update queue statistics in monitor
+            self.monitor.update_queue_statistics(
+                total_queued=total_queued,
+                highest_wait_time=highest_wait_time,
+                avg_wait_time=avg_wait_time
+            )
+        
+        # Sort by priority (lowest number = highest priority)
+        temp_items.sort(key=lambda x: x[0])
+        
+        # Refill the queue with updated priorities
+        for item in temp_items:
+            await self.task_queue.put(item)
+                
+    async def run_urls_stream(
+        self,
+        urls: List[str],
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> AsyncGenerator[CrawlerTaskResult, None]:
+        self.crawler = crawler
+        
+        # Start the memory monitor task
+        memory_monitor = asyncio.create_task(self._memory_monitor_task())
+        
+        if self.monitor:
+            self.monitor.start()
+            
+        try:
+            # Initialize task queue
+            for url in urls:
+                task_id = str(uuid.uuid4())
+                if self.monitor:
+                    self.monitor.add_task(task_id, url)
+                # Add to queue with initial priority 0, retry count 0, and current time
+                await self.task_queue.put((0, (url, task_id, 0, time.time())))
+                
+            active_tasks = []
+            completed_count = 0
+            total_urls = len(urls)
+            
+            while completed_count < total_urls:
+                # If memory pressure is low, start new tasks
+                if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
+                    try:
+                        # Try to get a task with timeout
+                        priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
+                            self.task_queue.get(), timeout=0.1
+                        )
+                        
+                        # Create and start the task
+                        task = asyncio.create_task(
+                            self.crawl_url(url, config, task_id, retry_count)
+                        )
+                        active_tasks.append(task)
+                        
+                        # Update waiting time in monitor
+                        if self.monitor:
+                            wait_time = time.time() - enqueue_time
+                            self.monitor.update_task(
+                                task_id, 
+                                wait_time=wait_time,
+                                status=CrawlStatus.IN_PROGRESS
+                            )
+                            
+                    except asyncio.TimeoutError:
+                        # No tasks in queue, that's fine
+                        pass
+                        
+                # Process completed tasks and yield results
+                if active_tasks:
+                    done, pending = await asyncio.wait(
+                        active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
+                    )
+                    
+                    for completed_task in done:
+                        result = await completed_task
+                        
+                        # Only count as completed if it wasn't requeued
+                        if "requeued" not in result.error_message:
+                            completed_count += 1
+                            yield result
+                        
+                    # Update active tasks list
+                    active_tasks = list(pending)
+                else:
+                    # If no active tasks but still waiting, sleep briefly
+                    await asyncio.sleep(self.check_interval / 2)
+                
+                # Update priorities for waiting tasks if needed
+                await self._update_queue_priorities()
+                
+        finally:
+            # Clean up
+            memory_monitor.cancel()
+            if self.monitor:
+                self.monitor.stop()
+                

 class SemaphoreDispatcher(BaseDispatcher):
    def __init__(
@@ -620,7 +619,7 @@ class SemaphoreDispatcher(BaseDispatcher):

    async def run_urls(
        self,
-        crawler: "AsyncWebCrawler",  # noqa: F821
+        crawler: AsyncWebCrawler,  # noqa: F821
        urls: List[str],
        config: CrawlerRunConfig,
    ) -> List[CrawlerTaskResult]:
@@ -644,4 +643,4 @@ class SemaphoreDispatcher(BaseDispatcher):
            return await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            if self.monitor:
-                self.monitor.stop()
+                self.monitor.stop()