test: Add comprehensive unit tests for AsyncExecutor functionality

2024-11-13 19:46:05 +08:00
2 changed files with 960 additions and 0 deletions
--- a/crawl4ai/async_executor.py
+++ b/crawl4ai/async_executor.py
@@ -0,0 +1,741 @@
+from __future__ import annotations
+import asyncio
+import psutil
+import logging
+import time
+import sqlite3
+import aiosqlite
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Callable, Any, Set, Type
+from typing import Awaitable
+from pathlib import Path
+import json
+from datetime import datetime
+from typing import ClassVar, Type, Union
+import inspect
+
+# Imports from the crawl4ai package
+from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
+from .extraction_strategy import ExtractionStrategy
+from .models import CrawlResult
+from .config import MIN_WORD_THRESHOLD
+from .async_webcrawler import AsyncWebCrawler
+from .config import MAX_METRICS_HISTORY
+
+import logging
+
+# Configure the root logger when this module is imported: INFO level, with
+# "timestamp [LEVEL] logger-name - message" formatted records.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(name)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+# Message convention followed by the executors below (illustrative only, never executed):
+# self.logger.error(f"Executor {self.__class__.__name__}: Error message", exc_info=True)
+# self.logger.info(f"Executor {self.__class__.__name__}: Info message")
+# self.logger.warning(f"Executor {self.__class__.__name__}: Warning message")
+
+
+# Enums and Constants
+class ExecutionMode(Enum):
+    """Execution mode for the crawler executor.
+
+    The mode decides which CPU/memory thresholds and which base batch size
+    ``ResourceMonitor`` applies.
+    """
+    SPEED = "speed"        # higher resource ceilings (80% memory / 90% CPU), batches of up to 1000
+    RESOURCE = "resource"  # lower resource ceilings (40% memory / 30% CPU), batches of up to 50
+
+class TaskState(Enum):
+    """Possible states for a crawling task.
+
+    Used as the type of ``TaskMetadata.state``.
+    NOTE(review): no state transitions are performed in the code visible
+    here -- confirm where these values are assigned.
+    """
+    PENDING = "pending"
+    SCHEDULED = "scheduled"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    RETRYING = "retrying"
+
+# Types of callbacks supported by the executors
+class CallbackType(Enum):
+    """Hook points at which an executor invokes user-supplied callbacks.
+
+    The values are the keys of the ``callbacks`` mapping accepted by
+    ``ExecutorStrategy``; the expected callback signatures are listed in
+    that class's docstring.
+    """
+    PRE_EXECUTION = "pre_execution"      # Before each attempt to process a URL
+    POST_EXECUTION = "post_execution"    # After successful processing
+    ON_ERROR = "on_error"               # When an attempt (or a whole batch) raises
+    ON_RETRY = "on_retry"               # Before retrying a failed URL
+    ON_BATCH_START = "on_batch_start"   # Once per execute() call, with the full URL list, before batches run
+    ON_BATCH_END = "on_batch_end"       # After completing a batch
+    ON_COMPLETE = "on_complete"         # After all URLs are processed
+
+ 
+@dataclass
+class SystemMetrics:
+    """Point-in-time snapshot of host CPU and memory usage.
+
+    Attributes:
+        cpu_percent: Value reported by ``psutil.cpu_percent()``.
+        memory_percent: ``percent`` field of ``psutil.virtual_memory()``.
+        available_memory: ``available`` field of ``psutil.virtual_memory()``.
+        timestamp: ``time.time()`` at the moment the snapshot was taken.
+    """
+    cpu_percent: float
+    memory_percent: float
+    available_memory: int
+    timestamp: float
+
+    @classmethod
+    def capture(cls) -> 'SystemMetrics':
+        """Capture current system metrics.
+
+        Returns:
+            A new ``SystemMetrics`` describing the host right now.
+        """
+        # Query virtual memory once so that both memory fields come from the
+        # same sample (two separate calls could disagree with each other).
+        memory = psutil.virtual_memory()
+        return cls(
+            cpu_percent=psutil.cpu_percent(),
+            memory_percent=memory.percent,
+            available_memory=memory.available,
+            timestamp=time.time()
+        )
+
+@dataclass
+class TaskMetadata:
+    """Metadata for a crawling task.
+
+    NOTE(review): instances are stored in ``ExecutorStrategy.tasks`` by type
+    annotation only; nothing in the code visible here creates or updates
+    them, so the field meanings below are inferred from names -- confirm.
+    """
+    url: str                              # URL this record describes
+    state: TaskState                      # current lifecycle state
+    attempts: int = 0                     # presumably the number of attempts made so far
+    last_attempt: Optional[float] = None  # presumably a time.time() timestamp; None until first attempt
+    error: Optional[str] = None           # presumably the last error message, if any
+    result: Optional[Any] = None          # presumably the crawl result once available
+
+@dataclass
+class ExecutorMetrics:
+    """Performance and resource metrics for the executor.
+
+    Counters are plain attributes updated by the executor; the derived rates
+    are exposed as read-only properties and bundled by ``to_dict``.
+    """
+    # Performance metrics
+    total_urls: int = 0
+    completed_urls: int = 0
+    failed_urls: int = 0
+    start_time: Optional[float] = None
+    total_retries: int = 0
+    response_times: List[float] = field(default_factory=list)
+
+    # Resource metrics
+    system_metrics: List[SystemMetrics] = field(default_factory=list)
+    active_connections: int = 0
+
+    def capture_system_metrics(self):
+        """Capture system metrics and enforce the history size limit."""
+        self.system_metrics.append(SystemMetrics.capture())
+        # Trim from the front (oldest first). Removing the whole overflow in
+        # one slice also recovers when the list was already over the limit.
+        overflow = len(self.system_metrics) - MAX_METRICS_HISTORY
+        if overflow > 0:
+            del self.system_metrics[:overflow]
+
+    @property
+    def urls_per_second(self) -> float:
+        """Completed URLs per second since ``start_time`` (0.0 if not started)."""
+        if not self.start_time or not self.completed_urls:
+            return 0.0
+        duration = time.time() - self.start_time
+        # Always return a float, including for a non-positive duration
+        return self.completed_urls / duration if duration > 0 else 0.0
+
+    @property
+    def success_rate(self) -> float:
+        """Completed URLs as a percentage of ``total_urls``."""
+        if not self.total_urls:
+            return 0.0
+        return (self.completed_urls / self.total_urls) * 100
+
+    @property
+    def retry_rate(self) -> float:
+        """Retries as a percentage of ``total_urls``."""
+        if not self.total_urls:
+            return 0.0
+        return (self.total_retries / self.total_urls) * 100
+
+    @property
+    def average_response_time(self) -> float:
+        """Mean of the recorded response times in seconds (0.0 if none)."""
+        if not self.response_times:
+            return 0.0
+        return sum(self.response_times) / len(self.response_times)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert metrics to a nested ``performance`` / ``resources`` dictionary."""
+        # Resource figures come from the most recent snapshot, if there is one
+        latest = self.system_metrics[-1] if self.system_metrics else None
+        return {
+            "performance": {
+                "urls_per_second": self.urls_per_second,
+                "success_rate": self.success_rate,
+                "retry_rate": self.retry_rate,
+                "average_response_time": self.average_response_time,
+                "total_urls": self.total_urls,
+                "completed_urls": self.completed_urls,
+                "failed_urls": self.failed_urls
+            },
+            "resources": {
+                "cpu_utilization": latest.cpu_percent if latest else 0,
+                "memory_usage": latest.memory_percent if latest else 0,
+                "active_connections": self.active_connections
+            }
+        }
+
+class ResourceMonitor:
+    """Samples host resource usage and derives batch sizes from the headroom."""
+
+    def __init__(self, mode: ExecutionMode):
+        self.mode = mode
+        self.metrics_history: List[SystemMetrics] = []
+        self._setup_thresholds()
+
+    def _setup_thresholds(self):
+        """Choose the memory and CPU ceilings (in percent) for the configured mode."""
+        speed_mode = self.mode == ExecutionMode.SPEED
+        # SPEED allows 80% memory / 90% CPU; any other mode allows 40% / 30%
+        self.memory_threshold = 80 if speed_mode else 40
+        self.cpu_threshold = 90 if speed_mode else 30
+
+    async def check_resources(self) -> bool:
+        """Record a fresh sample and report whether CPU and memory are both under their limits."""
+        snapshot = SystemMetrics.capture()
+        self.metrics_history.append(snapshot)
+
+        # Retain only the samples taken during the last hour
+        oldest_allowed = time.time() - 3600
+        self.metrics_history = [
+            sample for sample in self.metrics_history
+            if sample.timestamp > oldest_allowed
+        ]
+
+        cpu_ok = snapshot.cpu_percent < self.cpu_threshold
+        memory_ok = snapshot.memory_percent < self.memory_threshold
+        return cpu_ok and memory_ok
+
+    def get_optimal_batch_size(self, total_urls: int) -> int:
+        """Return a batch size scaled by the remaining CPU/memory headroom.
+
+        The result is at least 1 and never exceeds ``total_urls``.
+        """
+        snapshot = SystemMetrics.capture()
+
+        # SPEED starts from up to 1000 URLs and may scale all the way down;
+        # the resource-saving mode starts from 50 and keeps at least a 10% factor.
+        if self.mode == ExecutionMode.SPEED:
+            base_size, floor = min(1000, total_urls), 0.0
+        else:
+            base_size, floor = 50, 0.1
+
+        cpu_headroom = (self.cpu_threshold - snapshot.cpu_percent) / self.cpu_threshold
+        mem_headroom = (self.memory_threshold - snapshot.memory_percent) / self.memory_threshold
+        # The scarcer of the two resources limits the batch
+        scale = min(max(floor, cpu_headroom), max(floor, mem_headroom))
+
+        return min(total_urls, max(1, int(base_size * scale)))
+
+class ExecutorControl:
+    """Pause / resume / cancel switchboard for a running executor."""
+
+    def __init__(self):
+        self._paused = False
+        self._cancelled = False
+        self._lock = asyncio.Lock()  # guards the two flags and the event
+        self._pause_event = asyncio.Event()
+        self._pause_event.set()  # a set event means "not paused"
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+    async def pause(self):
+        """Mark execution as paused; waiters block until resumed, cancelled or timed out."""
+        async with self._lock:
+            self._pause_event.clear()
+            self._paused = True
+
+    async def resume(self):
+        """Clear the paused flag and release any waiters."""
+        async with self._lock:
+            self._pause_event.set()
+            self._paused = False
+
+    async def cancel(self):
+        """Flag the run as cancelled and wake paused waiters so they can observe it."""
+        async with self._lock:
+            self._pause_event.set()
+            self._cancelled = True
+
+    async def is_paused(self) -> bool:
+        """Return the current paused flag."""
+        async with self._lock:
+            paused = self._paused
+        return paused
+
+    async def is_cancelled(self) -> bool:
+        """Return the current cancelled flag."""
+        async with self._lock:
+            cancelled = self._cancelled
+        return cancelled
+
+    async def wait_if_paused(self, timeout: Optional[float] = None):
+        """Block while paused; after ``timeout`` seconds force a resume and continue."""
+        try:
+            await asyncio.wait_for(self._pause_event.wait(), timeout=timeout)
+            return
+        except asyncio.TimeoutError:
+            # Waited too long: drop the pause so execution can proceed
+            async with self._lock:
+                self._pause_event.set()
+                self._paused = False
+        self.logger.warning(f"ExecutorControl: wait_if_paused() timed out after {timeout} seconds. Proceeding with execution.")
+
+    async def reset(self):
+        """Return to the initial state: neither paused nor cancelled."""
+        async with self._lock:
+            self._cancelled = False
+            self._paused = False
+            self._pause_event.set()
+
+
+class ExecutorStrategy(ABC):
+    """Abstract base class for executor strategies.
+
+    Provides the shared machinery -- batching, per-URL retries, callbacks,
+    metrics and pause/cancel control -- and leaves ``process_batch`` to the
+    concrete strategy.
+
+    Callbacks:
+        - PRE_EXECUTION: Callable[[str, Dict[str, Any]], None]
+        - POST_EXECUTION: Callable[[str, Any, Dict[str, Any]], None]
+        - ON_ERROR: Callable[[str, Exception, Dict[str, Any]], None]
+        - ON_RETRY: Callable[[str, int, Dict[str, Any]], None]
+        - ON_BATCH_START: Callable[[List[str], Dict[str, Any]], None]
+        - ON_BATCH_END: Callable[[List[str], Dict[str, Any]], None]
+        - ON_COMPLETE: Callable[[Dict[str, Any], Dict[str, Any]], None]
+
+    Callbacks may be plain functions or coroutine functions.
+    """
+
+    def __init__(
+        self,
+        crawler: AsyncWebCrawler,
+        mode: ExecutionMode,
+        callbacks: Optional[Dict[CallbackType, Callable[[Any], Union[Awaitable[None], None]]]] = None,
+        persistence_path: Optional[Path] = None,
+        **crawl_config_kwargs
+    ):
+        """Set up shared executor state.
+
+        Args:
+            crawler: Crawler whose ``arun`` is called for every URL.
+            mode: Execution mode; selects the resource thresholds.
+            callbacks: Optional mapping of callback type to callable.
+            persistence_path: Stored on the instance; no persistence backend
+                is created here.
+            **crawl_config_kwargs: Forwarded to ``crawler.arun``. The keys
+                ``max_retries`` and ``backoff_factor`` are also read by the
+                retry loop.
+        """
+        self.crawler = crawler
+        self.mode = mode
+        self.callbacks = callbacks or {}
+        # Previously the path was dropped and ``self.persistence`` never
+        # existed, so ``clear_state`` raised AttributeError. Keep the path and
+        # default the backend to None until one is assigned.
+        self.persistence_path = persistence_path
+        self.persistence = None
+        self.resource_monitor = ResourceMonitor(mode)
+        self.tasks: Dict[str, TaskMetadata] = {}
+        self.active_tasks: Set[str] = set()
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+        self.metrics = ExecutorMetrics()
+        self.control = ExecutorControl()
+        self.crawl_config_kwargs = crawl_config_kwargs  # Store parameters for arun
+
+    async def get_status(self) -> Dict[str, Any]:
+        """Get current executor status and metrics."""
+        return {
+            "status": {
+                "paused": await self.control.is_paused(),
+                "cancelled": await self.control.is_cancelled(),
+                "active_tasks": len(self.active_tasks)
+            },
+            "metrics": self.metrics.to_dict()
+        }
+
+    async def clear_state(self):
+        """Reset tasks, metrics and control flags; clear persistence if one is attached."""
+        self.tasks.clear()
+        self.active_tasks.clear()
+        self.metrics = ExecutorMetrics()
+        await self.control.reset()
+        # Only touch the persistence backend when one has actually been set
+        persistence = getattr(self, "persistence", None)
+        if persistence is not None:
+            await persistence.clear()
+
+    async def _execute_callback(
+        self,
+        callback_type: CallbackType,
+        *args,
+        **kwargs
+    ):
+        """Execute the callback registered for ``callback_type``, if any.
+
+        Exceptions raised by the callback are logged and suppressed so that a
+        faulty hook cannot abort the crawl.
+        """
+        if callback := self.callbacks.get(callback_type):
+            try:
+                if inspect.iscoroutinefunction(callback):
+                    await callback(*args, **kwargs)
+                else:
+                    callback(*args, **kwargs)
+            except Exception as e:
+                self.logger.error(f"Executor {self.__class__.__name__}: Callback {callback_type.value} failed: {e}", exc_info=True)
+
+    async def _process_url(self, url: str) -> CrawlResult:
+        """Process a single URL using the crawler, retrying on failure.
+
+        Up to ``max_retries`` retries (default 3) are made, sleeping
+        ``backoff_factor * attempts`` seconds between attempts.
+
+        Args:
+            url: URL to crawl.
+
+        Returns:
+            Whatever ``crawler.arun`` returns for the URL.
+
+        Raises:
+            asyncio.CancelledError: If the executor has been cancelled.
+            Exception: The error of the last attempt once retries are exhausted.
+        """
+        max_retries = self.crawl_config_kwargs.get('max_retries', 3)
+        backoff_factor = self.crawl_config_kwargs.get('backoff_factor', 1)
+        attempts = 0
+
+        while True:
+            # Invoke PRE_EXECUTION callback
+            await self._execute_callback(CallbackType.PRE_EXECUTION, url, self.metrics.to_dict())
+
+            # Wait if execution is paused
+            await self.control.wait_if_paused(timeout=300)
+
+            # Check if cancelled
+            if await self.control.is_cancelled():
+                raise asyncio.CancelledError("Execution was cancelled")
+
+            start_time = time.time()
+            self.metrics.active_connections += 1
+
+            try:
+                result = await self.crawler.arun(url, **self.crawl_config_kwargs)
+            except Exception as e:
+                attempts += 1
+                retries_exhausted = attempts > max_retries
+                if retries_exhausted:
+                    # Count a URL as failed once, when it is finally given up on
+                    self.metrics.failed_urls += 1
+                self.logger.error(f"Executor {self.__class__.__name__}: Error processing URL {url}: {e}", exc_info=True)
+                # Invoke ON_ERROR callback
+                await self._execute_callback(CallbackType.ON_ERROR, url, e, self.metrics.to_dict())
+
+                if retries_exhausted:
+                    # Propagate the failure. The earlier implementation ran
+                    # `break` inside `finally`, which discarded this exception
+                    # and made the method return None as if it had succeeded.
+                    raise
+
+                self.metrics.total_retries += 1
+                # Invoke ON_RETRY callback
+                await self._execute_callback(CallbackType.ON_RETRY, url, attempts, self.metrics.to_dict())
+            else:
+                self.metrics.completed_urls += 1
+                self.metrics.response_times.append(time.time() - start_time)
+                # Invoke POST_EXECUTION callback
+                await self._execute_callback(CallbackType.POST_EXECUTION, url, result, self.metrics.to_dict())
+                return result
+            finally:
+                self.metrics.active_connections -= 1
+
+            # Only reached after a failed attempt that will be retried
+            await asyncio.sleep(backoff_factor * attempts)
+
+    async def execute(self, urls: List[str]) -> Dict[str, Any]:
+        """Execute crawling tasks for ``urls`` in resource-sized batches.
+
+        Args:
+            urls: URLs to crawl.
+
+        Returns:
+            The merged per-URL results returned by ``process_batch``.
+
+        Raises:
+            asyncio.CancelledError: If the executor is cancelled before or
+                between batches.
+        """
+        # Initialize metrics
+        self.metrics.total_urls = len(urls)
+        self.metrics.start_time = time.time()
+
+        # Create context with metrics (used for callbacks)
+        context = {
+            "mode": self.mode,
+            "start_time": self.metrics.start_time,
+            "total_urls": self.metrics.total_urls
+        }
+
+        # Invoke ON_BATCH_START callback (once, with the full URL list)
+        await self._execute_callback(CallbackType.ON_BATCH_START, urls, context)
+
+        results = {}
+        batch_errors = []
+
+        # Use the crawler within an async context manager
+        async with self.crawler:
+            # Check for cancellation before starting
+            if await self.control.is_cancelled():
+                raise asyncio.CancelledError("Execution was cancelled")
+
+            # Wait if paused
+            await self.control.wait_if_paused(timeout=300)
+
+            # Prepare list of batches; each size is chosen from current resource usage
+            batches = []
+            total_urls_remaining = len(urls)
+            index = 0
+
+            while index < len(urls):
+                batch_size = self.resource_monitor.get_optimal_batch_size(total_urls_remaining)
+                batch_urls = urls[index:index + batch_size]
+                batches.append(batch_urls)
+                index += batch_size
+                total_urls_remaining -= batch_size
+
+            # Process each batch
+            for batch_urls in batches:
+                # Check for cancellation
+                if await self.control.is_cancelled():
+                    raise asyncio.CancelledError("Execution was cancelled")
+
+                # Wait if paused
+                await self.control.wait_if_paused(timeout=300)
+
+                try:
+                    # Process the batch
+                    batch_results = await self.process_batch(batch_urls)
+                    # Update results
+                    results.update(batch_results)
+                    # Capture system metrics after each batch (history is size-bounded)
+                    self.metrics.capture_system_metrics()
+                    # Invoke ON_BATCH_END callback
+                    await self._execute_callback(CallbackType.ON_BATCH_END, batch_urls, context)
+                except Exception as e:
+                    # Handle batch-level exceptions
+                    self.logger.error(f"Error processing batch: {e}")
+                    await self._execute_callback(CallbackType.ON_ERROR, "batch", e, context)
+                    # Collect the error
+                    batch_errors.append((batch_urls, e))
+                    # Continue to next batch instead of raising
+                    continue
+
+        # Execution complete
+        await self._execute_callback(CallbackType.ON_COMPLETE, results, context)
+
+        # Log final metrics and batch errors if any
+        final_status = await self.get_status()
+        self.logger.info(f"Executor {self.__class__.__name__}: Execution completed. Metrics: {final_status}")
+
+        if batch_errors:
+            self.logger.warning(f"Executor {self.__class__.__name__}: Execution completed with errors in {len(batch_errors)} batches.")
+
+        return results
+
+
+    @abstractmethod
+    async def process_batch(self, batch_urls: List[str]) -> Dict[str, Any]:
+        """Process a batch of URLs and return a mapping of URL to outcome."""
+        pass
+
+class SpeedOptimizedExecutor(ExecutorStrategy):
+    """Executor optimized for speed: every URL of a batch is crawled concurrently."""
+
+    def __init__(
+        self,
+        crawler: AsyncWebCrawler,
+        callbacks: Optional[Dict[CallbackType, Callable]] = None,
+        persistence_path: Optional[Path] = None,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = None,
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        user_agent: str = None,
+        verbose=True,
+        connection_pool_size: int = 1000,
+        dns_cache_size: int = 10000,
+        backoff_factor: int = 1,
+        **kwargs
+    ):
+        """Create a speed-mode executor.
+
+        The crawl options (``word_count_threshold`` through ``verbose``) and
+        any extra ``kwargs`` are handed to the base class, which forwards them
+        to ``crawler.arun``. ``connection_pool_size``, ``dns_cache_size`` and
+        ``backoff_factor`` are stored on the instance.
+        """
+        # Fall back to regex chunking when the caller supplies no strategy
+        if chunking_strategy is None:
+            chunking_strategy = RegexChunking()
+
+        super().__init__(
+            crawler=crawler,
+            mode=ExecutionMode.SPEED,
+            callbacks=callbacks,
+            persistence_path=persistence_path,
+            word_count_threshold=word_count_threshold,
+            extraction_strategy=extraction_strategy,
+            chunking_strategy=chunking_strategy,
+            bypass_cache=bypass_cache,
+            css_selector=css_selector,
+            screenshot=screenshot,
+            user_agent=user_agent,
+            verbose=verbose,
+            **kwargs
+        )
+
+        self.connection_pool_size = connection_pool_size
+        self.dns_cache_size = dns_cache_size
+        self.backoff_factor = backoff_factor
+
+        self.logger.info(
+            f"Executor {self.__class__.__name__}: Initialized with:"
+            f" connection_pool_size={self.connection_pool_size},"
+            f" dns_cache_size={self.dns_cache_size}"
+        )
+
+    async def process_batch(self, batch_urls: List[str]) -> Dict[str, Any]:
+        """Process a batch of URLs concurrently.
+
+        Args:
+            batch_urls: URLs to crawl in this batch.
+
+        Returns:
+            Mapping of URL to ``{"success": True, "result": ...}`` or
+            ``{"success": False, "error": "<message>"}``.
+        """
+        batch_tasks = [self._process_url(url) for url in batch_urls]
+
+        # Run the whole batch at once; failures come back as values, not raises
+        batch_results_list = await asyncio.gather(*batch_tasks, return_exceptions=True)
+
+        batch_results = {}
+        for url, result in zip(batch_urls, batch_results_list):
+            # Test against BaseException: asyncio.CancelledError has not been
+            # an Exception subclass since Python 3.8, so a cancelled URL would
+            # otherwise be recorded as a success.
+            if isinstance(result, BaseException):
+                batch_results[url] = {"success": False, "error": str(result)}
+            else:
+                batch_results[url] = {"success": True, "result": result}
+
+        return batch_results
+
+class ResourceOptimizedExecutor(ExecutorStrategy):
+    """Executor that favours low resource usage over throughput."""
+
+    def __init__(
+        self,
+        crawler: AsyncWebCrawler,
+        callbacks: Optional[Dict[CallbackType, Callable]] = None,
+        persistence_path: Optional[Path] = None,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = None,
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        user_agent: str = None,
+        verbose=True,
+        connection_pool_size: int = 50,
+        dns_cache_size: int = 1000,
+        backoff_factor: int = 5,
+        max_concurrent_tasks: int = 5,
+        **kwargs
+    ):
+        """Create a resource-mode executor limited to ``max_concurrent_tasks`` parallel crawls."""
+        # Crawl options handed to the base class; regex chunking is the default strategy
+        crawl_options = dict(
+            word_count_threshold=word_count_threshold,
+            extraction_strategy=extraction_strategy,
+            chunking_strategy=RegexChunking() if chunking_strategy is None else chunking_strategy,
+            bypass_cache=bypass_cache,
+            css_selector=css_selector,
+            screenshot=screenshot,
+            user_agent=user_agent,
+            verbose=verbose,
+        )
+        super().__init__(
+            crawler=crawler,
+            mode=ExecutionMode.RESOURCE,
+            callbacks=callbacks,
+            persistence_path=persistence_path,
+            **crawl_options,
+            **kwargs
+        )
+
+        # Executor-level tuning values
+        self.connection_pool_size = connection_pool_size
+        self.dns_cache_size = dns_cache_size
+        self.backoff_factor = backoff_factor
+        self.max_concurrent_tasks = max_concurrent_tasks
+
+        self.logger.info(
+            f"Executor {self.__class__.__name__}: Initialized with:"
+            f" connection_pool_size={self.connection_pool_size},"
+            f" dns_cache_size={self.dns_cache_size},"
+            f" max_concurrent_tasks={self.max_concurrent_tasks}"
+        )
+
+    async def process_batch(self, batch_urls: List[str]) -> Dict[str, Any]:
+        """Crawl a batch with bounded concurrency once resources allow it.
+
+        Returns a mapping of URL to ``{"success": True, "result": ...}`` or
+        ``{"success": False, "error": "<message>"}``.
+        """
+        outcomes = {}
+        gate = asyncio.Semaphore(self.max_concurrent_tasks)
+
+        # Hold the batch back until CPU and memory are under their limits
+        while True:
+            if await self.resource_monitor.check_resources():
+                break
+            self.logger.warning(f"Executor {self.__class__.__name__}: Resource limits reached, waiting...")
+            await asyncio.sleep(self.backoff_factor)
+            # Stop waiting as soon as a cancellation is requested
+            if await self.control.is_cancelled():
+                raise asyncio.CancelledError("Execution was cancelled")
+
+        async def crawl_one(url):
+            """Crawl one URL under the semaphore and record its outcome."""
+            async with gate:
+                # Honour cancel and pause requests before starting the crawl
+                if await self.control.is_cancelled():
+                    raise asyncio.CancelledError("Execution was cancelled")
+                await self.control.wait_if_paused(timeout=300)
+
+                try:
+                    crawl_result = await self._process_url(url)
+                    outcomes[url] = {"success": True, "result": crawl_result}
+                except Exception as exc:
+                    outcomes[url] = {"success": False, "error": str(exc)}
+                finally:
+                    # Short pause between URLs to keep resource usage smooth
+                    await asyncio.sleep(0.1)
+
+        await asyncio.gather(*(crawl_one(url) for url in batch_urls))
+
+        return outcomes
+    
+
+
+
+
+async def main():
+    """Demo run: crawl a few URLs with an executor, pausing and resuming part-way."""
+    # URLs crawled by the demo
+    urls = [
+        "https://www.example.com",
+        "https://www.python.org",
+        "https://www.asyncio.org",
+        # Add more URLs as needed
+    ]
+
+    # One printing hook per callback type
+    async def before_url(url: str, context: Dict[str, Any]):
+        print(f"Pre-execution callback: About to process URL {url}")
+
+    async def after_url(url: str, result: Any, context: Dict[str, Any]):
+        print(f"Post-execution callback: Successfully processed URL {url}")
+
+    async def url_failed(url: str, error: Exception, context: Dict[str, Any]):
+        print(f"Error callback: Error processing URL {url}: {error}")
+
+    async def url_retried(url: str, attempt: int, context: Dict[str, Any]):
+        print(f"Retry callback: Retrying URL {url}, attempt {attempt}")
+
+    async def batch_started(urls: List[str], context: Dict[str, Any]):
+        print(f"Batch start callback: Starting batch with {len(urls)} URLs")
+
+    async def batch_finished(urls: List[str], context: Dict[str, Any]):
+        print(f"Batch end callback: Completed batch with {len(urls)} URLs")
+
+    async def run_finished(results: Dict[str, Any], context: Dict[str, Any]):
+        print(f"Complete callback: Execution completed with {len(results)} results")
+
+    callbacks = {
+        CallbackType.PRE_EXECUTION: before_url,
+        CallbackType.POST_EXECUTION: after_url,
+        CallbackType.ON_ERROR: url_failed,
+        CallbackType.ON_RETRY: url_retried,
+        CallbackType.ON_BATCH_START: batch_started,
+        CallbackType.ON_BATCH_END: batch_finished,
+        CallbackType.ON_COMPLETE: run_finished,
+    }
+
+    # A single crawler instance is shared by both executors
+    crawler = AsyncWebCrawler()
+
+    speed_executor = SpeedOptimizedExecutor(
+        crawler=crawler,
+        callbacks=callbacks,
+        max_retries=2,  # Example additional config
+    )
+    resource_executor = ResourceOptimizedExecutor(
+        crawler=crawler,
+        callbacks=callbacks,
+        max_concurrent_tasks=3,  # Limit concurrency
+        max_retries=2,           # Example additional config
+    )
+
+    # Pick the executor to demonstrate (swap in resource_executor to compare)
+    executor = speed_executor
+
+    # Run the crawl in the background so it can be controlled from here
+    crawl_task = asyncio.create_task(executor.execute(urls))
+
+    # Let it run briefly, pause for two seconds, then resume
+    await asyncio.sleep(2)
+    print("Pausing execution...")
+    await executor.control.pause()
+    await asyncio.sleep(2)
+    print("Resuming execution...")
+    await executor.control.resume()
+
+    # Collect and show the per-URL results
+    results = await crawl_task
+    print("Execution results:")
+    for url, outcome in results.items():
+        print(f"{url}: {outcome}")
+
+    # Show the final status and metrics
+    final_status = await executor.get_status()
+    print("Final executor status and metrics:")
+    print(final_status)
+
+# Run the demo once when this file is executed as a script.
+# (The guard used to appear twice, which ran main() two times in a row.)
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/async/test_async_executor.py
+++ b/tests/async/test_async_executor.py
@@ -0,0 +1,219 @@
+import os, sys
+import unittest
+import asynctest
+import asyncio
+import time
+
+from typing import Dict, Any, List
+from unittest.mock import AsyncMock, MagicMock, patch
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+# Import the crawler, the shared config constant and the executor classes under test
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.config import MAX_METRICS_HISTORY
+from crawl4ai.async_executor import (
+    SpeedOptimizedExecutor,
+    ResourceOptimizedExecutor,
+    AsyncWebCrawler,
+    ExecutionMode,
+    SystemMetrics,
+    CallbackType
+)
+
+class TestAsyncExecutor(unittest.IsolatedAsyncioTestCase):
+    """Tests for the async executors, run against a mocked AsyncWebCrawler."""
+
+    async def asyncSetUp(self):
+        """Create the mocked crawler, sample URLs and callback mocks per test."""
+        self.mock_crawler = AsyncMock(spec=AsyncWebCrawler)
+        self.mock_crawler.arun = AsyncMock(side_effect=self.mock_crawl)
+
+        # The last URL makes mock_crawl raise, simulating a failed crawl.
+        self.urls = [
+            "https://www.example.com",
+            "https://www.python.org",
+            "https://www.asyncio.org",
+            "https://www.nonexistenturl.xyz",
+        ]
+
+        # One AsyncMock per callback type so tests can assert on awaits/calls.
+        self.callbacks = {
+            CallbackType.PRE_EXECUTION: AsyncMock(),
+            CallbackType.POST_EXECUTION: AsyncMock(),
+            CallbackType.ON_ERROR: AsyncMock(),
+            CallbackType.ON_RETRY: AsyncMock(),
+            CallbackType.ON_BATCH_START: AsyncMock(),
+            CallbackType.ON_BATCH_END: AsyncMock(),
+            CallbackType.ON_COMPLETE: AsyncMock(),
+        }
+
+    async def mock_crawl(self, url: str, **kwargs):
+        """Stand-in for the crawler's arun: fail for the 'nonexistenturl' host."""
+        if "nonexistenturl" in url:
+            raise Exception("Failed to fetch URL")
+        return f"Mock content for {url}"
+
+    async def test_speed_executor_basic(self):
+        """Test basic functionality of SpeedOptimizedExecutor."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=1,
+        )
+
+        results = await executor.execute(self.urls)
+
+        # Expect one result entry per URL; the failing URL must reach ON_ERROR.
+        self.assertEqual(len(results), len(self.urls))
+        self.mock_crawler.arun.assert_awaited()
+        self.callbacks[CallbackType.PRE_EXECUTION].assert_awaited()
+        self.callbacks[CallbackType.POST_EXECUTION].assert_awaited()
+        self.callbacks[CallbackType.ON_ERROR].assert_awaited()
+
+    async def test_resource_executor_basic(self):
+        """Test basic functionality of ResourceOptimizedExecutor."""
+        executor = ResourceOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_concurrent_tasks=2,
+            max_retries=1,
+        )
+
+        results = await executor.execute(self.urls)
+
+        # Expect one result entry per URL; the failing URL must reach ON_ERROR.
+        self.assertEqual(len(results), len(self.urls))
+        self.mock_crawler.arun.assert_awaited()
+        self.callbacks[CallbackType.PRE_EXECUTION].assert_awaited()
+        self.callbacks[CallbackType.POST_EXECUTION].assert_awaited()
+        self.callbacks[CallbackType.ON_ERROR].assert_awaited()
+
+    async def test_pause_and_resume(self):
+        """Test the pause and resume functionality."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=1,
+        )
+
+        execution_task = asyncio.create_task(executor.execute(self.urls))
+        await asyncio.sleep(0.1)
+        await executor.control.pause()
+        self.assertTrue(await executor.control.is_paused())
+
+        # Stay paused for a moment, then resume and check the flag is cleared.
+        await asyncio.sleep(0.5)
+        await executor.control.resume()
+        self.assertFalse(await executor.control.is_paused())
+
+        results = await execution_task
+
+        # The run must still deliver one result entry per URL after resuming.
+        self.assertEqual(len(results), len(self.urls))
+
+    async def test_cancellation(self):
+        """Test the cancellation functionality."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=1,
+        )
+
+        execution_task = asyncio.create_task(executor.execute(self.urls))
+        await asyncio.sleep(0.1)
+        await executor.control.cancel()
+        self.assertTrue(await executor.control.is_cancelled())
+        # Awaiting the cancelled run is expected to surface CancelledError.
+        with self.assertRaises(asyncio.CancelledError):
+            await execution_task
+
+    async def test_max_retries(self):
+        """Test that the executor respects the max_retries setting."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=2,
+        )
+
+        await executor.execute(self.urls)
+
+        # Only the failing URL is retried: one call per URL plus 2 retries.
+        self.assertEqual(self.mock_crawler.arun.call_count, len(self.urls) + 2)
+        self.assertEqual(executor.metrics.total_retries, 2)
+
+    async def test_callbacks_invoked(self):
+        """Test that all callbacks are invoked appropriately."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=1,
+        )
+
+        await executor.execute(self.urls)
+
+        # NOTE(review): these expected counts multiply by total_retries, while
+        # test_max_retries counts retries additively -- confirm the formulas.
+        self.assertEqual(
+            self.callbacks[CallbackType.PRE_EXECUTION].call_count,
+            len(self.urls) * (1 + executor.metrics.total_retries),
+        )
+        self.assertEqual(
+            self.callbacks[CallbackType.POST_EXECUTION].call_count,
+            executor.metrics.completed_urls,
+        )
+        self.assertEqual(
+            self.callbacks[CallbackType.ON_ERROR].call_count,
+            executor.metrics.failed_urls * (1 + executor.metrics.total_retries),
+        )
+        self.callbacks[CallbackType.ON_COMPLETE].assert_awaited_once()
+
+    async def test_resource_limits(self):
+        """Test that the ResourceOptimizedExecutor respects resource limits."""
+        with patch('psutil.cpu_percent', return_value=95), \
+             patch('psutil.virtual_memory', return_value=MagicMock(percent=85, available=1000)):
+            executor = ResourceOptimizedExecutor(
+                crawler=self.mock_crawler,
+                callbacks=self.callbacks,
+                max_concurrent_tasks=2,
+                max_retries=1,
+            )
+
+            results = await executor.execute(self.urls)
+
+            # Expect one result entry per URL even under simulated pressure.
+            self.assertEqual(len(results), len(self.urls))
+            # Since resources are over threshold, batch size should be minimized
+            optimal_batch = executor.resource_monitor.get_optimal_batch_size(len(self.urls))
+            self.assertEqual(optimal_batch, 1)
+
+    async def test_system_metrics_limit(self):
+        """Test that the system_metrics list does not grow indefinitely."""
+        executor = SpeedOptimizedExecutor(
+            crawler=self.mock_crawler,
+            callbacks=self.callbacks,
+            max_retries=1,
+        )
+
+        # A small cap is exceeded quickly; keeping it local avoids mutating
+        # module-level state that other tests could observe.
+        history_cap = 5
+
+        def record_metric():
+            # Stand-in for capture_system_metrics: append, then drop the oldest.
+            history = executor.metrics.system_metrics
+            history.append(SystemMetrics(0, 0, 0, time.time()))
+            if len(history) > history_cap:
+                history.pop(0)
+
+        # NOTE(review): trimming is done by this stand-in, not the real method.
+        with patch.object(executor.metrics, 'capture_system_metrics') as mock_capture:
+            mock_capture.side_effect = record_metric
+            await executor.execute(self.urls * 3)  # Multiply URLs to create more batches
+
+        self.assertLessEqual(len(executor.metrics.system_metrics), history_cap)
+
+if __name__ == "__main__":  # allow running this test module directly as a script
+    unittest.main()