refactor(dispatcher): migrate to modular dispatcher system with enhanced monitoring
Reorganize dispatcher functionality into separate components: - Create dedicated dispatcher classes (MemoryAdaptive, Semaphore) - Add RateLimiter for smart request throttling - Implement CrawlerMonitor for real-time progress tracking - Move dispatcher config from CrawlerRunConfig to separate classes BREAKING CHANGE: Dispatcher configuration moved from CrawlerRunConfig to dedicated dispatcher classes. Users need to update their configuration approach for multi-URL crawling.
This commit is contained in:
@@ -6,7 +6,8 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, Cosi
|
|||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
|
from .async_dispatcher import MemoryAdaptiveDispatcher, SemaphoreDispatcher, RateLimiter, CrawlerMonitor, DisplayMode
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -24,6 +25,12 @@ __all__ = [
|
|||||||
'DefaultMarkdownGenerator',
|
'DefaultMarkdownGenerator',
|
||||||
'PruningContentFilter',
|
'PruningContentFilter',
|
||||||
'BM25ContentFilter',
|
'BM25ContentFilter',
|
||||||
|
'MemoryAdaptiveDispatcher',
|
||||||
|
'SemaphoreDispatcher',
|
||||||
|
'RateLimiter',
|
||||||
|
'CrawlerMonitor',
|
||||||
|
'DisplayMode',
|
||||||
|
'MarkdownGenerationResult',
|
||||||
]
|
]
|
||||||
|
|
||||||
def is_sync_version_installed():
|
def is_sync_version_installed():
|
||||||
|
|||||||
@@ -11,8 +11,7 @@ from .user_agent_generator import UserAgentGenerator
|
|||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy
|
from .chunking_strategy import ChunkingStrategy
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
from typing import Union, List, Tuple, Optional
|
from typing import Union, List
|
||||||
from dataclasses import dataclass, field
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -184,14 +183,6 @@ class BrowserConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RateLimitConfig:
|
|
||||||
base_delay: Tuple[float, float] = (1.0, 3.0)
|
|
||||||
max_delay: float = 60.0
|
|
||||||
max_retries: int = 3
|
|
||||||
rate_limit_codes: List[int] = field(default_factory=lambda: [429, 503])
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlerRunConfig:
|
class CrawlerRunConfig:
|
||||||
"""
|
"""
|
||||||
Configuration class for controlling how the crawler runs each crawl operation.
|
Configuration class for controlling how the crawler runs each crawl operation.
|
||||||
@@ -320,14 +311,8 @@ class CrawlerRunConfig:
|
|||||||
log_console (bool): If True, log console messages from the page.
|
log_console (bool): If True, log console messages from the page.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
|
||||||
# Dispatcher configuration
|
# Optional Parameters
|
||||||
memory_threshold_percent: float = 70.0
|
url: str = None # This is not a compulsory parameter
|
||||||
check_interval: float = 1.0
|
|
||||||
max_session_permit: int = 20
|
|
||||||
enable_rate_limiting: bool = False
|
|
||||||
rate_limit_config: Optional[RateLimitConfig] = None
|
|
||||||
display_mode: Optional[str] = None
|
|
||||||
url: str = None
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -400,13 +385,6 @@ class CrawlerRunConfig:
|
|||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
memory_threshold_percent: float = 70.0,
|
|
||||||
check_interval: float = 1.0,
|
|
||||||
max_session_permit: int = 20,
|
|
||||||
enable_rate_limiting: bool = False,
|
|
||||||
rate_limit_config: Optional[RateLimitConfig] = None,
|
|
||||||
display_mode: Optional[str] = None,
|
|
||||||
url: str = None,
|
url: str = None,
|
||||||
):
|
):
|
||||||
self.url = url
|
self.url = url
|
||||||
@@ -479,14 +457,6 @@ class CrawlerRunConfig:
|
|||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.log_console = log_console
|
self.log_console = log_console
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
self.memory_threshold_percent = memory_threshold_percent
|
|
||||||
self.check_interval = check_interval
|
|
||||||
self.max_session_permit = max_session_permit
|
|
||||||
self.enable_rate_limiting = enable_rate_limiting
|
|
||||||
self.rate_limit_config = rate_limit_config
|
|
||||||
self.display_mode = display_mode
|
|
||||||
|
|
||||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||||
if self.extraction_strategy is not None and not isinstance(
|
if self.extraction_strategy is not None and not isinstance(
|
||||||
self.extraction_strategy, ExtractionStrategy
|
self.extraction_strategy, ExtractionStrategy
|
||||||
@@ -573,13 +543,6 @@ class CrawlerRunConfig:
|
|||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
memory_threshold_percent=kwargs.get("memory_threshold_percent", 70.0),
|
|
||||||
check_interval=kwargs.get("check_interval", 1.0),
|
|
||||||
max_session_permit=kwargs.get("max_session_permit", 20),
|
|
||||||
enable_rate_limiting=kwargs.get("enable_rate_limiting", False),
|
|
||||||
rate_limit_config=kwargs.get("rate_limit_config"),
|
|
||||||
display_mode=kwargs.get("display_mode"),
|
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -638,11 +601,5 @@ class CrawlerRunConfig:
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"memory_threshold_percent": self.memory_threshold_percent,
|
|
||||||
"check_interval": self.check_interval,
|
|
||||||
"max_session_permit": self.max_session_permit,
|
|
||||||
"enable_rate_limiting": self.enable_rate_limiting,
|
|
||||||
"rate_limit_config": self.rate_limit_config,
|
|
||||||
"display_mode": self.display_mode,
|
|
||||||
"url": self.url,
|
"url": self.url,
|
||||||
}
|
}
|
||||||
|
|||||||
560
crawl4ai/async_dispatcher.py
Normal file
560
crawl4ai/async_dispatcher.py
Normal file
@@ -0,0 +1,560 @@
|
|||||||
|
from typing import Dict, Optional, List
|
||||||
|
from .async_configs import *
|
||||||
|
from .models import *
|
||||||
|
|
||||||
|
from rich.live import Live
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.style import Style
|
||||||
|
from rich import box
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import asyncio
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import random
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_delay: Tuple[float, float] = (1.0, 3.0),
|
||||||
|
max_delay: float = 60.0,
|
||||||
|
max_retries: int = 3,
|
||||||
|
rate_limit_codes: List[int] = None
|
||||||
|
):
|
||||||
|
self.base_delay = base_delay
|
||||||
|
self.max_delay = max_delay
|
||||||
|
self.max_retries = max_retries
|
||||||
|
self.rate_limit_codes = rate_limit_codes or [429, 503]
|
||||||
|
self.domains: Dict[str, DomainState] = {}
|
||||||
|
|
||||||
|
def get_domain(self, url: str) -> str:
|
||||||
|
return urlparse(url).netloc
|
||||||
|
|
||||||
|
async def wait_if_needed(self, url: str) -> None:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains.get(domain)
|
||||||
|
|
||||||
|
if not state:
|
||||||
|
self.domains[domain] = DomainState()
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
|
if state.last_request_time:
|
||||||
|
wait_time = max(0, state.current_delay - (now - state.last_request_time))
|
||||||
|
if wait_time > 0:
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
|
||||||
|
# Random delay within base range if no current delay
|
||||||
|
if state.current_delay == 0:
|
||||||
|
state.current_delay = random.uniform(*self.base_delay)
|
||||||
|
|
||||||
|
state.last_request_time = time.time()
|
||||||
|
|
||||||
|
def update_delay(self, url: str, status_code: int) -> bool:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
if status_code in self.rate_limit_codes:
|
||||||
|
state.fail_count += 1
|
||||||
|
if state.fail_count > self.max_retries:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Exponential backoff with random jitter
|
||||||
|
state.current_delay = min(
|
||||||
|
state.current_delay * 2 * random.uniform(0.75, 1.25),
|
||||||
|
self.max_delay
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Gradually reduce delay on success
|
||||||
|
state.current_delay = max(
|
||||||
|
random.uniform(*self.base_delay),
|
||||||
|
state.current_delay * 0.75
|
||||||
|
)
|
||||||
|
state.fail_count = 0
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
class CrawlerMonitor:
|
||||||
|
def __init__(self, max_visible_rows: int = 15, display_mode: DisplayMode = DisplayMode.DETAILED):
|
||||||
|
self.console = Console()
|
||||||
|
self.max_visible_rows = max_visible_rows
|
||||||
|
self.display_mode = display_mode
|
||||||
|
self.stats: Dict[str, CrawlStats] = {}
|
||||||
|
self.process = psutil.Process()
|
||||||
|
self.start_time = datetime.now()
|
||||||
|
self.live = Live(self._create_table(), refresh_per_second=2)
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
self.live.start()
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
self.live.stop()
|
||||||
|
|
||||||
|
def add_task(self, task_id: str, url: str):
|
||||||
|
self.stats[task_id] = CrawlStats(task_id=task_id, url=url, status=CrawlStatus.QUEUED)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def update_task(self, task_id: str, **kwargs):
|
||||||
|
if task_id in self.stats:
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
setattr(self.stats[task_id], key, value)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def _create_aggregated_table(self) -> Table:
|
||||||
|
"""Creates a compact table showing only aggregated statistics"""
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Status Overview",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue",
|
||||||
|
show_lines=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate statistics
|
||||||
|
total_tasks = len(self.stats)
|
||||||
|
queued = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED)
|
||||||
|
in_progress = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
# Memory statistics
|
||||||
|
current_memory = self.process.memory_info().rss / (1024 * 1024)
|
||||||
|
total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
peak_memory = max((stat.peak_memory for stat in self.stats.values()), default=0.0)
|
||||||
|
|
||||||
|
# Duration
|
||||||
|
duration = datetime.now() - self.start_time
|
||||||
|
|
||||||
|
# Create status row
|
||||||
|
table.add_column("Status", style="bold cyan")
|
||||||
|
table.add_column("Count", justify="right")
|
||||||
|
table.add_column("Percentage", justify="right")
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"Total Tasks",
|
||||||
|
str(total_tasks),
|
||||||
|
"100%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]In Queue[/yellow]",
|
||||||
|
str(queued),
|
||||||
|
f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[blue]In Progress[/blue]",
|
||||||
|
str(in_progress),
|
||||||
|
f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[green]Completed[/green]",
|
||||||
|
str(completed),
|
||||||
|
f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[red]Failed[/red]",
|
||||||
|
str(failed),
|
||||||
|
f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add memory information
|
||||||
|
table.add_section()
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Current Memory[/magenta]",
|
||||||
|
f"{current_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Total Task Memory[/magenta]",
|
||||||
|
f"{total_task_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Peak Task Memory[/magenta]",
|
||||||
|
f"{peak_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]Runtime[/yellow]",
|
||||||
|
str(timedelta(seconds=int(duration.total_seconds()))),
|
||||||
|
""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_detailed_table(self) -> Table:
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Performance Monitor",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add columns
|
||||||
|
table.add_column("Task ID", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("URL", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("Status", style="bold")
|
||||||
|
table.add_column("Memory (MB)", justify="right")
|
||||||
|
table.add_column("Peak (MB)", justify="right")
|
||||||
|
table.add_column("Duration", justify="right")
|
||||||
|
table.add_column("Info", style="italic")
|
||||||
|
|
||||||
|
# Add summary row
|
||||||
|
total_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
active_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"[bold yellow]SUMMARY",
|
||||||
|
f"Total: {len(self.stats)}",
|
||||||
|
f"Active: {active_count}",
|
||||||
|
f"{total_memory:.1f}",
|
||||||
|
f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
|
||||||
|
str(timedelta(seconds=int((datetime.now() - self.start_time).total_seconds()))),
|
||||||
|
f"✓{completed_count} ✗{failed_count}",
|
||||||
|
style="bold"
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add_section()
|
||||||
|
|
||||||
|
# Add rows for each task
|
||||||
|
visible_stats = sorted(
|
||||||
|
self.stats.values(),
|
||||||
|
key=lambda x: (
|
||||||
|
x.status != CrawlStatus.IN_PROGRESS,
|
||||||
|
x.status != CrawlStatus.QUEUED,
|
||||||
|
x.end_time or datetime.max
|
||||||
|
)
|
||||||
|
)[:self.max_visible_rows]
|
||||||
|
|
||||||
|
for stat in visible_stats:
|
||||||
|
status_style = {
|
||||||
|
CrawlStatus.QUEUED: "white",
|
||||||
|
CrawlStatus.IN_PROGRESS: "yellow",
|
||||||
|
CrawlStatus.COMPLETED: "green",
|
||||||
|
CrawlStatus.FAILED: "red"
|
||||||
|
}[stat.status]
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
stat.task_id[:8], # Show first 8 chars of task ID
|
||||||
|
stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
|
||||||
|
f"[{status_style}]{stat.status.value}[/{status_style}]",
|
||||||
|
f"{stat.memory_usage:.1f}",
|
||||||
|
f"{stat.peak_memory:.1f}",
|
||||||
|
stat.duration,
|
||||||
|
stat.error_message[:40] if stat.error_message else ""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_table(self) -> Table:
|
||||||
|
"""Creates the appropriate table based on display mode"""
|
||||||
|
if self.display_mode == DisplayMode.AGGREGATED:
|
||||||
|
return self._create_aggregated_table()
|
||||||
|
return self._create_detailed_table()
|
||||||
|
|
||||||
|
|
||||||
|
class BaseDispatcher(ABC):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
self.crawler = None
|
||||||
|
self._domain_last_hit: Dict[str, float] = {}
|
||||||
|
self.concurrent_sessions = 0
|
||||||
|
self.rate_limiter = rate_limiter
|
||||||
|
self.monitor = monitor
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
memory_threshold_percent: float = 70.0,
|
||||||
|
check_interval: float = 1.0,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
memory_wait_timeout: float = 300.0, # 5 minutes default timeout
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
super().__init__(rate_limiter, monitor)
|
||||||
|
self.memory_threshold_percent = memory_threshold_percent
|
||||||
|
self.check_interval = check_interval
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
self.memory_wait_timeout = memory_wait_timeout
|
||||||
|
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
start_time = datetime.now()
|
||||||
|
error_message = ""
|
||||||
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
|
||||||
|
self.concurrent_sessions += 1
|
||||||
|
|
||||||
|
if self.rate_limiter:
|
||||||
|
await self.rate_limiter.wait_if_needed(url)
|
||||||
|
|
||||||
|
process = psutil.Process()
|
||||||
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||||
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
|
|
||||||
|
if self.rate_limiter and result.status_code:
|
||||||
|
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||||
|
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=datetime.now(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
error_message = result.error_message
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
elif self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
|
||||||
|
|
||||||
|
finally:
|
||||||
|
end_time = datetime.now()
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
end_time=end_time,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
self.concurrent_sessions -= 1
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
self.crawler = crawler
|
||||||
|
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
pending_tasks = []
|
||||||
|
active_tasks = []
|
||||||
|
task_queue = []
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.add_task(task_id, url)
|
||||||
|
task_queue.append((url, task_id))
|
||||||
|
|
||||||
|
while task_queue or active_tasks:
|
||||||
|
wait_start_time = time.time()
|
||||||
|
while len(active_tasks) < self.max_session_permit and task_queue:
|
||||||
|
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
|
||||||
|
# Check if we've exceeded the timeout
|
||||||
|
if time.time() - wait_start_time > self.memory_wait_timeout:
|
||||||
|
raise MemoryError(f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds")
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
url, task_id = task_queue.pop(0)
|
||||||
|
task = asyncio.create_task(self.crawl_url(url, config, task_id))
|
||||||
|
active_tasks.append(task)
|
||||||
|
|
||||||
|
if not active_tasks:
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
done, pending = await asyncio.wait(
|
||||||
|
active_tasks,
|
||||||
|
return_when=asyncio.FIRST_COMPLETED
|
||||||
|
)
|
||||||
|
|
||||||
|
pending_tasks.extend(done)
|
||||||
|
active_tasks = list(pending)
|
||||||
|
|
||||||
|
return await asyncio.gather(*pending_tasks)
|
||||||
|
finally:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.stop()
|
||||||
|
|
||||||
|
class SemaphoreDispatcher(BaseDispatcher):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
semaphore_count: int = 5,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
super().__init__(rate_limiter, monitor)
|
||||||
|
self.semaphore_count = semaphore_count
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
semaphore: asyncio.Semaphore = None
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
start_time = datetime.now()
|
||||||
|
error_message = ""
|
||||||
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
|
||||||
|
|
||||||
|
if self.rate_limiter:
|
||||||
|
await self.rate_limiter.wait_if_needed(url)
|
||||||
|
|
||||||
|
async with semaphore:
|
||||||
|
process = psutil.Process()
|
||||||
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||||
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
|
|
||||||
|
if self.rate_limiter and result.status_code:
|
||||||
|
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||||
|
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=datetime.now(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
error_message = result.error_message
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
elif self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
|
||||||
|
|
||||||
|
finally:
|
||||||
|
end_time = datetime.now()
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
end_time=end_time,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
urls: List[str],
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
self.crawler = crawler
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
semaphore = asyncio.Semaphore(self.semaphore_count)
|
||||||
|
tasks = []
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.add_task(task_id, url)
|
||||||
|
task = asyncio.create_task(
|
||||||
|
self.crawl_url(url, config, task_id, semaphore)
|
||||||
|
)
|
||||||
|
tasks.append(task)
|
||||||
|
|
||||||
|
return await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
|
finally:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.stop()
|
||||||
@@ -9,7 +9,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
# from contextlib import nullcontext, asynccontextmanager
|
# from contextlib import nullcontext, asynccontextmanager
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import *
|
from .chunking_strategy import *
|
||||||
from .content_filter_strategy import *
|
from .content_filter_strategy import *
|
||||||
@@ -20,6 +20,8 @@ from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGene
|
|||||||
from .content_scraping_strategy import WebScrapingStrategy
|
from .content_scraping_strategy import WebScrapingStrategy
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from .async_dispatcher import *
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
MIN_WORD_THRESHOLD,
|
MIN_WORD_THRESHOLD,
|
||||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||||
@@ -675,6 +677,7 @@ class AsyncWebCrawler:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
@@ -690,7 +693,7 @@ class AsyncWebCrawler:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently using MemoryAdaptiveDispatcher.
|
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||||
|
|
||||||
Migration Guide:
|
Migration Guide:
|
||||||
Old way (deprecated):
|
Old way (deprecated):
|
||||||
@@ -705,84 +708,83 @@ class AsyncWebCrawler:
|
|||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold=200,
|
word_count_threshold=200,
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
enable_rate_limiting=True,
|
dispatcher_config=DispatcherConfig(
|
||||||
rate_limit_config=RateLimitConfig(...),
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(...),
|
||||||
|
),
|
||||||
...
|
...
|
||||||
)
|
)
|
||||||
results = await crawler.arun_many(urls, config=config)
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=config,
|
||||||
|
dispatcher_strategy=MemoryAdaptiveDispatcher # Optional, this is the default
|
||||||
|
)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: List of URLs to crawl
|
urls: List of URLs to crawl
|
||||||
config: Configuration object controlling crawl behavior for all URLs
|
config: Configuration object controlling crawl behavior for all URLs
|
||||||
|
dispatcher_strategy: The dispatcher strategy class to use. Defaults to MemoryAdaptiveDispatcher.
|
||||||
[other parameters maintained for backwards compatibility]
|
[other parameters maintained for backwards compatibility]
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[CrawlResult]: Results for each URL
|
List[CrawlResult]: Results for each URL
|
||||||
"""
|
"""
|
||||||
# Handle configuration
|
# Create config if not provided
|
||||||
if config is not None:
|
if config is None:
|
||||||
if any(param is not None for param in [
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold, extraction_strategy, chunking_strategy,
|
word_count_threshold=word_count_threshold,
|
||||||
content_filter, cache_mode, css_selector, screenshot, pdf
|
extraction_strategy=extraction_strategy,
|
||||||
]):
|
chunking_strategy=chunking_strategy,
|
||||||
self.logger.warning(
|
content_filter=content_filter,
|
||||||
message="Both config and legacy parameters provided. config will take precedence.",
|
cache_mode=cache_mode,
|
||||||
tag="WARNING"
|
bypass_cache=bypass_cache,
|
||||||
)
|
css_selector=css_selector,
|
||||||
else:
|
screenshot=screenshot,
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
pdf=pdf,
|
||||||
config_kwargs = {
|
verbose=verbose,
|
||||||
"word_count_threshold": word_count_threshold,
|
|
||||||
"extraction_strategy": extraction_strategy,
|
|
||||||
"chunking_strategy": chunking_strategy,
|
|
||||||
"content_filter": content_filter,
|
|
||||||
"cache_mode": cache_mode,
|
|
||||||
"bypass_cache": bypass_cache,
|
|
||||||
"css_selector": css_selector,
|
|
||||||
"screenshot": screenshot,
|
|
||||||
"pdf": pdf,
|
|
||||||
"verbose": verbose,
|
|
||||||
**kwargs
|
**kwargs
|
||||||
}
|
|
||||||
config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
|
||||||
|
|
||||||
if bypass_cache:
|
|
||||||
if kwargs.get("warning", True):
|
|
||||||
warnings.warn(
|
|
||||||
"'bypass_cache' is deprecated and will be removed in version 0.5.0. "
|
|
||||||
"Use 'cache_mode=CacheMode.BYPASS' instead. "
|
|
||||||
"Pass warning=False to suppress this warning.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2
|
|
||||||
)
|
|
||||||
if config.cache_mode is None:
|
|
||||||
config.cache_mode = CacheMode.BYPASS
|
|
||||||
|
|
||||||
from .dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
|
|
||||||
|
|
||||||
# Create dispatcher with configuration from CrawlerRunConfig
|
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
|
||||||
crawler=self,
|
|
||||||
memory_threshold_percent=config.memory_threshold_percent,
|
|
||||||
check_interval=config.check_interval,
|
|
||||||
max_session_permit=config.max_session_permit,
|
|
||||||
enable_rate_limiting=config.enable_rate_limiting,
|
|
||||||
rate_limit_config=vars(config.rate_limit_config) if config.rate_limit_config else None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create monitor if display mode is specified
|
|
||||||
monitor = None
|
|
||||||
if config.display_mode:
|
|
||||||
monitor = CrawlerMonitor(
|
|
||||||
max_visible_rows=15,
|
|
||||||
display_mode=DisplayMode(config.display_mode)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run URLs through dispatcher
|
# # Initialize the dispatcher with the selected strategy
|
||||||
task_results = await dispatcher.run_urls(urls, config, monitor=monitor)
|
# dispatcher = dispatcher_strategy(self, config.dispatcher_config)
|
||||||
|
|
||||||
# Convert CrawlerTaskResult to CrawlResult
|
# memory_monitor: CrawlerMonitor = None
|
||||||
return [task_result.result for task_result in task_results]
|
# if config.dispatcher_config.enable_monitor:
|
||||||
|
# memory_monitor = CrawlerMonitor(max_visible_rows=config.dispatcher_config.max_display_rows, display_mode=config.dispatcher_config.display_mode)
|
||||||
|
|
||||||
|
# Create default dispatcher if none provided
|
||||||
|
if dispatcher is None:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
self,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run the URLs through the dispatcher
|
||||||
|
_results: List[CrawlerTaskResult] = await dispatcher.run_urls(
|
||||||
|
crawler=self,
|
||||||
|
urls=urls,
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
|
||||||
|
results: CrawlResult = []
|
||||||
|
for res in _results:
|
||||||
|
_res : CrawlResult = res.result
|
||||||
|
dispatch_result: DispatchResult = DispatchResult(
|
||||||
|
task_id=res.task_id,
|
||||||
|
memory_usage=res.memory_usage,
|
||||||
|
peak_memory=res.peak_memory,
|
||||||
|
start_time=res.start_time,
|
||||||
|
end_time=res.end_time,
|
||||||
|
error_message=res.error_message
|
||||||
|
)
|
||||||
|
_res.dispatch_result = dispatch_result
|
||||||
|
results.append(_res)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
async def aclear_cache(self):
|
async def aclear_cache(self):
|
||||||
"""Clear the cache database."""
|
"""Clear the cache database."""
|
||||||
|
|||||||
@@ -1,8 +1,70 @@
|
|||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
|
||||||
from dataclasses import dataclass
|
from enum import Enum
|
||||||
|
from dataclasses import dataclass, field
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Dispatcher Models
|
||||||
|
###############################
|
||||||
|
@dataclass
|
||||||
|
class DomainState:
|
||||||
|
last_request_time: float = 0
|
||||||
|
current_delay: float = 0
|
||||||
|
fail_count: int = 0
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlerTaskResult:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
result: "CrawlResult"
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
class CrawlStatus(Enum):
|
||||||
|
QUEUED = "QUEUED"
|
||||||
|
IN_PROGRESS = "IN_PROGRESS"
|
||||||
|
COMPLETED = "COMPLETED"
|
||||||
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlStats:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
status: CrawlStatus
|
||||||
|
start_time: Optional[datetime] = None
|
||||||
|
end_time: Optional[datetime] = None
|
||||||
|
memory_usage: float = 0.0
|
||||||
|
peak_memory: float = 0.0
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def duration(self) -> str:
|
||||||
|
if not self.start_time:
|
||||||
|
return "0:00"
|
||||||
|
end = self.end_time or datetime.now()
|
||||||
|
duration = end - self.start_time
|
||||||
|
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
class DisplayMode(Enum):
|
||||||
|
DETAILED = "DETAILED"
|
||||||
|
AGGREGATED = "AGGREGATED"
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Crawler Models
|
||||||
|
###############################
|
||||||
@dataclass
|
@dataclass
|
||||||
class TokenUsage:
|
class TokenUsage:
|
||||||
completion_tokens: int = 0
|
completion_tokens: int = 0
|
||||||
@@ -23,6 +85,13 @@ class MarkdownGenerationResult(BaseModel):
|
|||||||
fit_markdown: Optional[str] = None
|
fit_markdown: Optional[str] = None
|
||||||
fit_html: Optional[str] = None
|
fit_html: Optional[str] = None
|
||||||
|
|
||||||
|
class DispatchResult(BaseModel):
|
||||||
|
task_id: str
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
@@ -44,6 +113,7 @@ class CrawlResult(BaseModel):
|
|||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|||||||
@@ -1,67 +1,121 @@
|
|||||||
import asyncio, time
|
import asyncio
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
import time
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
from rich import print
|
||||||
from crawl4ai.dispatcher import DisplayMode
|
from rich.table import Table
|
||||||
|
from crawl4ai import (
|
||||||
async def crawl_with_rate_limiting(urls):
|
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||||
"""
|
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
|
||||||
Example function demonstrating how to use AsyncWebCrawler with rate limiting and resource monitoring.
|
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
|
||||||
|
)
|
||||||
Args:
|
|
||||||
urls (List[str]): List of URLs to crawl
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List[CrawlResult]: List of crawl results for each URL
|
|
||||||
"""
|
|
||||||
# Configure browser settings
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True, # Run browser in headless mode
|
|
||||||
verbose=False # Minimize browser logging
|
|
||||||
)
|
|
||||||
|
|
||||||
# Configure crawler settings with rate limiting
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
# Enable rate limiting
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds between requests
|
|
||||||
max_delay=30.0, # Maximum delay after rate limit hits
|
|
||||||
max_retries=2, # Number of retries before giving up
|
|
||||||
rate_limit_codes=[429, 503] # HTTP status codes to trigger rate limiting
|
|
||||||
),
|
|
||||||
# Resource monitoring settings
|
|
||||||
memory_threshold_percent=70.0, # Pause crawling if memory usage exceeds this
|
|
||||||
check_interval=0.5, # How often to check resource usage
|
|
||||||
max_session_permit=10, # Maximum concurrent crawls
|
|
||||||
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create and use crawler with context manager
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
results = await crawler.arun_many(urls, config=run_config)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def main():
|
|
||||||
# Example URLs (replace with real URLs)
|
|
||||||
urls = [
|
|
||||||
f"https://example.com/page{i}" for i in range(1, 40)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
async def memory_adaptive(urls, browser_config, run_config):
|
||||||
|
"""Memory adaptive crawler with monitoring"""
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10,
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Run the crawler
|
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
|
||||||
results = asyncio.run(crawl_with_rate_limiting(urls))
|
"""Memory adaptive crawler with rate limiting"""
|
||||||
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Process results
|
async def semaphore(urls, browser_config, run_config):
|
||||||
successful_results = [result for result in results if result.success]
|
"""Basic semaphore crawler"""
|
||||||
failed_results = [result for result in results if not result.success]
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
end = time.perf_counter()
|
async def semaphore_with_rate_limit(urls, browser_config, run_config):
|
||||||
|
"""Semaphore crawler with rate limiting"""
|
||||||
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Print results
|
def create_performance_table(results):
|
||||||
print(f"Successful crawls: {len(successful_results)}")
|
"""Creates a rich table showing performance results"""
|
||||||
print(f"Failed crawls: {len(failed_results)}")
|
table = Table(title="Crawler Strategy Performance Comparison")
|
||||||
print(f"Time taken: {end - start:.2f} seconds")
|
table.add_column("Strategy", style="cyan")
|
||||||
|
table.add_column("URLs Crawled", justify="right", style="green")
|
||||||
|
table.add_column("Time (seconds)", justify="right", style="yellow")
|
||||||
|
table.add_column("URLs/second", justify="right", style="magenta")
|
||||||
|
|
||||||
|
sorted_results = sorted(results.items(), key=lambda x: x[1][1])
|
||||||
|
|
||||||
|
for strategy, (urls_crawled, duration) in sorted_results:
|
||||||
|
urls_per_second = urls_crawled / duration
|
||||||
|
table.add_row(
|
||||||
|
strategy,
|
||||||
|
str(urls_crawled),
|
||||||
|
f"{duration:.2f}",
|
||||||
|
f"{urls_per_second:.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
results = {
|
||||||
|
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
|
||||||
|
"Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
|
||||||
|
"Semaphore": await semaphore(urls, browser_config, run_config),
|
||||||
|
"Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
|
||||||
|
}
|
||||||
|
|
||||||
|
table = create_performance_table(results)
|
||||||
|
print("\nPerformance Summary:")
|
||||||
|
print(table)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
asyncio.run(main())
|
||||||
264
docs/md_v2/advanced/multi-url-crawling copy.md
Normal file
264
docs/md_v2/advanced/multi-url-crawling copy.md
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
# Optimized Multi-URL Crawling
|
||||||
|
|
||||||
|
> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated.
|
||||||
|
|
||||||
|
|
||||||
|
Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to:
|
||||||
|
|
||||||
|
1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
|
||||||
|
2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
|
||||||
|
|
||||||
|
When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Why Avoid Simple Loops per URL?
|
||||||
|
|
||||||
|
If you naively do:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for url in urls:
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url)
|
||||||
|
```
|
||||||
|
|
||||||
|
You end up:
|
||||||
|
|
||||||
|
1. Spinning up a **new** browser for each URL
|
||||||
|
2. Closing it immediately after the single crawl
|
||||||
|
3. Potentially using a lot of CPU/memory for short-living browsers
|
||||||
|
4. Missing out on session reusability if you have login or ongoing states
|
||||||
|
|
||||||
|
**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Sequential Crawling with Session Reuse
|
||||||
|
|
||||||
|
### 2.1 Overview
|
||||||
|
|
||||||
|
1. **One** `AsyncWebCrawler` instance for **all** URLs.
|
||||||
|
2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
|
||||||
|
3. The crawler is only closed at the **end**.
|
||||||
|
|
||||||
|
**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
|
||||||
|
|
||||||
|
### 2.2 Example Code
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
async def crawl_sequential(urls: List[str]):
|
||||||
|
print("\n=== Sequential Crawling with Session Reuse ===")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
# For better performance in Docker or low-memory environments:
|
||||||
|
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
||||||
|
)
|
||||||
|
|
||||||
|
crawl_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=DefaultMarkdownGenerator()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create the crawler (opens the browser)
|
||||||
|
crawler = AsyncWebCrawler(config=browser_config)
|
||||||
|
await crawler.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
session_id = "session1" # Reuse the same session across all URLs
|
||||||
|
for url in urls:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
config=crawl_config,
|
||||||
|
session_id=session_id
|
||||||
|
)
|
||||||
|
if result.success:
|
||||||
|
print(f"Successfully crawled: {url}")
|
||||||
|
# E.g. check markdown length
|
||||||
|
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||||
|
else:
|
||||||
|
print(f"Failed: {url} - Error: {result.error_message}")
|
||||||
|
finally:
|
||||||
|
# After all URLs are done, close the crawler (and the browser)
|
||||||
|
await crawler.close()
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
await crawl_sequential(urls)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why It’s Good**:
|
||||||
|
|
||||||
|
- **One** browser launch.
|
||||||
|
- Minimal memory usage.
|
||||||
|
- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Parallel Crawling with Browser Reuse
|
||||||
|
|
||||||
|
### 3.1 Overview
|
||||||
|
|
||||||
|
To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
|
||||||
|
|
||||||
|
### 3.2 Example Code
|
||||||
|
|
||||||
|
For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install psutil
|
||||||
|
```
|
||||||
|
|
||||||
|
Then you can run the following code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import psutil
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
__location__ = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
__output__ = os.path.join(__location__, "output")
|
||||||
|
|
||||||
|
# Append parent directory to system path
|
||||||
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
sys.path.append(parent_dir)
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
||||||
|
print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
|
||||||
|
|
||||||
|
# We'll keep track of peak memory usage across all tasks
|
||||||
|
peak_memory = 0
|
||||||
|
process = psutil.Process(os.getpid())
|
||||||
|
|
||||||
|
def log_memory(prefix: str = ""):
|
||||||
|
nonlocal peak_memory
|
||||||
|
current_mem = process.memory_info().rss # in bytes
|
||||||
|
if current_mem > peak_memory:
|
||||||
|
peak_memory = current_mem
|
||||||
|
print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
|
||||||
|
|
||||||
|
# Minimal browser config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False, # corrected from 'verbos=False'
|
||||||
|
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
||||||
|
)
|
||||||
|
crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
# Create the crawler instance
|
||||||
|
crawler = AsyncWebCrawler(config=browser_config)
|
||||||
|
await crawler.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# We'll chunk the URLs in batches of 'max_concurrent'
|
||||||
|
success_count = 0
|
||||||
|
fail_count = 0
|
||||||
|
for i in range(0, len(urls), max_concurrent):
|
||||||
|
batch = urls[i : i + max_concurrent]
|
||||||
|
tasks = []
|
||||||
|
|
||||||
|
for j, url in enumerate(batch):
|
||||||
|
# Unique session_id per concurrent sub-task
|
||||||
|
session_id = f"parallel_session_{i + j}"
|
||||||
|
task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
|
||||||
|
tasks.append(task)
|
||||||
|
|
||||||
|
# Check memory usage prior to launching tasks
|
||||||
|
log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
|
||||||
|
|
||||||
|
# Gather results
|
||||||
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
|
|
||||||
|
# Check memory usage after tasks complete
|
||||||
|
log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
|
||||||
|
|
||||||
|
# Evaluate results
|
||||||
|
for url, result in zip(batch, results):
|
||||||
|
if isinstance(result, Exception):
|
||||||
|
print(f"Error crawling {url}: {result}")
|
||||||
|
fail_count += 1
|
||||||
|
elif result.success:
|
||||||
|
success_count += 1
|
||||||
|
else:
|
||||||
|
fail_count += 1
|
||||||
|
|
||||||
|
print(f"\nSummary:")
|
||||||
|
print(f" - Successfully crawled: {success_count}")
|
||||||
|
print(f" - Failed: {fail_count}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
print("\nClosing crawler...")
|
||||||
|
await crawler.close()
|
||||||
|
# Final memory log
|
||||||
|
log_memory(prefix="Final: ")
|
||||||
|
print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3",
|
||||||
|
"https://example.com/page4"
|
||||||
|
]
|
||||||
|
await crawl_parallel(urls, max_concurrent=2)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notes**:
|
||||||
|
|
||||||
|
- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
|
||||||
|
- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired).
|
||||||
|
- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Performance Tips
|
||||||
|
|
||||||
|
1. **Extra Browser Args**
|
||||||
|
- `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
|
||||||
|
- `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
|
||||||
|
|
||||||
|
2. **Session Reuse**
|
||||||
|
- If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`.
|
||||||
|
- If you want isolation (each URL fresh), create unique sessions.
|
||||||
|
|
||||||
|
3. **Batching**
|
||||||
|
- If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`).
|
||||||
|
- Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible.
|
||||||
|
|
||||||
|
4. **Cache**
|
||||||
|
- If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`.
|
||||||
|
- If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`.
|
||||||
|
|
||||||
|
5. **Hooks**
|
||||||
|
- You can set up global hooks for each crawler (like to block images) or per-run if you want.
|
||||||
|
- Keep them consistent if you’re reusing sessions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Summary
|
||||||
|
|
||||||
|
- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL.
|
||||||
|
- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs.
|
||||||
|
- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead.
|
||||||
|
- Close the crawler once at the end, ensuring the browser is only opened/closed once.
|
||||||
|
|
||||||
|
For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling!
|
||||||
@@ -1,264 +1,205 @@
|
|||||||
# Optimized Multi-URL Crawling
|
# Advanced Multi-URL Crawling with Dispatchers
|
||||||
|
|
||||||
> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated.
|
> **Heads Up**: Crawl4AI supports advanced dispatchers for **parallel** or **throttled** crawling, providing dynamic rate limiting and memory usage checks. The built-in `arun_many()` function uses these dispatchers to handle concurrency efficiently.
|
||||||
|
|
||||||
|
## 1. Introduction
|
||||||
|
|
||||||
Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to:
|
When crawling many URLs:
|
||||||
|
- **Basic**: Use `arun()` in a loop (simple but less efficient)
|
||||||
|
- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
|
||||||
|
- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
|
||||||
|
|
||||||
1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
|
**Why Dispatchers?**
|
||||||
2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
|
- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
|
||||||
|
- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
|
||||||
|
- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
|
||||||
|
- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
|
||||||
|
|
||||||
When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
|
## 2. Core Components
|
||||||
|
|
||||||
---
|
### 2.1 Rate Limiter
|
||||||
|
|
||||||
## 1. Why Avoid Simple Loops per URL?
|
|
||||||
|
|
||||||
If you naively do:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
for url in urls:
|
class RateLimiter:
|
||||||
async with AsyncWebCrawler() as crawler:
|
def __init__(
|
||||||
result = await crawler.arun(url)
|
base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests
|
||||||
|
max_delay: float = 60.0, # Maximum backoff delay
|
||||||
|
max_retries: int = 3, # Retries before giving up
|
||||||
|
rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
You end up:
|
The RateLimiter provides:
|
||||||
|
- Random delays between requests
|
||||||
|
- Exponential backoff on rate limit responses
|
||||||
|
- Domain-specific rate limiting
|
||||||
|
- Automatic retry handling
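To make the backoff behavior concrete, here is a rough sketch of how an exponential backoff delay could be derived from the parameters above (an illustration only, not the library's internal implementation):

```python
import random

def backoff_delay(attempt: int,
                  base_delay=(1.0, 3.0),
                  max_delay: float = 60.0) -> float:
    # Pick a random delay in the base range, double it on each retry,
    # and cap the result at max_delay.
    base = random.uniform(*base_delay)
    return min(base * (2 ** attempt), max_delay)

# e.g. attempt 0 -> ~1-3s, attempt 1 -> ~2-6s, attempt 2 -> ~4-12s, capped at 60s
```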
|
||||||
|
|
||||||
1. Spinning up a **new** browser for each URL
|
### 2.2 Crawler Monitor
|
||||||
2. Closing it immediately after the single crawl
|
|
||||||
3. Potentially using a lot of CPU/memory for short-living browsers
|
|
||||||
4. Missing out on session reusability if you have login or ongoing states
|
|
||||||
|
|
||||||
**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
|
The CrawlerMonitor provides real-time visibility into crawling operations:
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Sequential Crawling with Session Reuse
|
|
||||||
|
|
||||||
### 2.1 Overview
|
|
||||||
|
|
||||||
1. **One** `AsyncWebCrawler` instance for **all** URLs.
|
|
||||||
2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
|
|
||||||
3. The crawler is only closed at the **end**.
|
|
||||||
|
|
||||||
**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
|
|
||||||
|
|
||||||
### 2.2 Example Code
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
monitor = CrawlerMonitor(
|
||||||
from typing import List
|
max_visible_rows=15, # Maximum rows in live display
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
)
|
||||||
|
|
||||||
async def crawl_sequential(urls: List[str]):
|
|
||||||
print("\n=== Sequential Crawling with Session Reuse ===")
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
# For better performance in Docker or low-memory environments:
|
|
||||||
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
||||||
)
|
|
||||||
|
|
||||||
crawl_config = CrawlerRunConfig(
|
|
||||||
markdown_generator=DefaultMarkdownGenerator()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create the crawler (opens the browser)
|
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
await crawler.start()
|
|
||||||
|
|
||||||
try:
|
|
||||||
session_id = "session1" # Reuse the same session across all URLs
|
|
||||||
for url in urls:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url=url,
|
|
||||||
config=crawl_config,
|
|
||||||
session_id=session_id
|
|
||||||
)
|
|
||||||
if result.success:
|
|
||||||
print(f"Successfully crawled: {url}")
|
|
||||||
# E.g. check markdown length
|
|
||||||
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
|
||||||
else:
|
|
||||||
print(f"Failed: {url} - Error: {result.error_message}")
|
|
||||||
finally:
|
|
||||||
# After all URLs are done, close the crawler (and the browser)
|
|
||||||
await crawler.close()
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
urls = [
|
|
||||||
"https://example.com/page1",
|
|
||||||
"https://example.com/page2",
|
|
||||||
"https://example.com/page3"
|
|
||||||
]
|
|
||||||
await crawl_sequential(urls)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(main())
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Why It’s Good**:
|
**Display Modes**:
|
||||||
|
1. **DETAILED**: Shows individual task status, memory usage, and timing
|
||||||
|
2. **AGGREGATED**: Displays summary statistics and overall progress
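For large URL lists where per-task rows would be noisy, the aggregated view is usually enough. A minimal sketch using the same `CrawlerMonitor`/`DisplayMode` API shown above:

```python
monitor = CrawlerMonitor(
    max_visible_rows=10,
    display_mode=DisplayMode.AGGREGATED,  # summary statistics instead of per-task rows
)
```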
|
||||||
|
|
||||||
- **One** browser launch.
|
## 3. Available Dispatchers
|
||||||
- Minimal memory usage.
|
|
||||||
- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
|
|
||||||
|
|
||||||
---
|
### 3.1 MemoryAdaptiveDispatcher (Default)
|
||||||
|
|
||||||
## 3. Parallel Crawling with Browser Reuse
|
Automatically manages concurrency based on system memory usage:
|
||||||
|
|
||||||
### 3.1 Overview
|
|
||||||
|
|
||||||
To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
|
|
||||||
|
|
||||||
### 3.2 Example Code
|
|
||||||
|
|
||||||
For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install psutil
|
|
||||||
```
|
|
||||||
|
|
||||||
Then you can run the following code:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
import sys
|
memory_threshold_percent=70.0, # Pause if memory exceeds this
|
||||||
import psutil
|
check_interval=1.0, # How often to check memory
|
||||||
import asyncio
|
max_session_permit=10, # Maximum concurrent tasks
|
||||||
|
rate_limiter=RateLimiter( # Optional rate limiting
|
||||||
__location__ = os.path.dirname(os.path.abspath(__file__))
|
base_delay=(1.0, 2.0),
|
||||||
__output__ = os.path.join(__location__, "output")
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
# Append parent directory to system path
|
),
|
||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
monitor=CrawlerMonitor( # Optional monitoring
|
||||||
sys.path.append(parent_dir)
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
from typing import List
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
||||||
|
|
||||||
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
|
||||||
print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
|
|
||||||
|
|
||||||
# We'll keep track of peak memory usage across all tasks
|
|
||||||
peak_memory = 0
|
|
||||||
process = psutil.Process(os.getpid())
|
|
||||||
|
|
||||||
def log_memory(prefix: str = ""):
|
|
||||||
nonlocal peak_memory
|
|
||||||
current_mem = process.memory_info().rss # in bytes
|
|
||||||
if current_mem > peak_memory:
|
|
||||||
peak_memory = current_mem
|
|
||||||
print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
|
|
||||||
|
|
||||||
# Minimal browser config
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
verbose=False,
|
|
||||||
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
||||||
)
|
)
|
||||||
crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
)
|
||||||
|
|
||||||
# Create the crawler instance
|
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
await crawler.start()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# We'll chunk the URLs in batches of 'max_concurrent'
|
|
||||||
success_count = 0
|
|
||||||
fail_count = 0
|
|
||||||
for i in range(0, len(urls), max_concurrent):
|
|
||||||
batch = urls[i : i + max_concurrent]
|
|
||||||
tasks = []
|
|
||||||
|
|
||||||
for j, url in enumerate(batch):
|
|
||||||
# Unique session_id per concurrent sub-task
|
|
||||||
session_id = f"parallel_session_{i + j}"
|
|
||||||
task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
|
|
||||||
tasks.append(task)
|
|
||||||
|
|
||||||
# Check memory usage prior to launching tasks
|
|
||||||
log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
|
|
||||||
|
|
||||||
# Gather results
|
|
||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
|
|
||||||
# Check memory usage after tasks complete
|
|
||||||
log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
|
|
||||||
|
|
||||||
# Evaluate results
|
|
||||||
for url, result in zip(batch, results):
|
|
||||||
if isinstance(result, Exception):
|
|
||||||
print(f"Error crawling {url}: {result}")
|
|
||||||
fail_count += 1
|
|
||||||
elif result.success:
|
|
||||||
success_count += 1
|
|
||||||
else:
|
|
||||||
fail_count += 1
|
|
||||||
|
|
||||||
print(f"\nSummary:")
|
|
||||||
print(f" - Successfully crawled: {success_count}")
|
|
||||||
print(f" - Failed: {fail_count}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
print("\nClosing crawler...")
|
|
||||||
await crawler.close()
|
|
||||||
# Final memory log
|
|
||||||
log_memory(prefix="Final: ")
|
|
||||||
print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
urls = [
|
|
||||||
"https://example.com/page1",
|
|
||||||
"https://example.com/page2",
|
|
||||||
"https://example.com/page3",
|
|
||||||
"https://example.com/page4"
|
|
||||||
]
|
|
||||||
await crawl_parallel(urls, max_concurrent=2)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(main())
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Notes**:
|
### 3.2 SemaphoreDispatcher
|
||||||
|
|
||||||
- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
|
Provides simple concurrency control with a fixed limit:
|
||||||
- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired).
|
|
||||||
- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
|
|
||||||
|
|
||||||
---
|
```python
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5, # Fixed concurrent tasks
|
||||||
|
rate_limiter=RateLimiter( # Optional rate limiting
|
||||||
|
base_delay=(0.5, 1.0),
|
||||||
|
max_delay=10.0
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor( # Optional monitoring
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
## 4. Performance Tips
|
## 4. Usage Examples
|
||||||
|
|
||||||
1. **Extra Browser Args**
|
### 4.1 Simple Usage (Default MemoryAdaptiveDispatcher)
|
||||||
- `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
|
|
||||||
- `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
|
|
||||||
|
|
||||||
2. **Session Reuse**
|
```python
|
||||||
- If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`.
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
- If you want isolation (each URL fresh), create unique sessions.
|
results = await crawler.arun_many(urls, config=run_config)
|
||||||
|
```
|
||||||
|
|
||||||
3. **Batching**
|
### 4.2 Memory Adaptive with Rate Limiting
|
||||||
- If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`).
|
|
||||||
- Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible.
|
|
||||||
|
|
||||||
4. **Cache**
|
```python
|
||||||
- If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`.
|
async def crawl_with_memory_adaptive(urls):
|
||||||
- If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`.
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
5. **Hooks**
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
- You can set up global hooks for each crawler (like to block images) or per-run if you want.
|
memory_threshold_percent=70.0,
|
||||||
- Keep them consistent if you’re reusing sessions.
|
max_session_permit=10,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
---
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
```
|
||||||
|
|
||||||
## 5. Summary
|
### 4.3 Semaphore with Rate Limiting
|
||||||
|
|
||||||
- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL.
|
```python
|
||||||
- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs.
|
async def crawl_with_semaphore(urls):
|
||||||
- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead.
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
- Close the crawler once at the end, ensuring the browser is only opened/closed once.
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling!
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.5, 1.0),
|
||||||
|
max_delay=10.0
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. Dispatch Results
|
||||||
|
|
||||||
|
Each crawl result includes dispatch information:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class DispatchResult:
|
||||||
|
task_id: str
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
|
```
|
||||||
|
|
||||||
|
Access via `result.dispatch_result`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for result in results:
|
||||||
|
if result.success:
|
||||||
|
dr = result.dispatch_result
|
||||||
|
print(f"URL: {result.url}")
|
||||||
|
print(f"Memory: {dr.memory_usage:.1f}MB")
|
||||||
|
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 6. Summary
|
||||||
|
|
||||||
|
1. **Two Dispatcher Types**:
|
||||||
|
- MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
|
||||||
|
- SemaphoreDispatcher: Fixed concurrency limit
|
||||||
|
|
||||||
|
2. **Optional Components**:
|
||||||
|
- RateLimiter: Smart request pacing and backoff
|
||||||
|
- CrawlerMonitor: Real-time progress visualization
|
||||||
|
|
||||||
|
3. **Key Benefits**:
|
||||||
|
- Automatic memory management
|
||||||
|
- Built-in rate limiting
|
||||||
|
- Live progress monitoring
|
||||||
|
- Flexible concurrency control
|
||||||
|
|
||||||
|
Choose the dispatcher that best fits your needs:
|
||||||
|
- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
|
||||||
|
- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios
|
||||||
|
|||||||
@@ -1,7 +1,3 @@
|
|||||||
Below is a **revised parameter guide** for **`arun()`** in **AsyncWebCrawler**, reflecting the **new** approach where all parameters are passed via a **`CrawlerRunConfig`** instead of directly to `arun()`. Each section includes example usage in the new style, ensuring a clear, modern approach.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# `arun()` Parameter Guide (New Approach)
|
# `arun()` Parameter Guide (New Approach)
|
||||||
|
|
||||||
In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
|
In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
|
||||||
|
|||||||
100
docs/md_v2/api/arun_many.md
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# `arun_many(...)` Reference
|
||||||
|
|
||||||
|
> **Note**: This function is very similar to [`arun()`](./arun.md) but focused on **concurrent** or **batch** crawling. If you’re unfamiliar with `arun()` usage, please read that doc first, then review this for differences.
|
||||||
|
|
||||||
|
## Function Signature
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def arun_many(
|
||||||
|
urls: Union[List[str], List[Any]],
|
||||||
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
|
...
|
||||||
|
) -> List[CrawlResult]:
|
||||||
|
"""
|
||||||
|
Crawl multiple URLs concurrently or in batches.
|
||||||
|
|
||||||
|
:param urls: A list of URLs (or tasks) to crawl.
|
||||||
|
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||||
|
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||||
|
...
|
||||||
|
:return: A list of `CrawlResult` objects, one per URL.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
## Differences from `arun()`
|
||||||
|
|
||||||
|
1. **Multiple URLs**:
|
||||||
|
- Instead of crawling a single URL, you pass a list of them (strings or tasks).
|
||||||
|
- The function returns a **list** of `CrawlResult`, in the same order as `urls`.
|
||||||
|
|
||||||
|
2. **Concurrency & Dispatchers**:
|
||||||
|
- **`dispatcher`** param allows advanced concurrency control.
|
||||||
|
- If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
|
||||||
|
- Dispatchers handle concurrency, rate limiting, and memory-based adaptive throttling (see [Multi-URL Crawling](../advanced/multi-url-crawling.md)).
|
||||||
|
|
||||||
|
3. **Parallel Execution**:
|
||||||
|
- `arun_many()` can run multiple requests concurrently under the hood.
|
||||||
|
- Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).
|
||||||
|
|
||||||
|
### Basic Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Minimal usage: The default dispatcher will be used
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=["https://site1.com", "https://site2.com"],
|
||||||
|
config=my_run_config
|
||||||
|
)
|
||||||
|
|
||||||
|
for res in results:
|
||||||
|
if res.success:
|
||||||
|
print(res.url, "crawled OK!")
|
||||||
|
else:
|
||||||
|
print("Failed:", res.url, "-", res.error_message)
|
||||||
|
```
|
||||||
|
|
||||||
|
### With a Custom Dispatcher
|
||||||
|
|
||||||
|
```python
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=["https://site1.com", "https://site2.com", "https://site3.com"],
|
||||||
|
config=my_run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points**:
|
||||||
|
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||||
|
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||||
|
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||||
|
|
||||||
|
### Return Value
|
||||||
|
|
||||||
|
A **list** of [`CrawlResult`](./crawl-result.md) objects, one per URL. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dispatcher Reference
|
||||||
|
|
||||||
|
- **`MemoryAdaptiveDispatcher`**: Dynamically manages concurrency based on system memory usage.
|
||||||
|
- **`SemaphoreDispatcher`**: Fixed concurrency limit, simpler but less adaptive.
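
For example, a fixed-concurrency run with `SemaphoreDispatcher` looks much like the custom-dispatcher example above (a minimal sketch, reusing the same `crawler` and `my_run_config`):

```python
from crawl4ai import SemaphoreDispatcher

dispatcher = SemaphoreDispatcher(semaphore_count=3)  # at most 3 concurrent crawls
results = await crawler.arun_many(
    urls=["https://site1.com", "https://site2.com", "https://site3.com"],
    config=my_run_config,
    dispatcher=dispatcher
)
```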
|
||||||
|
|
||||||
|
For advanced usage or custom settings, see [Multi-URL Crawling with Dispatchers](../advanced/multi-url-crawling.md).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Pitfalls
|
||||||
|
|
||||||
|
1. **Large Lists**: If you pass thousands of URLs, be mindful of memory or rate-limits. A dispatcher can help.
|
||||||
|
2. **Session Reuse**: If you need specialized logins or persistent contexts, ensure your dispatcher or tasks handle sessions accordingly.
|
||||||
|
3. **Error Handling**: Each `CrawlResult` might fail for different reasons—always check `result.success` or the `error_message` before proceeding.
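
A small sketch of defensive handling for the pitfalls above: check each result and collect the failures for a later retry pass. The `process()` helper and the retry step are illustrative, not built-in features.

```python
failed_urls = []

results = await crawler.arun_many(urls=urls, config=my_run_config)
for res in results:
    if res.success:
        process(res)  # your own handling of the successful CrawlResult
    else:
        print(f"[RETRY LATER] {res.url}: {res.error_message}")
        failed_urls.append(res.url)

# Optionally re-run only the failures, e.g. with a gentler rate limiter
if failed_urls:
    retry_results = await crawler.arun_many(urls=failed_urls, config=my_run_config)
```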
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Use `arun_many()` when you want to **crawl multiple URLs** simultaneously or in controlled parallel tasks. If you need advanced concurrency features (like memory-based adaptive throttling or complex rate-limiting), provide a **dispatcher**. Each result is a standard `CrawlResult`, possibly augmented with concurrency stats (`dispatch_result`) for deeper inspection. For more details on concurrency logic and dispatchers, see the [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) docs.
|
||||||
@@ -130,51 +130,88 @@ For **backward** compatibility, `arun()` can still accept direct arguments like
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 4. Helper Methods
|
## 4. Batch Processing: `arun_many()`
|
||||||
|
|
||||||
### 4.1 `arun_many()`
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
# Legacy parameters...
|
# Legacy parameters maintained for backwards compatibility...
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
...
|
"""
|
||||||
|
Process multiple URLs with intelligent rate limiting and resource monitoring.
|
||||||
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Crawls multiple URLs in concurrency. Accepts the same style `CrawlerRunConfig`. Example:
|
### 4.1 Resource-Aware Crawling
|
||||||
|
|
||||||
|
The `arun_many()` method now uses an intelligent dispatcher that:
|
||||||
|
- Monitors system memory usage
|
||||||
|
- Implements adaptive rate limiting
|
||||||
|
- Provides detailed progress monitoring
|
||||||
|
- Manages concurrent crawls efficiently
|
||||||
|
|
||||||
|
### 4.2 Example Usage
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||||
|
from crawl4ai.dispatcher import DisplayMode
|
||||||
|
|
||||||
|
# Configure browser
|
||||||
|
browser_cfg = BrowserConfig(headless=True)
|
||||||
|
|
||||||
|
# Configure crawler with rate limiting
|
||||||
run_cfg = CrawlerRunConfig(
|
run_cfg = CrawlerRunConfig(
|
||||||
# e.g., concurrency, wait_for, caching, extraction, etc.
|
# Enable rate limiting
|
||||||
semaphore_count=5
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds
|
||||||
|
max_delay=30.0, # Maximum delay after rate limit hits
|
||||||
|
max_retries=2, # Number of retries before giving up
|
||||||
|
rate_limit_codes=[429, 503] # Status codes that trigger rate limiting
|
||||||
|
),
|
||||||
|
# Resource monitoring
|
||||||
|
memory_threshold_percent=70.0, # Pause if memory exceeds this
|
||||||
|
check_interval=0.5, # How often to check resources
|
||||||
|
max_session_permit=3, # Maximum concurrent crawls
|
||||||
|
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
||||||
)
|
)
|
||||||
|
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
results = await crawler.arun_many(
|
results = await crawler.arun_many(urls, config=run_cfg)
|
||||||
urls=["https://example.com", "https://another.com"],
|
for result in results:
|
||||||
config=run_cfg
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
)
|
|
||||||
for r in results:
|
|
||||||
print(r.url, ":", len(r.cleaned_html))
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.2 `start()` & `close()`
|
### 4.3 Key Features
|
||||||
|
|
||||||
Allows manual lifecycle usage instead of context manager:
|
1. **Rate Limiting**
|
||||||
|
- Automatic delay between requests
|
||||||
|
- Exponential backoff on rate limit detection
|
||||||
|
- Domain-specific rate limiting
|
||||||
|
- Configurable retry strategy
|
||||||
|
|
||||||
```python
|
2. **Resource Monitoring**
|
||||||
crawler = AsyncWebCrawler(config=browser_cfg)
|
- Memory usage tracking
|
||||||
await crawler.start()
|
- Adaptive concurrency based on system load
|
||||||
|
- Automatic pausing when resources are constrained
|
||||||
|
|
||||||
# Perform multiple operations
|
3. **Progress Monitoring**
|
||||||
resultA = await crawler.arun("https://exampleA.com", config=run_cfg)
|
- Detailed or aggregated progress display
|
||||||
resultB = await crawler.arun("https://exampleB.com", config=run_cfg)
|
- Real-time status updates
|
||||||
|
- Memory usage statistics
|
||||||
|
|
||||||
await crawler.close()
|
4. **Error Handling**
|
||||||
```
|
- Graceful handling of rate limits
|
||||||
|
- Automatic retries with backoff
|
||||||
|
- Detailed error reporting
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class CrawlResult(BaseModel):
|
|||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -262,7 +263,31 @@ if result.metadata:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 6. Example: Accessing Everything
|
## 6. `dispatch_result` (optional)
|
||||||
|
|
||||||
|
A `DispatchResult` object providing additional concurrency and resource usage information when crawling URLs in parallel (e.g., via `arun_many()` with custom dispatchers). It contains:
|
||||||
|
|
||||||
|
- **`task_id`**: A unique identifier for the parallel task.
|
||||||
|
- **`memory_usage`** (float): The memory (in MB) used at the time of completion.
|
||||||
|
- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task’s execution.
|
||||||
|
- **`start_time`** / **`end_time`** (datetime): Time range for this crawling task.
|
||||||
|
- **`error_message`** (str): Any dispatcher- or concurrency-related error encountered.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example usage:
|
||||||
|
for result in results:
|
||||||
|
if result.success and result.dispatch_result:
|
||||||
|
dr = result.dispatch_result
|
||||||
|
print(f"URL: {result.url}, Task ID: {dr.task_id}")
|
||||||
|
print(f"Memory: {dr.memory_usage:.1f} MB (Peak: {dr.peak_memory:.1f} MB)")
|
||||||
|
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note**: This field is typically populated when using `arun_many(...)` alongside a **dispatcher** (e.g., `MemoryAdaptiveDispatcher` or `SemaphoreDispatcher`). If no concurrency or dispatcher is used, `dispatch_result` may remain `None`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Example: Accessing Everything
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async def handle_result(result: CrawlResult):
|
async def handle_result(result: CrawlResult):
|
||||||
@@ -306,7 +331,7 @@ async def handle_result(result: CrawlResult):
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Key Points & Future
|
## 8. Key Points & Future
|
||||||
|
|
||||||
1. **`markdown_v2` vs `markdown`**
|
1. **`markdown_v2` vs `markdown`**
|
||||||
- Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
|
- Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
|
||||||
|
|||||||
@@ -157,7 +157,32 @@ Use these for link-level content filtering (often to keep crawls “internal”
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### G) **Debug & Logging**
|
### G) **Rate Limiting & Resource Management**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
|
||||||
|
| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
|
||||||
|
|
||||||
|
The `RateLimitConfig` class has these fields:
|
||||||
|
|
||||||
|
| **Field** | **Type / Default** | **What It Does** |
|
||||||
|
|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
|
||||||
|
| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
|
||||||
|
| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
|
||||||
|
| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
|
||||||
|
| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
|
||||||
|
| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
|
||||||
|
| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
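
Putting the two tables together, a run config using these fields might look like this (a sketch mirroring the defaults listed above):

```python
from crawl4ai import CrawlerRunConfig, RateLimitConfig

run_cfg = CrawlerRunConfig(
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),
        max_delay=60.0,
        max_retries=3,
        rate_limit_codes=[429, 503],
    ),
    memory_threshold_percent=70.0,  # pause new crawls above this memory usage
    check_interval=1.0,             # resource check frequency in seconds
    max_session_permit=20,          # cap on concurrent crawl sessions
    display_mode="DETAILED",        # or "AGGREGATED"
)
```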
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### H) **Debug & Logging**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------|--------------------|---------------------------------------------------------------------------|
|
|----------------|--------------------|---------------------------------------------------------------------------|
|
||||||
@@ -170,7 +195,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Configure the browser
|
# Configure the browser
|
||||||
@@ -190,7 +215,18 @@ async def main():
|
|||||||
excluded_tags=["script", "style"],
|
excluded_tags=["script", "style"],
|
||||||
exclude_external_links=True,
|
exclude_external_links=True,
|
||||||
wait_for="css:.article-loaded",
|
wait_for="css:.article-loaded",
|
||||||
screenshot=True
|
screenshot=True,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
),
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=20,
|
||||||
|
display_mode="DETAILED"
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
@@ -223,4 +259,3 @@ if __name__ == "__main__":
|
|||||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||||
|
|
||||||
|
|||||||
@@ -116,6 +116,12 @@ class CrawlerRunConfig:
|
|||||||
wait_for=None,
|
wait_for=None,
|
||||||
screenshot=False,
|
screenshot=False,
|
||||||
pdf=False,
|
pdf=False,
|
||||||
|
enable_rate_limiting=False,
|
||||||
|
rate_limit_config=None,
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=20,
|
||||||
|
display_mode=None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
# ... other advanced parameters omitted
|
# ... other advanced parameters omitted
|
||||||
):
|
):
|
||||||
@@ -156,6 +162,58 @@ class CrawlerRunConfig:
|
|||||||
- Logs additional runtime details.
|
- Logs additional runtime details.
|
||||||
- Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
|
- Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
|
||||||
|
|
||||||
|
9. **`enable_rate_limiting`**:
|
||||||
|
- If `True`, enables rate limiting for batch processing.
|
||||||
|
- Requires `rate_limit_config` to be set.
|
||||||
|
|
||||||
|
10. **`rate_limit_config`**:
|
||||||
|
- A `RateLimitConfig` object controlling rate limiting behavior.
|
||||||
|
- See below for details.
|
||||||
|
|
||||||
|
11. **`memory_threshold_percent`**:
|
||||||
|
- The memory threshold (as a percentage) to monitor.
|
||||||
|
- If exceeded, the crawler will pause or slow down.
|
||||||
|
|
||||||
|
12. **`check_interval`**:
|
||||||
|
- The interval (in seconds) to check system resources.
|
||||||
|
- Affects how often memory and CPU usage are monitored.
|
||||||
|
|
||||||
|
13. **`max_session_permit`**:
|
||||||
|
- The maximum number of concurrent crawl sessions.
|
||||||
|
- Helps prevent overwhelming the system.
|
||||||
|
|
||||||
|
14. **`display_mode`**:
|
||||||
|
- The display mode for progress information (`DETAILED` or `AGGREGATED`).
|
||||||
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
|
### Rate Limiting & Resource Management
|
||||||
|
|
||||||
|
For batch processing with `arun_many()`, you can enable intelligent rate limiting:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import RateLimitConfig
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0), # Random delay range
|
||||||
|
max_delay=60.0, # Max delay after rate limits
|
||||||
|
max_retries=3, # Retries before giving up
|
||||||
|
rate_limit_codes=[429, 503] # Status codes to watch
|
||||||
|
),
|
||||||
|
memory_threshold_percent=70.0, # Memory threshold
|
||||||
|
check_interval=1.0, # Resource check interval
|
||||||
|
max_session_permit=20, # Max concurrent crawls
|
||||||
|
display_mode="DETAILED" # Progress display mode
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This configuration:
|
||||||
|
- Implements intelligent rate limiting per domain
|
||||||
|
- Monitors system resources
|
||||||
|
- Provides detailed progress information
|
||||||
|
- Manages concurrent crawls efficiently
|
||||||
|
|
||||||
**Minimal Example**:
|
**Minimal Example**:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -164,7 +222,14 @@ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|||||||
crawl_conf = CrawlerRunConfig(
|
crawl_conf = CrawlerRunConfig(
|
||||||
js_code="document.querySelector('button#loadMore')?.click()",
|
js_code="document.querySelector('button#loadMore')?.click()",
|
||||||
wait_for="css:.loaded-content",
|
wait_for="css:.loaded-content",
|
||||||
screenshot=True
|
screenshot=True,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -205,7 +270,14 @@ async def main():
|
|||||||
# 3) Crawler run config: skip cache, use extraction
|
# 3) Crawler run config: skip cache, use extraction
|
||||||
run_conf = CrawlerRunConfig(
|
run_conf = CrawlerRunConfig(
|
||||||
extraction_strategy=extraction,
|
extraction_strategy=extraction,
|
||||||
cache_mode=CacheMode.BYPASS
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
||||||
|
|||||||
@@ -1,7 +1,3 @@
|
|||||||
Below is the **revised Quickstart** guide with the **Installation** section removed, plus an updated **dynamic content** crawl example that uses `BrowserConfig` and `CrawlerRunConfig` (instead of passing parameters directly to `arun()`). Everything else remains as before.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Getting Started with Crawl4AI
|
# Getting Started with Crawl4AI
|
||||||
|
|
||||||
Welcome to **Crawl4AI**, an open-source LLM-friendly Web Crawler & Scraper. In this tutorial, you’ll:
|
Welcome to **Crawl4AI**, an open-source LLM-friendly Web Crawler & Scraper. In this tutorial, you’ll:
|
||||||
@@ -254,7 +250,39 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Dynamic Content Example
|
## 7. Multi-URL Concurrency (Preview)
|
||||||
|
|
||||||
|
If you need to crawl multiple URLs in **parallel**, you can use `arun_many()`. By default, Crawl4AI employs a **MemoryAdaptiveDispatcher**, automatically adjusting concurrency based on system resources. Here’s a quick glimpse:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def quick_parallel_example():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
|
||||||
|
run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results = await crawler.arun_many(urls, config=run_conf)
|
||||||
|
for res in results:
|
||||||
|
if res.success:
|
||||||
|
print(f"[OK] {res.url}, length: {len(res.markdown_v2.raw_markdown)}")
|
||||||
|
else:
|
||||||
|
print(f"[ERROR] {res.url} => {res.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(quick_parallel_example())
|
||||||
|
```
|
||||||
|
|
||||||
|
For more advanced concurrency (e.g., a **semaphore-based** approach, **adaptive memory usage throttling**, or customized rate limiting), see [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md).
|
||||||
|
|
||||||
|
|
||||||
|
## 8. Dynamic Content Example
|
||||||
|
|
||||||
Some sites require multiple “page clicks” or dynamic JavaScript updates. Below is an example showing how to **click** a “Next Page” button and wait for new commits to load on GitHub, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
|
Some sites require multiple “page clicks” or dynamic JavaScript updates. Below is an example showing how to **click** a “Next Page” button and wait for new commits to load on GitHub, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
|
||||||
|
|
||||||
@@ -343,7 +371,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 8. Next Steps
|
## 9. Next Steps
|
||||||
|
|
||||||
Congratulations! You have:
|
Congratulations! You have:
|
||||||
|
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ nav:
|
|||||||
- API Reference:
|
- API Reference:
|
||||||
- "AsyncWebCrawler": "api/async-webcrawler.md"
|
- "AsyncWebCrawler": "api/async-webcrawler.md"
|
||||||
- "arun()": "api/arun.md"
|
- "arun()": "api/arun.md"
|
||||||
|
- "arun_many()": "api/arun_many.md"
|
||||||
- "Browser & Crawler Config": "api/parameters.md"
|
- "Browser & Crawler Config": "api/parameters.md"
|
||||||
- "CrawlResult": "api/crawl-result.md"
|
- "CrawlResult": "api/crawl-result.md"
|
||||||
- "Strategies": "api/strategies.md"
|
- "Strategies": "api/strategies.md"
|
||||||
|
|||||||
147
tests/async/test_dispatchers.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
import pytest
|
||||||
|
import asyncio, time
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||||
|
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
|
||||||
|
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def browser_config():
|
||||||
|
return BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def run_config():
|
||||||
|
return CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def test_urls():
|
||||||
|
return [
|
||||||
|
"http://example.com",
|
||||||
|
"http://example.com/page1",
|
||||||
|
"http://example.com/page2"
|
||||||
|
]
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
class TestDispatchStrategies:
|
||||||
|
|
||||||
|
async def test_memory_adaptive_basic(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_memory_adaptive_with_rate_limit(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_semaphore_basic(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=2
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_semaphore_with_rate_limit(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=2,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_memory_adaptive_memory_error(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=1.0, # Set unrealistically low threshold
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1,
|
||||||
|
memory_wait_timeout=1.0 # Short timeout for testing
|
||||||
|
)
|
||||||
|
with pytest.raises(MemoryError):
|
||||||
|
await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
|
||||||
|
async def test_empty_urls(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many([], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 0
|
||||||
|
|
||||||
|
async def test_single_url(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many(["http://example.com"], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0].success
|
||||||
|
|
||||||
|
async def test_invalid_urls(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many(["http://invalid.url.that.doesnt.exist"], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert not results[0].success
|
||||||
|
|
||||||
|
async def test_rate_limit_backoff(self, browser_config, run_config):
|
||||||
|
urls = ["http://example.com"] * 5 # Multiple requests to same domain
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
max_session_permit=2,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2,
|
||||||
|
rate_limit_codes=[200] # Force rate limiting for testing
|
||||||
|
)
|
||||||
|
)
|
||||||
|
start_time = time.time()
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.time() - start_time
|
||||||
|
assert len(results) == len(urls)
|
||||||
|
assert duration > 1.0 # Ensure rate limiting caused delays
|
||||||
|
|
||||||
|
async def test_monitor_integration(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
monitor = CrawlerMonitor(max_visible_rows=5, display_mode=DisplayMode.DETAILED)
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
max_session_permit=2,
|
||||||
|
monitor=monitor
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
# Check monitor stats
|
||||||
|
assert len(monitor.stats) == len(test_urls)
|
||||||
|
assert all(stat.end_time is not None for stat in monitor.stats.values())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v", "--asyncio-mode=auto"])
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import asyncio
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
|
||||||
from crawl4ai.dispatcher import DisplayMode
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_with_dispatcher():
|
|
||||||
# Create test URLs
|
|
||||||
urls = [f"https://example.com/page_{i}" for i in range(5)]
|
|
||||||
|
|
||||||
# Configure browser
|
|
||||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
|
||||||
|
|
||||||
# Configure crawler with rate limiting
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 2.0),
|
|
||||||
max_delay=30.0,
|
|
||||||
max_retries=2,
|
|
||||||
rate_limit_codes=[429, 503]
|
|
||||||
),
|
|
||||||
memory_threshold_percent=70.0,
|
|
||||||
check_interval=0.5,
|
|
||||||
max_session_permit=3,
|
|
||||||
display_mode=DisplayMode.DETAILED.value
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
results = await crawler.arun_many(urls, config=run_config)
|
|
||||||
|
|
||||||
# Basic validation
|
|
||||||
assert len(results) == len(urls)
|
|
||||||
for result in results:
|
|
||||||
assert result is not None
|
|
||||||
# Note: example.com URLs will fail, which is expected for this test
|
|
||||||
assert not result.success # We expect these to fail since they're fake URLs
|
|
||||||