feat(crawler): add memory-adaptive dispatcher with rate limiting
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities.

Changes include:
- Added RateLimitConfig dataclass for configuring rate limiting behavior
- Extended CrawlerRunConfig with dispatcher-related settings
- Refactored arun_many to use the new dispatcher system
- Added memory threshold and session permit controls
- Integrated optional progress monitoring display

BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior.
This commit is contained in:
@@ -11,8 +11,8 @@ from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from typing import Union, List
|
||||
|
||||
from typing import Union, List, Tuple, Optional
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
@@ -184,6 +184,14 @@ class BrowserConfig:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class RateLimitConfig:
    """Container for rate-limiting settings.

    The dataclass itself only stores and sanity-checks the values; how they
    are applied is up to whatever consumes the config.

    Attributes:
        base_delay: Two-element ``(low, high)`` pair, default ``(1.0, 3.0)``.
            Presumably the bounds, in seconds, of the delay applied between
            requests -- TODO confirm against the consumer.
        max_delay: Default ``60.0``. Presumably the upper cap, in seconds,
            on any delay -- TODO confirm against the consumer.
        max_retries: Default ``3``. Presumably the number of retries allowed
            after a rate-limited response -- TODO confirm against the consumer.
        rate_limit_codes: Status codes, default ``[429, 503]``. Presumably
            the HTTP responses treated as "rate limited" -- TODO confirm.
            Each instance gets its own list, so mutating one config never
            leaks into another.

    Raises:
        ValueError: If ``base_delay`` does not have exactly two elements, or
            if any delay or ``max_retries`` is negative.
    """

    base_delay: Tuple[float, float] = (1.0, 3.0)
    max_delay: float = 60.0
    max_retries: int = 3
    # default_factory avoids sharing one mutable list across all instances.
    rate_limit_codes: List[int] = field(default_factory=lambda: [429, 503])

    def __post_init__(self) -> None:
        """Reject values that cannot describe a delay or a retry count."""
        # Fail here, at construction time, instead of deep inside a crawl.
        if len(self.base_delay) != 2:
            raise ValueError(
                "base_delay must contain exactly two values (low, high), "
                f"got {self.base_delay!r}"
            )
        if any(bound < 0 for bound in self.base_delay):
            raise ValueError(
                f"base_delay values must be non-negative, got {self.base_delay!r}"
            )
        if self.max_delay < 0:
            raise ValueError(
                f"max_delay must be non-negative, got {self.max_delay!r}"
            )
        if self.max_retries < 0:
            raise ValueError(
                f"max_retries must be non-negative, got {self.max_retries!r}"
            )
|
||||
|
||||
|
||||
class CrawlerRunConfig:
|
||||
"""
|
||||
Configuration class for controlling how the crawler runs each crawl operation.
|
||||
@@ -311,6 +319,15 @@ class CrawlerRunConfig:
|
||||
Default: True.
|
||||
log_console (bool): If True, log console messages from the page.
|
||||
Default: False.
|
||||
|
||||
# Dispatcher configuration
|
||||
memory_threshold_percent: float = 70.0
|
||||
check_interval: float = 1.0
|
||||
max_session_permit: int = 20
|
||||
enable_rate_limiting: bool = False
|
||||
rate_limit_config: Optional[RateLimitConfig] = None
|
||||
display_mode: Optional[str] = None
|
||||
url: str = None
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -383,6 +400,13 @@ class CrawlerRunConfig:
|
||||
verbose: bool = True,
|
||||
log_console: bool = False,
|
||||
|
||||
# Dispatcher configuration
|
||||
memory_threshold_percent: float = 70.0,
|
||||
check_interval: float = 1.0,
|
||||
max_session_permit: int = 20,
|
||||
enable_rate_limiting: bool = False,
|
||||
rate_limit_config: Optional[RateLimitConfig] = None,
|
||||
display_mode: Optional[str] = None,
|
||||
url: str = None,
|
||||
):
|
||||
self.url = url
|
||||
@@ -455,6 +479,14 @@ class CrawlerRunConfig:
|
||||
self.verbose = verbose
|
||||
self.log_console = log_console
|
||||
|
||||
# Dispatcher configuration
|
||||
self.memory_threshold_percent = memory_threshold_percent
|
||||
self.check_interval = check_interval
|
||||
self.max_session_permit = max_session_permit
|
||||
self.enable_rate_limiting = enable_rate_limiting
|
||||
self.rate_limit_config = rate_limit_config
|
||||
self.display_mode = display_mode
|
||||
|
||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||
if self.extraction_strategy is not None and not isinstance(
|
||||
self.extraction_strategy, ExtractionStrategy
|
||||
@@ -541,6 +573,13 @@ class CrawlerRunConfig:
|
||||
verbose=kwargs.get("verbose", True),
|
||||
log_console=kwargs.get("log_console", False),
|
||||
|
||||
# Dispatcher configuration
|
||||
memory_threshold_percent=kwargs.get("memory_threshold_percent", 70.0),
|
||||
check_interval=kwargs.get("check_interval", 1.0),
|
||||
max_session_permit=kwargs.get("max_session_permit", 20),
|
||||
enable_rate_limiting=kwargs.get("enable_rate_limiting", False),
|
||||
rate_limit_config=kwargs.get("rate_limit_config"),
|
||||
display_mode=kwargs.get("display_mode"),
|
||||
url=kwargs.get("url"),
|
||||
)
|
||||
|
||||
@@ -599,5 +638,11 @@ class CrawlerRunConfig:
|
||||
"exclude_domains": self.exclude_domains,
|
||||
"verbose": self.verbose,
|
||||
"log_console": self.log_console,
|
||||
"memory_threshold_percent": self.memory_threshold_percent,
|
||||
"check_interval": self.check_interval,
|
||||
"max_session_permit": self.max_session_permit,
|
||||
"enable_rate_limiting": self.enable_rate_limiting,
|
||||
"rate_limit_config": self.rate_limit_config,
|
||||
"display_mode": self.display_mode,
|
||||
"url": self.url,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user