feat(crawler): add memory-adaptive dispatcher with rate limiting
Implements a new MemoryAdaptiveDispatcher class to manage concurrent crawling operations with memory monitoring and rate limiting capabilities. Changes include: - Added RateLimitConfig dataclass for configuring rate limiting behavior - Extended CrawlerRunConfig with dispatcher-related settings - Refactored arun_many to use the new dispatcher system - Added memory threshold and session permit controls - Integrated optional progress monitoring display BREAKING CHANGE: The arun_many method now uses MemoryAdaptiveDispatcher by default, which may affect concurrent crawling behavior
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -225,4 +225,5 @@ tree.md
|
|||||||
.scripts
|
.scripts
|
||||||
.local
|
.local
|
||||||
.do
|
.do
|
||||||
/plans
|
/plans
|
||||||
|
todo/
|
||||||
@@ -11,8 +11,8 @@ from .user_agent_generator import UserAgentGenerator
|
|||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy
|
from .chunking_strategy import ChunkingStrategy
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
from typing import Union, List
|
from typing import Union, List, Tuple, Optional
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -184,6 +184,14 @@ class BrowserConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RateLimitConfig:
|
||||||
|
base_delay: Tuple[float, float] = (1.0, 3.0)
|
||||||
|
max_delay: float = 60.0
|
||||||
|
max_retries: int = 3
|
||||||
|
rate_limit_codes: List[int] = field(default_factory=lambda: [429, 503])
|
||||||
|
|
||||||
|
|
||||||
class CrawlerRunConfig:
|
class CrawlerRunConfig:
|
||||||
"""
|
"""
|
||||||
Configuration class for controlling how the crawler runs each crawl operation.
|
Configuration class for controlling how the crawler runs each crawl operation.
|
||||||
@@ -311,6 +319,15 @@ class CrawlerRunConfig:
|
|||||||
Default: True.
|
Default: True.
|
||||||
log_console (bool): If True, log console messages from the page.
|
log_console (bool): If True, log console messages from the page.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
|
||||||
|
# Dispatcher configuration
|
||||||
|
memory_threshold_percent: float = 70.0
|
||||||
|
check_interval: float = 1.0
|
||||||
|
max_session_permit: int = 20
|
||||||
|
enable_rate_limiting: bool = False
|
||||||
|
rate_limit_config: Optional[RateLimitConfig] = None
|
||||||
|
display_mode: Optional[str] = None
|
||||||
|
url: str = None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -383,6 +400,13 @@ class CrawlerRunConfig:
|
|||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
|
|
||||||
|
# Dispatcher configuration
|
||||||
|
memory_threshold_percent: float = 70.0,
|
||||||
|
check_interval: float = 1.0,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
enable_rate_limiting: bool = False,
|
||||||
|
rate_limit_config: Optional[RateLimitConfig] = None,
|
||||||
|
display_mode: Optional[str] = None,
|
||||||
url: str = None,
|
url: str = None,
|
||||||
):
|
):
|
||||||
self.url = url
|
self.url = url
|
||||||
@@ -455,6 +479,14 @@ class CrawlerRunConfig:
|
|||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.log_console = log_console
|
self.log_console = log_console
|
||||||
|
|
||||||
|
# Dispatcher configuration
|
||||||
|
self.memory_threshold_percent = memory_threshold_percent
|
||||||
|
self.check_interval = check_interval
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
self.enable_rate_limiting = enable_rate_limiting
|
||||||
|
self.rate_limit_config = rate_limit_config
|
||||||
|
self.display_mode = display_mode
|
||||||
|
|
||||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||||
if self.extraction_strategy is not None and not isinstance(
|
if self.extraction_strategy is not None and not isinstance(
|
||||||
self.extraction_strategy, ExtractionStrategy
|
self.extraction_strategy, ExtractionStrategy
|
||||||
@@ -541,6 +573,13 @@ class CrawlerRunConfig:
|
|||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
|
|
||||||
|
# Dispatcher configuration
|
||||||
|
memory_threshold_percent=kwargs.get("memory_threshold_percent", 70.0),
|
||||||
|
check_interval=kwargs.get("check_interval", 1.0),
|
||||||
|
max_session_permit=kwargs.get("max_session_permit", 20),
|
||||||
|
enable_rate_limiting=kwargs.get("enable_rate_limiting", False),
|
||||||
|
rate_limit_config=kwargs.get("rate_limit_config"),
|
||||||
|
display_mode=kwargs.get("display_mode"),
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -599,5 +638,11 @@ class CrawlerRunConfig:
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
|
"memory_threshold_percent": self.memory_threshold_percent,
|
||||||
|
"check_interval": self.check_interval,
|
||||||
|
"max_session_permit": self.max_session_permit,
|
||||||
|
"enable_rate_limiting": self.enable_rate_limiting,
|
||||||
|
"rate_limit_config": self.rate_limit_config,
|
||||||
|
"display_mode": self.display_mode,
|
||||||
"url": self.url,
|
"url": self.url,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -690,7 +690,7 @@ class AsyncWebCrawler:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently.
|
Runs the crawler for multiple URLs concurrently using MemoryAdaptiveDispatcher.
|
||||||
|
|
||||||
Migration Guide:
|
Migration Guide:
|
||||||
Old way (deprecated):
|
Old way (deprecated):
|
||||||
@@ -700,35 +700,35 @@ class AsyncWebCrawler:
|
|||||||
screenshot=True,
|
screenshot=True,
|
||||||
...
|
...
|
||||||
)
|
)
|
||||||
|
|
||||||
New way (recommended):
|
New way (recommended):
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold=200,
|
word_count_threshold=200,
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(...),
|
||||||
...
|
...
|
||||||
)
|
)
|
||||||
results = await crawler.arun_many(urls, crawler_config=config)
|
results = await crawler.arun_many(urls, config=config)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: List of URLs to crawl
|
urls: List of URLs to crawl
|
||||||
crawler_config: Configuration object controlling crawl behavior for all URLs
|
config: Configuration object controlling crawl behavior for all URLs
|
||||||
[other parameters maintained for backwards compatibility]
|
[other parameters maintained for backwards compatibility]
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[CrawlResult]: Results for each URL
|
List[CrawlResult]: Results for each URL
|
||||||
"""
|
"""
|
||||||
crawler_config = config
|
|
||||||
# Handle configuration
|
# Handle configuration
|
||||||
if crawler_config is not None:
|
if config is not None:
|
||||||
if any(param is not None for param in [
|
if any(param is not None for param in [
|
||||||
word_count_threshold, extraction_strategy, chunking_strategy,
|
word_count_threshold, extraction_strategy, chunking_strategy,
|
||||||
content_filter, cache_mode, css_selector, screenshot, pdf
|
content_filter, cache_mode, css_selector, screenshot, pdf
|
||||||
]):
|
]):
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
|
message="Both config and legacy parameters provided. config will take precedence.",
|
||||||
tag="WARNING"
|
tag="WARNING"
|
||||||
)
|
)
|
||||||
config = crawler_config
|
|
||||||
else:
|
else:
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
# Merge all parameters into a single kwargs dict for config creation
|
||||||
config_kwargs = {
|
config_kwargs = {
|
||||||
@@ -758,67 +758,31 @@ class AsyncWebCrawler:
|
|||||||
if config.cache_mode is None:
|
if config.cache_mode is None:
|
||||||
config.cache_mode = CacheMode.BYPASS
|
config.cache_mode = CacheMode.BYPASS
|
||||||
|
|
||||||
semaphore_count = config.semaphore_count or 5
|
from .dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
|
||||||
semaphore = asyncio.Semaphore(semaphore_count)
|
|
||||||
|
|
||||||
async def crawl_with_semaphore(url):
|
# Create dispatcher with configuration from CrawlerRunConfig
|
||||||
# Handle rate limiting per domain
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
domain = urlparse(url).netloc
|
crawler=self,
|
||||||
current_time = time.time()
|
memory_threshold_percent=config.memory_threshold_percent,
|
||||||
|
check_interval=config.check_interval,
|
||||||
self.logger.debug(
|
max_session_permit=config.max_session_permit,
|
||||||
message="Started task for {url:.50}...",
|
enable_rate_limiting=config.enable_rate_limiting,
|
||||||
tag="PARALLEL",
|
rate_limit_config=vars(config.rate_limit_config) if config.rate_limit_config else None
|
||||||
params={"url": url}
|
)
|
||||||
|
|
||||||
|
# Create monitor if display mode is specified
|
||||||
|
monitor = None
|
||||||
|
if config.display_mode:
|
||||||
|
monitor = CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode(config.display_mode)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get delay settings from config
|
# Run URLs through dispatcher
|
||||||
mean_delay = config.mean_delay
|
task_results = await dispatcher.run_urls(urls, config, monitor=monitor)
|
||||||
max_range = config.max_range
|
|
||||||
|
# Convert CrawlerTaskResult to CrawlResult
|
||||||
# Apply rate limiting
|
return [task_result.result for task_result in task_results]
|
||||||
if domain in self._domain_last_hit:
|
|
||||||
time_since_last = current_time - self._domain_last_hit[domain]
|
|
||||||
if time_since_last < mean_delay:
|
|
||||||
delay = mean_delay + random.uniform(0, max_range)
|
|
||||||
await asyncio.sleep(delay)
|
|
||||||
|
|
||||||
self._domain_last_hit[domain] = current_time
|
|
||||||
|
|
||||||
async with semaphore:
|
|
||||||
return await self.arun(
|
|
||||||
url,
|
|
||||||
crawler_config=config, # Pass the entire config object
|
|
||||||
user_agent=user_agent # Maintain user_agent override capability
|
|
||||||
)
|
|
||||||
|
|
||||||
# Log start of concurrent crawling
|
|
||||||
self.logger.info(
|
|
||||||
message="Starting concurrent crawling for {count} URLs...",
|
|
||||||
tag="INIT",
|
|
||||||
params={"count": len(urls)}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Execute concurrent crawls
|
|
||||||
start_time = time.perf_counter()
|
|
||||||
tasks = [crawl_with_semaphore(url) for url in urls]
|
|
||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
end_time = time.perf_counter()
|
|
||||||
|
|
||||||
# Log completion
|
|
||||||
self.logger.success(
|
|
||||||
message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
|
|
||||||
tag="COMPLETE",
|
|
||||||
params={
|
|
||||||
"count": len(urls),
|
|
||||||
"timing": f"{end_time - start_time:.2f}s"
|
|
||||||
},
|
|
||||||
colors={
|
|
||||||
"timing": Fore.YELLOW
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
|
||||||
|
|
||||||
async def aclear_cache(self):
|
async def aclear_cache(self):
|
||||||
"""Clear the cache database."""
|
"""Clear the cache database."""
|
||||||
|
|||||||
490
crawl4ai/dispatcher.py
Normal file
490
crawl4ai/dispatcher.py
Normal file
@@ -0,0 +1,490 @@
|
|||||||
|
from typing import Dict, Optional, Any, List, Tuple
|
||||||
|
from .models import CrawlResult
|
||||||
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
from .content_filter_strategy import PruningContentFilter
|
||||||
|
from rich.live import Live
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.style import Style
|
||||||
|
from rich import box
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import asyncio
|
||||||
|
import uuid
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DomainState:
|
||||||
|
last_request_time: float = 0
|
||||||
|
current_delay: float = 0
|
||||||
|
fail_count: int = 0
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlerTaskResult:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
result: CrawlResult
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
class CrawlStatus(Enum):
|
||||||
|
QUEUED = "QUEUED"
|
||||||
|
IN_PROGRESS = "IN_PROGRESS"
|
||||||
|
COMPLETED = "COMPLETED"
|
||||||
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlStats:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
status: CrawlStatus
|
||||||
|
start_time: Optional[datetime] = None
|
||||||
|
end_time: Optional[datetime] = None
|
||||||
|
memory_usage: float = 0.0
|
||||||
|
peak_memory: float = 0.0
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def duration(self) -> str:
|
||||||
|
if not self.start_time:
|
||||||
|
return "0:00"
|
||||||
|
end = self.end_time or datetime.now()
|
||||||
|
duration = end - self.start_time
|
||||||
|
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
class DisplayMode(Enum):
|
||||||
|
DETAILED = "DETAILED"
|
||||||
|
AGGREGATED = "AGGREGATED"
|
||||||
|
|
||||||
|
class RateLimiter:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_delay: Tuple[float, float] = (1.0, 3.0),
|
||||||
|
max_delay: float = 60.0,
|
||||||
|
max_retries: int = 3,
|
||||||
|
rate_limit_codes: List[int] = [429, 503]
|
||||||
|
):
|
||||||
|
self.base_delay = base_delay
|
||||||
|
self.max_delay = max_delay
|
||||||
|
self.max_retries = max_retries
|
||||||
|
self.rate_limit_codes = rate_limit_codes
|
||||||
|
self.domains: Dict[str, DomainState] = {}
|
||||||
|
|
||||||
|
def get_domain(self, url: str) -> str:
|
||||||
|
return urlparse(url).netloc
|
||||||
|
|
||||||
|
async def wait_if_needed(self, url: str) -> None:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains.get(domain)
|
||||||
|
|
||||||
|
if not state:
|
||||||
|
self.domains[domain] = DomainState()
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
|
if state.last_request_time:
|
||||||
|
wait_time = max(0, state.current_delay - (now - state.last_request_time))
|
||||||
|
if wait_time > 0:
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
|
||||||
|
# Random delay within base range if no current delay
|
||||||
|
if state.current_delay == 0:
|
||||||
|
state.current_delay = random.uniform(*self.base_delay)
|
||||||
|
|
||||||
|
state.last_request_time = time.time()
|
||||||
|
|
||||||
|
def update_delay(self, url: str, status_code: int) -> bool:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
if status_code in self.rate_limit_codes:
|
||||||
|
state.fail_count += 1
|
||||||
|
if state.fail_count > self.max_retries:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Exponential backoff with random jitter
|
||||||
|
state.current_delay = min(
|
||||||
|
state.current_delay * 2 * random.uniform(0.75, 1.25),
|
||||||
|
self.max_delay
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Gradually reduce delay on success
|
||||||
|
state.current_delay = max(
|
||||||
|
random.uniform(*self.base_delay),
|
||||||
|
state.current_delay * 0.75
|
||||||
|
)
|
||||||
|
state.fail_count = 0
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
class CrawlerMonitor:
|
||||||
|
def __init__(self, max_visible_rows: int = 15, display_mode: DisplayMode = DisplayMode.DETAILED):
|
||||||
|
self.console = Console()
|
||||||
|
self.max_visible_rows = max_visible_rows
|
||||||
|
self.display_mode = display_mode
|
||||||
|
self.stats: Dict[str, CrawlStats] = {}
|
||||||
|
self.process = psutil.Process()
|
||||||
|
self.start_time = datetime.now()
|
||||||
|
self.live = Live(self._create_table(), refresh_per_second=2)
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
self.live.start()
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
self.live.stop()
|
||||||
|
|
||||||
|
def add_task(self, task_id: str, url: str):
|
||||||
|
self.stats[task_id] = CrawlStats(task_id=task_id, url=url, status=CrawlStatus.QUEUED)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def update_task(self, task_id: str, **kwargs):
|
||||||
|
if task_id in self.stats:
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
setattr(self.stats[task_id], key, value)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def _create_aggregated_table(self) -> Table:
|
||||||
|
"""Creates a compact table showing only aggregated statistics"""
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Status Overview",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue",
|
||||||
|
show_lines=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate statistics
|
||||||
|
total_tasks = len(self.stats)
|
||||||
|
queued = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED)
|
||||||
|
in_progress = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
# Memory statistics
|
||||||
|
current_memory = self.process.memory_info().rss / (1024 * 1024)
|
||||||
|
total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
peak_memory = max((stat.peak_memory for stat in self.stats.values()), default=0.0)
|
||||||
|
|
||||||
|
# Duration
|
||||||
|
duration = datetime.now() - self.start_time
|
||||||
|
|
||||||
|
# Create status row
|
||||||
|
table.add_column("Status", style="bold cyan")
|
||||||
|
table.add_column("Count", justify="right")
|
||||||
|
table.add_column("Percentage", justify="right")
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"Total Tasks",
|
||||||
|
str(total_tasks),
|
||||||
|
"100%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]In Queue[/yellow]",
|
||||||
|
str(queued),
|
||||||
|
f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[blue]In Progress[/blue]",
|
||||||
|
str(in_progress),
|
||||||
|
f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[green]Completed[/green]",
|
||||||
|
str(completed),
|
||||||
|
f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[red]Failed[/red]",
|
||||||
|
str(failed),
|
||||||
|
f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add memory information
|
||||||
|
table.add_section()
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Current Memory[/magenta]",
|
||||||
|
f"{current_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Total Task Memory[/magenta]",
|
||||||
|
f"{total_task_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Peak Task Memory[/magenta]",
|
||||||
|
f"{peak_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]Runtime[/yellow]",
|
||||||
|
str(timedelta(seconds=int(duration.total_seconds()))),
|
||||||
|
""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_detailed_table(self) -> Table:
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Performance Monitor",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add columns
|
||||||
|
table.add_column("Task ID", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("URL", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("Status", style="bold")
|
||||||
|
table.add_column("Memory (MB)", justify="right")
|
||||||
|
table.add_column("Peak (MB)", justify="right")
|
||||||
|
table.add_column("Duration", justify="right")
|
||||||
|
table.add_column("Info", style="italic")
|
||||||
|
|
||||||
|
# Add summary row
|
||||||
|
total_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
active_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"[bold yellow]SUMMARY",
|
||||||
|
f"Total: {len(self.stats)}",
|
||||||
|
f"Active: {active_count}",
|
||||||
|
f"{total_memory:.1f}",
|
||||||
|
f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
|
||||||
|
str(timedelta(seconds=int((datetime.now() - self.start_time).total_seconds()))),
|
||||||
|
f"✓{completed_count} ✗{failed_count}",
|
||||||
|
style="bold"
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add_section()
|
||||||
|
|
||||||
|
# Add rows for each task
|
||||||
|
visible_stats = sorted(
|
||||||
|
self.stats.values(),
|
||||||
|
key=lambda x: (
|
||||||
|
x.status != CrawlStatus.IN_PROGRESS,
|
||||||
|
x.status != CrawlStatus.QUEUED,
|
||||||
|
x.end_time or datetime.max
|
||||||
|
)
|
||||||
|
)[:self.max_visible_rows]
|
||||||
|
|
||||||
|
for stat in visible_stats:
|
||||||
|
status_style = {
|
||||||
|
CrawlStatus.QUEUED: "white",
|
||||||
|
CrawlStatus.IN_PROGRESS: "yellow",
|
||||||
|
CrawlStatus.COMPLETED: "green",
|
||||||
|
CrawlStatus.FAILED: "red"
|
||||||
|
}[stat.status]
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
stat.task_id[:8], # Show first 8 chars of task ID
|
||||||
|
stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
|
||||||
|
f"[{status_style}]{stat.status.value}[/{status_style}]",
|
||||||
|
f"{stat.memory_usage:.1f}",
|
||||||
|
f"{stat.peak_memory:.1f}",
|
||||||
|
stat.duration,
|
||||||
|
stat.error_message[:40] if stat.error_message else ""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_table(self) -> Table:
|
||||||
|
"""Creates the appropriate table based on display mode"""
|
||||||
|
if self.display_mode == DisplayMode.AGGREGATED:
|
||||||
|
return self._create_aggregated_table()
|
||||||
|
return self._create_detailed_table()
|
||||||
|
|
||||||
|
class MemoryAdaptiveDispatcher:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
crawler: AsyncWebCrawler,
|
||||||
|
memory_threshold_percent: float = 70.0,
|
||||||
|
check_interval: float = 1.0,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
enable_rate_limiting: bool = False,
|
||||||
|
rate_limit_config: Optional[Dict[str, Any]] = None
|
||||||
|
):
|
||||||
|
self.crawler = crawler
|
||||||
|
self.memory_threshold_percent = memory_threshold_percent
|
||||||
|
self.check_interval = check_interval
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
self.concurrent_sessions = 0
|
||||||
|
self.enable_rate_limiting = enable_rate_limiting
|
||||||
|
self.rate_limiter = RateLimiter(**(rate_limit_config or {})) if enable_rate_limiting else None
|
||||||
|
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
start_time = datetime.now()
|
||||||
|
error_message = ""
|
||||||
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
if monitor:
|
||||||
|
monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
|
||||||
|
self.concurrent_sessions += 1
|
||||||
|
|
||||||
|
if self.enable_rate_limiting:
|
||||||
|
await self.rate_limiter.wait_if_needed(url)
|
||||||
|
|
||||||
|
process = psutil.Process()
|
||||||
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||||
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
|
|
||||||
|
if self.enable_rate_limiting and result.status_code:
|
||||||
|
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||||
|
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||||
|
if monitor:
|
||||||
|
monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=datetime.now(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
error_message = result.error_message
|
||||||
|
if monitor:
|
||||||
|
monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
elif monitor:
|
||||||
|
monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
if monitor:
|
||||||
|
monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
result = CrawlResult(url = url, html = "", metadata = {}, success=False, error_message=str(e))
|
||||||
|
|
||||||
|
finally:
|
||||||
|
end_time = datetime.now()
|
||||||
|
if monitor:
|
||||||
|
monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
end_time=end_time,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
self.concurrent_sessions -= 1
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
if monitor:
|
||||||
|
monitor.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
pending_tasks = []
|
||||||
|
active_tasks = []
|
||||||
|
task_queue = []
|
||||||
|
|
||||||
|
# Queue all tasks
|
||||||
|
for url in urls:
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
if monitor:
|
||||||
|
monitor.add_task(task_id, url)
|
||||||
|
task_queue.append((url, task_id))
|
||||||
|
|
||||||
|
while task_queue or active_tasks:
|
||||||
|
# Fill up to max_session_permit
|
||||||
|
while len(active_tasks) < self.max_session_permit and task_queue:
|
||||||
|
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
|
||||||
|
break
|
||||||
|
|
||||||
|
url, task_id = task_queue.pop(0)
|
||||||
|
task = asyncio.create_task(self.crawl_url(url, config, task_id, monitor))
|
||||||
|
active_tasks.append(task)
|
||||||
|
|
||||||
|
if not active_tasks:
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
done, pending = await asyncio.wait(
|
||||||
|
active_tasks,
|
||||||
|
return_when=asyncio.FIRST_COMPLETED
|
||||||
|
)
|
||||||
|
|
||||||
|
pending_tasks.extend(done)
|
||||||
|
active_tasks = list(pending)
|
||||||
|
|
||||||
|
return await asyncio.gather(*pending_tasks)
|
||||||
|
finally:
|
||||||
|
if monitor:
|
||||||
|
monitor.stop()
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter(threshold=0.48)
|
||||||
|
),
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
urls = ["https://example.com/page1"] * 10
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
crawler=crawler,
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=10
|
||||||
|
)
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
crawler=crawler,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config={
|
||||||
|
'base_delay': (1.0, 3.0), # Random range
|
||||||
|
'max_delay': 60.0,
|
||||||
|
'max_retries': 3,
|
||||||
|
'rate_limit_codes': [429, 503]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Optional monitor
|
||||||
|
monitor = CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED)
|
||||||
|
results = await dispatcher.run_urls(urls, run_config, monitor=monitor)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
67
docs/examples/dispatcher_example.py
Normal file
67
docs/examples/dispatcher_example.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
import asyncio, time
|
||||||
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||||
|
from crawl4ai.dispatcher import DisplayMode
|
||||||
|
|
||||||
|
async def crawl_with_rate_limiting(urls):
|
||||||
|
"""
|
||||||
|
Example function demonstrating how to use AsyncWebCrawler with rate limiting and resource monitoring.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls (List[str]): List of URLs to crawl
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[CrawlResult]: List of crawl results for each URL
|
||||||
|
"""
|
||||||
|
# Configure browser settings
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True, # Run browser in headless mode
|
||||||
|
verbose=False # Minimize browser logging
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure crawler settings with rate limiting
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
# Enable rate limiting
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds between requests
|
||||||
|
max_delay=30.0, # Maximum delay after rate limit hits
|
||||||
|
max_retries=2, # Number of retries before giving up
|
||||||
|
rate_limit_codes=[429, 503] # HTTP status codes to trigger rate limiting
|
||||||
|
),
|
||||||
|
# Resource monitoring settings
|
||||||
|
memory_threshold_percent=70.0, # Pause crawling if memory usage exceeds this
|
||||||
|
check_interval=0.5, # How often to check resource usage
|
||||||
|
max_session_permit=10, # Maximum concurrent crawls
|
||||||
|
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create and use crawler with context manager
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(urls, config=run_config)
|
||||||
|
return results
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Example URLs (replace with real URLs)
|
||||||
|
urls = [
|
||||||
|
f"https://example.com/page{i}" for i in range(1, 40)
|
||||||
|
]
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
|
|
||||||
|
# Run the crawler
|
||||||
|
results = asyncio.run(crawl_with_rate_limiting(urls))
|
||||||
|
|
||||||
|
# Process results
|
||||||
|
successful_results = [result for result in results if result.success]
|
||||||
|
failed_results = [result for result in results if not result.success]
|
||||||
|
|
||||||
|
end = time.perf_counter()
|
||||||
|
|
||||||
|
# Print results
|
||||||
|
print(f"Successful crawls: {len(successful_results)}")
|
||||||
|
print(f"Failed crawls: {len(failed_results)}")
|
||||||
|
print(f"Time taken: {end - start:.2f} seconds")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
38
tests/test_dispatcher.py
Normal file
38
tests/test_dispatcher.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import pytest
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||||
|
from crawl4ai.dispatcher import DisplayMode
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_crawler_with_dispatcher():
|
||||||
|
# Create test URLs
|
||||||
|
urls = [f"https://example.com/page_{i}" for i in range(5)]
|
||||||
|
|
||||||
|
# Configure browser
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
|
||||||
|
# Configure crawler with rate limiting
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
),
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=0.5,
|
||||||
|
max_session_permit=3,
|
||||||
|
display_mode=DisplayMode.DETAILED.value
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(urls, config=run_config)
|
||||||
|
|
||||||
|
# Basic validation
|
||||||
|
assert len(results) == len(urls)
|
||||||
|
for result in results:
|
||||||
|
assert result is not None
|
||||||
|
# Note: example.com URLs will fail, which is expected for this test
|
||||||
|
assert not result.success # We expect these to fail since they're fake URLs
|
||||||
Reference in New Issue
Block a user