feat(api): implement crawler pool manager for improved resource handling
Adds a new CrawlerManager class to handle browser instance pooling and failover:
- Implements auto-scaling based on system resources
- Adds primary/backup crawler management
- Integrates memory monitoring and throttling
- Adds streaming support with memory tracking
- Updates API endpoints to use pooled crawlers

BREAKING CHANGE: API endpoints now require CrawlerManager initialization
This commit is contained in:
@@ -48,6 +48,38 @@ security:
|
||||
content_security_policy: "default-src 'self'"
|
||||
strict_transport_security: "max-age=63072000; includeSubDomains"
|
||||
|
||||
# Crawler Pool Configuration
|
||||
crawler_pool:
|
||||
enabled: true # Set to false to disable the pool
|
||||
|
||||
# --- Option 1: Auto-calculate size ---
|
||||
auto_calculate_size: true
|
||||
calculation_params:
|
||||
mem_headroom_mb: 512 # Memory reserved for OS/other apps
|
||||
avg_page_mem_mb: 150 # Estimated MB per concurrent "tab"/page in browsers
|
||||
fd_per_page: 20 # Estimated file descriptors per page
|
||||
core_multiplier: 4 # Max crawlers per CPU core
|
||||
min_pool_size: 2 # Minimum number of primary crawlers
|
||||
max_pool_size: 16 # Maximum number of primary crawlers
|
||||
|
||||
# --- Option 2: Manual size (ignored if auto_calculate_size is true) ---
|
||||
# pool_size: 8
|
||||
|
||||
# --- Other Pool Settings ---
|
||||
backup_pool_size: 1 # Number of backup crawlers
|
||||
max_wait_time_s: 30.0 # Max seconds a request waits for a free crawler
|
||||
throttle_threshold_percent: 70.0 # Start applying the throttle delay above this usage percentage
|
||||
throttle_delay_min_s: 0.1 # Min throttle delay
|
||||
throttle_delay_max_s: 0.5 # Max throttle delay
|
||||
|
||||
# --- Browser Config for Pooled Crawlers ---
|
||||
browser_config:
|
||||
# No need for a "type": "BrowserConfig" entry here; just list the params
|
||||
headless: true
|
||||
verbose: false # Keep pool crawlers less verbose in production
|
||||
# user_agent: "MyPooledCrawler/1.0" # Example
|
||||
# Add other BrowserConfig params as needed (e.g., proxy, viewport)
|
||||
|
||||
# Crawler Configuration
|
||||
crawler:
|
||||
memory_threshold_percent: 95.0
|
||||
@@ -61,6 +93,8 @@ crawler:
|
||||
logging:
|
||||
level: "INFO"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
file: "logs/app.log"
|
||||
verbose: true
|
||||
|
||||
# Observability Configuration
|
||||
observability:
|
||||
|
||||
Reference in New Issue
Block a user