feat(api): implement crawler pool manager for improved resource handling

Adds a new CrawlerManager class to handle browser instance pooling and failover:
- Implements auto-scaling based on system resources
- Adds primary/backup crawler management
- Integrates memory monitoring and throttling
- Adds streaming support with memory tracking
- Updates API endpoints to use pooled crawlers

BREAKING CHANGE: API endpoints now require CrawlerManager initialization
This commit is contained in:
UncleCode
2025-04-18 22:26:24 +08:00
parent 907cba194f
commit 16b2318242
9 changed files with 2082 additions and 59 deletions

View File

@@ -48,6 +48,38 @@ security:
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Pool Configuration
# Settings for the pool of browser crawlers: how many primary and backup
# crawlers to keep, how long a request waits for a free one, and when
# throttling delays start.
crawler_pool:
enabled: true # Set to false to disable the pool
# --- Option 1: Auto-calculate size ---
# When true, the manual pool_size under Option 2 is ignored.
auto_calculate_size: true
# Estimates used for the auto-calculated size.
calculation_params:
mem_headroom_mb: 512 # Memory reserved for OS/other apps
avg_page_mem_mb: 150 # Estimated MB per concurrent "tab"/page in browsers
fd_per_page: 20 # Estimated file descriptors per page
core_multiplier: 4 # Max crawlers per CPU core
# NOTE(review): presumably the calculated size is clamped to
# [min_pool_size, max_pool_size] -- confirm against the consumer.
min_pool_size: 2 # Minimum number of primary crawlers
max_pool_size: 16 # Maximum number of primary crawlers
# --- Option 2: Manual size (ignored if auto_calculate_size is true) ---
# pool_size: 8
# --- Other Pool Settings ---
backup_pool_size: 1 # Number of backup crawlers
max_wait_time_s: 30.0 # Max seconds a request waits for a free crawler
# NOTE(review): "% usage" presumably means system memory usage -- confirm.
throttle_threshold_percent: 70.0 # Start throttling delay above this % usage
# Throttle delay bounds; the _s suffix indicates seconds, as in max_wait_time_s.
throttle_delay_min_s: 0.1 # Min throttle delay
throttle_delay_max_s: 0.5 # Max throttle delay
# --- Browser Config for Pooled Crawlers ---
browser_config:
# No need for "type": "BrowserConfig" here, just params
headless: true
verbose: false # Keep pool crawlers less verbose in production
# user_agent: "MyPooledCrawler/1.0" # Example
# Add other BrowserConfig params as needed (e.g., proxy, viewport)
# Crawler Configuration
crawler:
memory_threshold_percent: 95.0
@@ -61,6 +93,8 @@ crawler:
logging:
level: "INFO"
# Each record is rendered as: asctime - name - levelname - message.
# NOTE(review): the %(field)s placeholders look like Python logging's
# %-style format -- confirm against the code that consumes this value.
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# NOTE(review): presumably the log file path, relative to the process
# working directory -- confirm.
file: "logs/app.log"
verbose: true
# Observability Configuration
observability: