# Crawl4AI API server configuration.
# Consumed by the CrawlerManager-based API endpoints: browser instance pooling
# with auto-scaling based on system resources, primary/backup crawler
# management, memory monitoring/throttling, and streaming with memory tracking.
# NOTE: API endpoints require CrawlerManager initialization (breaking change).
# Application Configuration
app:
  title: "Crawl4AI API"
  version: "1.0.0"
  host: "0.0.0.0"
  port: 8020
  reload: false            # uvicorn auto-reload; keep false in production
  timeout_keep_alive: 300  # seconds to hold idle keep-alive connections

# Default LLM Configuration
llm:
  provider: "openai/gpt-4o-mini"
  api_key_env: "OPENAI_API_KEY"  # env var the server reads the key from
  # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored

# Redis Configuration
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: ""        # empty string = no AUTH
  ssl: false
  # SSL options below are only used when ssl is true; null = library default.
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null

# Rate Limiting Configuration
rate_limiting:
  enabled: true
  default_limit: "1000/minute"
  trusted_proxies: []            # proxy IPs whose X-Forwarded-For is trusted
  storage_uri: "memory://"       # Use "redis://localhost:6379" for production

# Security Configuration
security:
  enabled: false
  jwt_enabled: false
  https_redirect: false
  trusted_hosts: ["*"]   # "*" allows any Host header; restrict in production
  # Response headers sent when security is enabled.
  headers:
    x_content_type_options: "nosniff"
    x_frame_options: "DENY"
    content_security_policy: "default-src 'self'"
    strict_transport_security: "max-age=63072000; includeSubDomains"

# Crawler Pool Configuration
crawler_pool:
  enabled: true  # Set to false to disable the pool

  # --- Option 1: Auto-calculate size ---
  auto_calculate_size: true
  calculation_params:
    mem_headroom_mb: 512   # Memory reserved for OS/other apps
    avg_page_mem_mb: 150   # Estimated MB per concurrent "tab"/page in browsers
    fd_per_page: 20        # Estimated file descriptors per page
    core_multiplier: 4     # Max crawlers per CPU core
    min_pool_size: 2       # Minimum number of primary crawlers
    max_pool_size: 16      # Maximum number of primary crawlers

  # --- Option 2: Manual size (ignored if auto_calculate_size is true) ---
  # pool_size: 8

  # --- Other Pool Settings ---
  backup_pool_size: 1               # Number of backup crawlers
  max_wait_time_s: 30.0             # Max seconds a request waits for a free crawler
  throttle_threshold_percent: 70.0  # Start throttling delay above this % usage
  throttle_delay_min_s: 0.1         # Min throttle delay
  throttle_delay_max_s: 0.5         # Max throttle delay

  # --- Browser Config for Pooled Crawlers ---
  browser_config:
    # No need for "type": "BrowserConfig" here, just params
    headless: true
    verbose: false  # Keep pool crawlers less verbose in production
    # user_agent: "MyPooledCrawler/1.0"  # Example
    # Add other BrowserConfig params as needed (e.g., proxy, viewport)

# Crawler Configuration
crawler:
  memory_threshold_percent: 95.0  # above this system-memory %, crawling is curtailed
  rate_limiter:
    base_delay: [1.0, 2.0]        # [min, max] seconds between requests
  timeouts:
    stream_init: 30.0     # Timeout for stream initialization
    batch_process: 300.0  # Timeout for batch processing

# Logging Configuration
logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "logs/app.log"
  verbose: true

# Observability Configuration
observability:
  prometheus:
    enabled: true
    endpoint: "/metrics"
  health_check:
    endpoint: "/health"