# Crawl4AI API server configuration.
# Consumed by the CrawlerManager-based API endpoints: browser instance pooling
# with auto-scaling based on system resources, primary/backup crawler
# management, memory monitoring/throttling, and streaming with memory tracking.
# NOTE: API endpoints require CrawlerManager initialization (breaking change).
# Application Configuration
app:
  title: "Crawl4AI API"
  version: "1.0.0"
  host: "0.0.0.0"
  port: 8020
  reload: false            # uvicorn auto-reload; keep false in production
  timeout_keep_alive: 300  # seconds to hold idle keep-alive connections

# Default LLM Configuration
llm:
  provider: "openai/gpt-4o-mini"
  api_key_env: "OPENAI_API_KEY"  # env var the server reads the key from
  # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored

# Redis Configuration
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: ""        # empty string = no AUTH
  ssl: false
  # SSL options below are only used when ssl is true; null = library default.
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null

# Rate Limiting Configuration
rate_limiting:
  enabled: true
  default_limit: "1000/minute"
  trusted_proxies: []            # proxy IPs whose X-Forwarded-For is trusted
  storage_uri: "memory://"       # Use "redis://localhost:6379" for production

# Security Configuration
security:
  enabled: false
  jwt_enabled: false
  https_redirect: false
  trusted_hosts: ["*"]   # "*" allows any Host header; restrict in production
  # Response headers sent when security is enabled.
  headers:
    x_content_type_options: "nosniff"
    x_frame_options: "DENY"
    content_security_policy: "default-src 'self'"
    strict_transport_security: "max-age=63072000; includeSubDomains"

# Crawler Pool Configuration
crawler_pool:
  enabled: true  # Set to false to disable the pool

  # --- Option 1: Auto-calculate size ---
  auto_calculate_size: true
  calculation_params:
    mem_headroom_mb: 512   # Memory reserved for OS/other apps
    avg_page_mem_mb: 150   # Estimated MB per concurrent "tab"/page in browsers
    fd_per_page: 20        # Estimated file descriptors per page
    core_multiplier: 4     # Max crawlers per CPU core
    min_pool_size: 2       # Minimum number of primary crawlers
    max_pool_size: 16      # Maximum number of primary crawlers

  # --- Option 2: Manual size (ignored if auto_calculate_size is true) ---
  # pool_size: 8

  # --- Other Pool Settings ---
  backup_pool_size: 1               # Number of backup crawlers
  max_wait_time_s: 30.0             # Max seconds a request waits for a free crawler
  throttle_threshold_percent: 70.0  # Start throttling delay above this % usage
  throttle_delay_min_s: 0.1         # Min throttle delay
  throttle_delay_max_s: 0.5         # Max throttle delay

  # --- Browser Config for Pooled Crawlers ---
  browser_config:
    # No need for "type": "BrowserConfig" here, just params
    headless: true
    verbose: false  # Keep pool crawlers less verbose in production
    # user_agent: "MyPooledCrawler/1.0"  # Example
    # Add other BrowserConfig params as needed (e.g., proxy, viewport)

# Crawler Configuration
crawler:
  memory_threshold_percent: 95.0  # above this system-memory %, crawling is curtailed
  rate_limiter:
    base_delay: [1.0, 2.0]        # [min, max] seconds between requests
  timeouts:
    stream_init: 30.0     # Timeout for stream initialization
    batch_process: 300.0  # Timeout for batch processing

# Logging Configuration
logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "logs/app.log"
  verbose: true

# Observability Configuration
observability:
  prometheus:
    enabled: true
    endpoint: "/metrics"
  health_check:
    endpoint: "/health"