Major refactoring to eliminate memory leaks and enable high-scale crawling: - **Smart 3-Tier Browser Pool**: - Permanent browser (always-ready default config) - Hot pool (configs used 3+ times, longer TTL) - Cold pool (new/rare configs, short TTL) - Auto-promotion: cold → hot after 3 uses - 100% pool reuse achieved in tests - **Container-Aware Memory Detection**: - Read cgroup v1/v2 memory limits (not host metrics) - Accurate memory pressure detection in Docker - Memory-based browser creation blocking - **Adaptive Janitor**: - Dynamic cleanup intervals (10s/30s/60s based on memory) - Tiered TTLs: cold 30-300s, hot 120-600s - Aggressive cleanup at high memory pressure - **Unified Pool Usage**: - All endpoints now use pool (/html, /screenshot, /pdf, /execute_js, /md, /llm) - Fixed config signature mismatch (permanent browser matches endpoints) - get_default_browser_config() helper for consistency - **Configuration**: - Reduced idle_ttl: 1800s → 300s (30min → 5min) - Fixed port: 11234 → 11235 (match Gunicorn) **Performance Results** (from stress tests): - Memory: 10x reduction (500-700MB × N → 270MB permanent) - Latency: 30-50x faster (<100ms pool hits vs 3-5s startup) - Reuse: 100% for default config, 60%+ for variants - Capacity: 100+ concurrent requests (vs ~20 before) - Leak: 0 MB/cycle (stable across tests) **Test Infrastructure**: - 7-phase sequential test suite (tests/) - Docker stats integration + log analysis - Pool promotion verification - Memory leak detection - Full endpoint coverage Fixes memory issues reported in production deployments.
90 lines
2.1 KiB
YAML
# Application Configuration
app:
  title: "Crawl4AI API"
  version: "1.0.0"
  host: "0.0.0.0"
  port: 11235  # must match the Gunicorn bind port
  reload: false
  workers: 1
  timeout_keep_alive: 300
# Default LLM Configuration
llm:
  provider: "openai/gpt-4o-mini"
  # api_key: sk-...  # If you pass the API key directly (not recommended)
# Redis Configuration
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: ""
  ssl: false
  # null (not the bare word None) — a plain `None` scalar is parsed by YAML
  # loaders as the string "None", not a missing value.
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null
# Rate Limiting Configuration
rate_limiting:
  enabled: true
  default_limit: "1000/minute"
  trusted_proxies: []
  storage_uri: "memory://"  # Use "redis://localhost:6379" for production
# Security Configuration
security:
  enabled: false
  jwt_enabled: false
  https_redirect: false
  trusted_hosts: ["*"]
  headers:
    x_content_type_options: "nosniff"
    x_frame_options: "DENY"
    content_security_policy: "default-src 'self'"
    strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
  base_config:
    simulate_user: true
  memory_threshold_percent: 95.0
  rate_limiter:
    enabled: true
    base_delay: [1.0, 2.0]
  timeouts:
    stream_init: 30.0  # Timeout for stream initialization
    batch_process: 300.0  # Timeout for batch processing
  pool:
    max_pages: 40  # GLOBAL_SEM permits
    idle_ttl_sec: 300  # 5 min janitor cutoff (reduced from 1800s / 30 min)
  browser:
    kwargs:
      headless: true
      text_mode: true
    extra_args:
      # - "--single-process"
      - "--no-sandbox"
      - "--disable-dev-shm-usage"
      - "--disable-gpu"
      - "--disable-software-rasterizer"
      - "--disable-web-security"
      - "--allow-insecure-localhost"
      - "--ignore-certificate-errors"
# Logging Configuration
logging:
  level: "INFO"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
  prometheus:
    enabled: true
    endpoint: "/metrics"
  health_check:
    endpoint: "/health"