diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index ebd2859d..8ffc720c 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -49,6 +49,9 @@ from .utils import (
preprocess_html_for_schema,
)
+# Import telemetry
+from .telemetry import capture_exception, telemetry_decorator, async_telemetry_decorator
+
class AsyncWebCrawler:
"""
@@ -201,6 +204,7 @@ class AsyncWebCrawler:
"""异步空上下文管理器"""
yield
+ @async_telemetry_decorator
async def arun(
self,
url: str,
@@ -430,6 +434,7 @@ class AsyncWebCrawler:
)
)
+ @async_telemetry_decorator
async def aprocess_html(
self,
url: str,
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51b53500..12cb3758 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1385,6 +1385,97 @@ def profiles_cmd():
# Run interactive profile manager
anyio.run(manage_profiles)
+@cli.group("telemetry")
+def telemetry_cmd():
+ """Manage telemetry settings for Crawl4AI
+
+ Telemetry helps improve Crawl4AI by sending anonymous crash reports.
+ No personal data or crawled content is ever collected.
+ """
+ pass
+
+@telemetry_cmd.command("enable")
+@click.option("--email", "-e", help="Optional email for follow-up on critical issues")
+@click.option("--always/--once", default=True, help="Always send errors (default) or just once")
+def telemetry_enable_cmd(email: Optional[str], always: bool):
+ """Enable telemetry to help improve Crawl4AI
+
+ Examples:
+ crwl telemetry enable # Enable telemetry
+ crwl telemetry enable --email me@ex.com # Enable with email
+ crwl telemetry enable --once # Send only next error
+ """
+ from crawl4ai.telemetry import enable
+
+ try:
+ enable(email=email, always=always, once=not always)
+ console.print("[green]✅ Telemetry enabled successfully[/green]")
+
+ if email:
+ console.print(f" Email: {email}")
+ console.print(f" Mode: {'Always send errors' if always else 'Send next error only'}")
+
+ except Exception as e:
+ console.print(f"[red]❌ Failed to enable telemetry: {e}[/red]")
+ sys.exit(1)
+
+@telemetry_cmd.command("disable")
+def telemetry_disable_cmd():
+ """Disable telemetry
+
+ Stop sending anonymous crash reports to help improve Crawl4AI.
+ """
+ from crawl4ai.telemetry import disable
+
+ try:
+ disable()
+ console.print("[green]✅ Telemetry disabled successfully[/green]")
+ except Exception as e:
+ console.print(f"[red]❌ Failed to disable telemetry: {e}[/red]")
+ sys.exit(1)
+
+@telemetry_cmd.command("status")
+def telemetry_status_cmd():
+ """Show current telemetry status
+
+ Display whether telemetry is enabled and current settings.
+ """
+ from crawl4ai.telemetry import status
+
+ try:
+ info = status()
+
+ # Create status table
+ table = Table(title="Telemetry Status", show_header=False)
+ table.add_column("Setting", style="cyan")
+ table.add_column("Value")
+
+ # Status emoji
+ status_icon = "✅" if info['enabled'] else "❌"
+
+ table.add_row("Status", f"{status_icon} {'Enabled' if info['enabled'] else 'Disabled'}")
+ table.add_row("Consent", info['consent'].replace('_', ' ').title())
+
+ if info['email']:
+ table.add_row("Email", info['email'])
+
+ table.add_row("Environment", info['environment'])
+ table.add_row("Provider", info['provider'])
+
+ if info['errors_sent'] > 0:
+ table.add_row("Errors Sent", str(info['errors_sent']))
+
+ console.print(table)
+
+ # Add helpful messages
+ if not info['enabled']:
+ console.print("\n[yellow]ℹ️ Telemetry is disabled. Enable it to help improve Crawl4AI:[/yellow]")
+ console.print(" [dim]crwl telemetry enable[/dim]")
+
+ except Exception as e:
+ console.print(f"[red]❌ Failed to get telemetry status: {e}[/red]")
+ sys.exit(1)
+
@cli.command(name="")
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
diff --git a/crawl4ai/telemetry/__init__.py b/crawl4ai/telemetry/__init__.py
new file mode 100644
index 00000000..cc7e0afe
--- /dev/null
+++ b/crawl4ai/telemetry/__init__.py
@@ -0,0 +1,440 @@
+"""
+Crawl4AI Telemetry Module.
+Provides opt-in error tracking to improve stability.
+"""
+
+import os
+import sys
+import functools
+import traceback
+from typing import Optional, Any, Dict, Callable, Type
+from contextlib import contextmanager, asynccontextmanager
+
+from .base import TelemetryProvider, NullProvider
+from .config import TelemetryConfig, TelemetryConsent
+from .consent import ConsentManager
+from .environment import Environment, EnvironmentDetector
+
+
+class TelemetryManager:
+ """
+ Main telemetry manager for Crawl4AI.
+ Coordinates provider, config, and consent management.
+ """
+
+ _instance: Optional['TelemetryManager'] = None
+
+ def __init__(self):
+ """Initialize telemetry manager."""
+ self.config = TelemetryConfig()
+ self.consent_manager = ConsentManager(self.config)
+ self.environment = EnvironmentDetector.detect()
+ self._provider: Optional[TelemetryProvider] = None
+ self._initialized = False
+ self._error_count = 0
+ self._max_errors = 100 # Prevent telemetry spam
+
+ # Load provider based on config
+ self._setup_provider()
+
+ @classmethod
+ def get_instance(cls) -> 'TelemetryManager':
+ """
+ Get singleton instance of telemetry manager.
+
+ Returns:
+ TelemetryManager instance
+ """
+ if cls._instance is None:
+ cls._instance = cls()
+ return cls._instance
+
+ def _setup_provider(self) -> None:
+ """Setup telemetry provider based on configuration."""
+ # Update config from environment
+ self.config.update_from_env()
+
+ # Check if telemetry is enabled
+ if not self.config.is_enabled():
+ self._provider = NullProvider()
+ return
+
+ # Try to load Sentry provider
+ try:
+ from .providers.sentry import SentryProvider
+
+ # Get Crawl4AI version for release tracking
+ try:
+ from crawl4ai import __version__
+ release = f"crawl4ai@{__version__}"
+ except ImportError:
+ release = "crawl4ai@unknown"
+
+ self._provider = SentryProvider(
+ environment=self.environment.value,
+ release=release
+ )
+
+ # Initialize provider
+ if not self._provider.initialize():
+ # Fallback to null provider if init fails
+ self._provider = NullProvider()
+
+ except ImportError:
+ # Sentry not installed - use null provider
+ self._provider = NullProvider()
+
+ self._initialized = True
+
+    def capture_exception(
+        self,
+        exception: Exception,
+        context: Optional[Dict[str, Any]] = None
+    ) -> bool:
+        """
+        Capture and send an exception.
+
+        Nothing is sent or counted, and a one-time (ONCE) consent is not
+        consumed, while only the no-op NullProvider is active.
+
+        Args:
+            exception: The exception to capture
+            context: Optional additional context
+
+        Returns:
+            True if exception was sent
+        """
+        # Check error count limit
+        if self._error_count >= self._max_errors:
+            return False
+
+        # Check consent on first error
+        if self._error_count == 0:
+            consent = self.consent_manager.check_and_prompt()
+
+            # Update provider if consent changed
+            if consent == TelemetryConsent.DENIED:
+                self._provider = NullProvider()
+                return False
+            elif consent in [TelemetryConsent.ONCE, TelemetryConsent.ALWAYS]:
+                if isinstance(self._provider, NullProvider):
+                    self._setup_provider()
+
+        # NullProvider.send_exception() returns True without sending anything,
+        # so stop here instead of spending ONCE consent or counting a report.
+        if self._provider is None or isinstance(self._provider, NullProvider):
+            return False
+
+        # Check if we should send this error
+        if not self.config.should_send_current():
+            return False
+
+        # Prepare context: caller data, then stored email, then source marker
+        full_context = EnvironmentDetector.get_environment_context()
+        if context:
+            full_context.update(context)
+        email = self.config.get_email()
+        if email:
+            full_context['email'] = email
+        full_context['source'] = 'crawl4ai'
+
+        try:
+            success = self._provider.send_exception(exception, full_context)
+        except Exception:
+            # Telemetry itself failed - never let it mask the caller's error
+            return False
+        if success:
+            self._error_count += 1
+        return success
+
+ def capture_message(
+ self,
+ message: str,
+ level: str = 'info',
+ context: Optional[Dict[str, Any]] = None
+ ) -> bool:
+ """
+ Capture a message event.
+
+ Args:
+ message: Message to send
+ level: Message level (info, warning, error)
+ context: Optional context
+
+ Returns:
+ True if message was sent
+ """
+ if not self.config.is_enabled():
+ return False
+
+ payload = {
+ 'level': level,
+ 'message': message
+ }
+ if context:
+ payload.update(context)
+
+ try:
+ if self._provider:
+ return self._provider.send_event(message, payload)
+ except Exception:
+ pass
+
+ return False
+
+ def enable(
+ self,
+ email: Optional[str] = None,
+ always: bool = True,
+ once: bool = False
+ ) -> None:
+ """
+ Enable telemetry.
+
+ Args:
+ email: Optional email for follow-up
+ always: If True, always send errors
+ once: If True, send only next error
+ """
+ if once:
+ consent = TelemetryConsent.ONCE
+ elif always:
+ consent = TelemetryConsent.ALWAYS
+ else:
+ consent = TelemetryConsent.ALWAYS
+
+ self.config.set_consent(consent, email)
+ self._setup_provider()
+
+ print("✅ Telemetry enabled")
+ if email:
+ print(f" Email: {email}")
+ print(f" Mode: {'once' if once else 'always'}")
+
+ def disable(self) -> None:
+ """Disable telemetry."""
+ self.config.set_consent(TelemetryConsent.DENIED)
+ self._provider = NullProvider()
+ print("✅ Telemetry disabled")
+
+    def status(self) -> Dict[str, Any]:
+        """
+        Summarize the current telemetry state.
+        Returns a dict: enabled, consent, email, environment, provider, errors_sent.
+        """
+        cfg = self.config
+        provider_name = 'None' if not self._provider else type(self._provider).__name__
+        return dict(
+            enabled=cfg.is_enabled(),
+            consent=cfg.get_consent().value,
+            email=cfg.get_email(),
+            environment=self.environment.value,
+            provider=provider_name,
+            errors_sent=self._error_count,
+        )
+
+ def flush(self) -> None:
+ """Flush any pending telemetry data."""
+ if self._provider:
+ self._provider.flush()
+
+ def shutdown(self) -> None:
+ """Shutdown telemetry."""
+ if self._provider:
+ self._provider.shutdown()
+
+
+# Global instance
+_telemetry_manager: Optional[TelemetryManager] = None
+
+
+def get_telemetry() -> TelemetryManager:
+    """
+    Return the shared TelemetryManager, creating it on first use.
+    Returns:
+        TelemetryManager instance cached in the module-level global
+    """
+    global _telemetry_manager
+    if _telemetry_manager is not None:
+        return _telemetry_manager
+    _telemetry_manager = TelemetryManager.get_instance()
+    return _telemetry_manager
+
+
+def capture_exception(
+ exception: Exception,
+ context: Optional[Dict[str, Any]] = None
+) -> bool:
+ """
+ Capture an exception for telemetry.
+
+ Args:
+ exception: Exception to capture
+ context: Optional context
+
+ Returns:
+ True if sent successfully
+ """
+ try:
+ return get_telemetry().capture_exception(exception, context)
+ except Exception:
+ return False
+
+
+def telemetry_decorator(func: Callable) -> Callable:
+    """
+    Wrap a synchronous callable so that its failures are reported.
+
+    Args:
+        func: Function to wrap
+
+    Returns:
+        Wrapper that reports any Exception raised by func, then re-raises it
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            result = func(*args, **kwargs)
+        except Exception as exc:
+            origin = {
+                'function': func.__name__,
+                'module': func.__module__
+            }
+            capture_exception(exc, origin)
+            raise  # reporting must never swallow the error
+        return result
+
+    return wrapper
+
+
+def async_telemetry_decorator(func: Callable) -> Callable:
+    """
+    Wrap a coroutine function so that its failures are reported.
+
+    Args:
+        func: Async function to wrap
+
+    Returns:
+        Async wrapper that reports any Exception from func, then re-raises it
+    """
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as exc:
+            origin = {
+                'function': func.__name__,
+                'module': func.__module__
+            }
+            capture_exception(exc, origin)
+            raise  # reporting must never swallow the error
+        return result
+
+    return wrapper
+
+
+@contextmanager
+def telemetry_context(operation: str):
+ """
+ Context manager for capturing exceptions.
+
+ Args:
+ operation: Name of the operation
+
+ Example:
+ with telemetry_context("web_crawl"):
+ # Your code here
+ pass
+ """
+ try:
+ yield
+ except Exception as e:
+ capture_exception(e, {'operation': operation})
+ raise
+
+
+@asynccontextmanager
+async def async_telemetry_context(operation: str):
+ """
+ Async context manager for capturing exceptions in async code.
+
+ Args:
+ operation: Name of the operation
+
+ Example:
+ async with async_telemetry_context("async_crawl"):
+ # Your async code here
+ await something()
+ """
+ try:
+ yield
+ except Exception as e:
+ capture_exception(e, {'operation': operation})
+ raise
+
+
+def install_exception_handler():
+    """Install the telemetry hook for uncaught exceptions (no-op if already installed)."""
+    original_hook = sys.excepthook
+    if getattr(original_hook, '_crawl4ai_telemetry_hook', False):
+        return  # re-wrapping our own hook would report every crash twice
+
+    def telemetry_exception_hook(exc_type, exc_value, exc_traceback):
+        """Report the exception (except KeyboardInterrupt), then call the previous hook."""
+        if not issubclass(exc_type, KeyboardInterrupt):
+            capture_exception(exc_value, {
+                'uncaught': True,
+                'type': exc_type.__name__
+            })
+        original_hook(exc_type, exc_value, exc_traceback)
+
+    telemetry_exception_hook._crawl4ai_telemetry_hook = True
+    sys.excepthook = telemetry_exception_hook
+
+
+# Public API
+def enable(email: Optional[str] = None, always: bool = True, once: bool = False) -> None:
+ """
+ Enable telemetry.
+
+ Args:
+ email: Optional email for follow-up
+ always: If True, always send errors (default)
+ once: If True, send only the next error
+ """
+ get_telemetry().enable(email=email, always=always, once=once)
+
+
+def disable() -> None:
+ """Disable telemetry."""
+ get_telemetry().disable()
+
+
+def status() -> Dict[str, Any]:
+ """
+ Get telemetry status.
+
+ Returns:
+ Dictionary with status information
+ """
+ return get_telemetry().status()
+
+
+# Auto-install exception handler on import
+# (Only for main library usage, not for Docker/API)
+if EnvironmentDetector.detect() not in [Environment.DOCKER, Environment.API_SERVER]:
+ install_exception_handler()
+
+
+__all__ = [
+ 'TelemetryManager',
+ 'get_telemetry',
+ 'capture_exception',
+ 'telemetry_decorator',
+ 'async_telemetry_decorator',
+ 'telemetry_context',
+ 'async_telemetry_context',
+ 'enable',
+ 'disable',
+ 'status',
+]
\ No newline at end of file
diff --git a/crawl4ai/telemetry/base.py b/crawl4ai/telemetry/base.py
new file mode 100644
index 00000000..760771b7
--- /dev/null
+++ b/crawl4ai/telemetry/base.py
@@ -0,0 +1,140 @@
+"""
+Base telemetry provider interface for Crawl4AI.
+Provides abstraction for different telemetry backends.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, Union
+import traceback
+
+
+class TelemetryProvider(ABC):
+ """Abstract base class for telemetry providers."""
+
+ def __init__(self, **kwargs):
+ """Initialize the provider with optional configuration."""
+ self.config = kwargs
+ self._initialized = False
+
+ @abstractmethod
+ def initialize(self) -> bool:
+ """
+ Initialize the telemetry provider.
+ Returns True if initialization successful, False otherwise.
+ """
+ pass
+
+ @abstractmethod
+ def send_exception(
+ self,
+ exc: Exception,
+ context: Optional[Dict[str, Any]] = None
+ ) -> bool:
+ """
+ Send an exception to the telemetry backend.
+
+ Args:
+ exc: The exception to report
+ context: Optional context data (email, environment, etc.)
+
+ Returns:
+ True if sent successfully, False otherwise
+ """
+ pass
+
+ @abstractmethod
+ def send_event(
+ self,
+ event_name: str,
+ payload: Optional[Dict[str, Any]] = None
+ ) -> bool:
+ """
+ Send a generic telemetry event.
+
+ Args:
+ event_name: Name of the event
+ payload: Optional event data
+
+ Returns:
+ True if sent successfully, False otherwise
+ """
+ pass
+
+ @abstractmethod
+ def flush(self) -> None:
+ """Flush any pending telemetry data."""
+ pass
+
+ @abstractmethod
+ def shutdown(self) -> None:
+ """Clean shutdown of the provider."""
+ pass
+
+    def sanitize_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Remove sensitive information from telemetry data.
+        Override in subclasses for custom sanitization.
+        Keys are matched case-insensitively by substring; dicts are sanitized
+        recursively, including dicts inside arbitrarily nested lists.
+
+        Args:
+            data: Raw data dictionary
+
+        Returns:
+            Sanitized copy of ``data``; non-dict input is returned unchanged
+        """
+        # Default implementation - remove common sensitive fields
+        sensitive_keys = {
+            'password', 'token', 'api_key', 'secret', 'credential',
+            'auth', 'authorization', 'cookie', 'session'
+        }
+
+        def _is_sensitive(key: Any) -> bool:
+            # Keys are not guaranteed to be strings, so normalize via str()
+            key_lower = str(key).lower()
+            return any(sensitive in key_lower for sensitive in sensitive_keys)
+
+        def _sanitize(value: Any) -> Any:
+            if isinstance(value, dict):
+                return {
+                    key: '[REDACTED]' if _is_sensitive(key) else _sanitize(item)
+                    for key, item in value.items()
+                }
+            if isinstance(value, list):
+                return [_sanitize(item) for item in value]
+            return value
+
+        return _sanitize(data) if isinstance(data, dict) else data
+
+
+class NullProvider(TelemetryProvider):
+ """No-op provider for when telemetry is disabled."""
+
+ def initialize(self) -> bool:
+ """No initialization needed for null provider."""
+ self._initialized = True
+ return True
+
+ def send_exception(
+ self,
+ exc: Exception,
+ context: Optional[Dict[str, Any]] = None
+ ) -> bool:
+ """No-op exception sending."""
+ return True
+
+ def send_event(
+ self,
+ event_name: str,
+ payload: Optional[Dict[str, Any]] = None
+ ) -> bool:
+ """No-op event sending."""
+ return True
+
+ def flush(self) -> None:
+ """No-op flush."""
+ pass
+
+ def shutdown(self) -> None:
+ """No-op shutdown."""
+ pass
\ No newline at end of file
diff --git a/crawl4ai/telemetry/config.py b/crawl4ai/telemetry/config.py
new file mode 100644
index 00000000..a090b24d
--- /dev/null
+++ b/crawl4ai/telemetry/config.py
@@ -0,0 +1,196 @@
+"""
+Configuration management for Crawl4AI telemetry.
+Handles user preferences and persistence.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+from enum import Enum
+
+
+class TelemetryConsent(Enum):
+ """Telemetry consent levels."""
+ NOT_SET = "not_set"
+ DENIED = "denied"
+ ONCE = "once" # Send current error only
+ ALWAYS = "always" # Send all errors
+
+
+class TelemetryConfig:
+ """Manages telemetry configuration and persistence."""
+
+ def __init__(self, config_dir: Optional[Path] = None):
+ """
+ Initialize configuration manager.
+
+ Args:
+ config_dir: Optional custom config directory
+ """
+ if config_dir:
+ self.config_dir = config_dir
+ else:
+ # Default to ~/.crawl4ai/
+ self.config_dir = Path.home() / '.crawl4ai'
+
+ self.config_file = self.config_dir / 'config.json'
+ self._config: Dict[str, Any] = {}
+ self._load_config()
+
+ def _ensure_config_dir(self) -> None:
+ """Ensure configuration directory exists."""
+ self.config_dir.mkdir(parents=True, exist_ok=True)
+
+    def _load_config(self) -> None:
+        """Load configuration from disk; a missing or unusable file yields {}."""
+        self._config = {}
+        try:
+            with open(self.config_file, 'r') as f:
+                loaded = json.load(f)
+        except (ValueError, OSError):
+            # Missing, corrupted, undecodable or inaccessible config - start fresh
+            return
+        if isinstance(loaded, dict):  # e.g. a bare list cannot hold settings
+            self._config = loaded
+
+ def _save_config(self) -> bool:
+ """
+ Save configuration to disk.
+
+ Returns:
+ True if saved successfully
+ """
+ try:
+ self._ensure_config_dir()
+
+ # Write to temporary file first
+ temp_file = self.config_file.with_suffix('.tmp')
+ with open(temp_file, 'w') as f:
+ json.dump(self._config, f, indent=2)
+
+ # Atomic rename
+ temp_file.replace(self.config_file)
+ return True
+
+ except (IOError, OSError):
+ return False
+
+ def get_telemetry_settings(self) -> Dict[str, Any]:
+ """
+ Get current telemetry settings.
+
+ Returns:
+ Dictionary with telemetry settings
+ """
+ return self._config.get('telemetry', {
+ 'consent': TelemetryConsent.NOT_SET.value,
+ 'email': None
+ })
+
+ def get_consent(self) -> TelemetryConsent:
+ """
+ Get current consent status.
+
+ Returns:
+ TelemetryConsent enum value
+ """
+ settings = self.get_telemetry_settings()
+ consent_value = settings.get('consent', TelemetryConsent.NOT_SET.value)
+
+ # Handle legacy boolean values
+ if isinstance(consent_value, bool):
+ consent_value = TelemetryConsent.ALWAYS.value if consent_value else TelemetryConsent.DENIED.value
+
+ try:
+ return TelemetryConsent(consent_value)
+ except ValueError:
+ return TelemetryConsent.NOT_SET
+
+ def set_consent(
+ self,
+ consent: TelemetryConsent,
+ email: Optional[str] = None
+ ) -> bool:
+ """
+ Set telemetry consent and optional email.
+
+ Args:
+ consent: Consent level
+ email: Optional email for follow-up
+
+ Returns:
+ True if saved successfully
+ """
+ if 'telemetry' not in self._config:
+ self._config['telemetry'] = {}
+
+ self._config['telemetry']['consent'] = consent.value
+
+ # Only update email if provided
+ if email is not None:
+ self._config['telemetry']['email'] = email
+
+ return self._save_config()
+
+ def get_email(self) -> Optional[str]:
+ """
+ Get stored email if any.
+
+ Returns:
+ Email address or None
+ """
+ settings = self.get_telemetry_settings()
+ return settings.get('email')
+
+ def is_enabled(self) -> bool:
+ """
+ Check if telemetry is enabled.
+
+ Returns:
+ True if telemetry should send data
+ """
+ consent = self.get_consent()
+ return consent in [TelemetryConsent.ONCE, TelemetryConsent.ALWAYS]
+
+    def should_send_current(self) -> bool:
+        """
+        Decide whether the current error may be sent.
+        A ONCE consent is consumed by this call (reset to NOT_SET).
+        Returns:
+            True if current error should be sent
+        """
+        level = self.get_consent()
+        if level == TelemetryConsent.ALWAYS:
+            return True
+        if level != TelemetryConsent.ONCE:
+            return False
+        self.set_consent(TelemetryConsent.NOT_SET)  # one-shot consent is now spent
+        return True
+
+ def clear(self) -> bool:
+ """
+ Clear all telemetry settings.
+
+ Returns:
+ True if cleared successfully
+ """
+ if 'telemetry' in self._config:
+ del self._config['telemetry']
+ return self._save_config()
+ return True
+
+    def update_from_env(self) -> None:
+        """Update configuration from CRAWL4AI_TELEMETRY* environment variables."""
+        # Check for telemetry disable flag
+        if os.environ.get('CRAWL4AI_TELEMETRY') == '0':
+            self.set_consent(TelemetryConsent.DENIED)
+
+        # Check for email override
+        env_email = os.environ.get('CRAWL4AI_TELEMETRY_EMAIL')
+        if env_email and self.is_enabled():
+            # get_consent() maps legacy booleans; TelemetryConsent(True) would raise
+            self.set_consent(
+                self.get_consent(),
+                email=env_email
+            )
\ No newline at end of file
diff --git a/crawl4ai/telemetry/consent.py b/crawl4ai/telemetry/consent.py
new file mode 100644
index 00000000..a5232017
--- /dev/null
+++ b/crawl4ai/telemetry/consent.py
@@ -0,0 +1,314 @@
+"""
+User consent handling for Crawl4AI telemetry.
+Provides interactive prompts for different environments.
+"""
+
+import sys
+from typing import Optional, Tuple
+from .config import TelemetryConsent, TelemetryConfig
+from .environment import Environment, EnvironmentDetector
+
+
+class ConsentManager:
+ """Manages user consent for telemetry."""
+
+ def __init__(self, config: Optional[TelemetryConfig] = None):
+ """
+ Initialize consent manager.
+
+ Args:
+ config: Optional TelemetryConfig instance
+ """
+ self.config = config or TelemetryConfig()
+ self.environment = EnvironmentDetector.detect()
+
+    def check_and_prompt(self) -> TelemetryConsent:
+        """
+        Return the stored consent, asking the user only when none is recorded.
+        Order when unset: Docker/API default, interactive prompt, else DENIED.
+
+        Returns:
+            Current consent status
+        """
+        stored = self.config.get_consent()
+        if stored != TelemetryConsent.NOT_SET:
+            # A decision already exists - never re-prompt
+            return stored
+
+        # Docker/API server: default enabled (check env var)
+        if self.environment in (Environment.DOCKER, Environment.API_SERVER):
+            return self._handle_docker_consent()
+
+        # Non-interactive sessions cannot be asked: default disabled
+        if not EnvironmentDetector.is_interactive():
+            return TelemetryConsent.DENIED
+
+        # Interactive environments: prompt user
+        return self._prompt_for_consent()
+
+ def _handle_docker_consent(self) -> TelemetryConsent:
+ """
+ Handle consent in Docker environment.
+ Default enabled unless disabled via env var.
+ """
+ import os
+
+ if os.environ.get('CRAWL4AI_TELEMETRY') == '0':
+ self.config.set_consent(TelemetryConsent.DENIED)
+ return TelemetryConsent.DENIED
+
+ # Default enabled for Docker
+ self.config.set_consent(TelemetryConsent.ALWAYS)
+ return TelemetryConsent.ALWAYS
+
+ def _prompt_for_consent(self) -> TelemetryConsent:
+ """
+ Prompt user for consent based on environment.
+
+ Returns:
+ User's consent choice
+ """
+ if self.environment == Environment.CLI:
+ return self._cli_prompt()
+ elif self.environment in [Environment.JUPYTER, Environment.COLAB]:
+ return self._notebook_prompt()
+ else:
+ return TelemetryConsent.DENIED
+
+    def _cli_prompt(self) -> TelemetryConsent:
+        """
+        Show CLI prompt for consent and persist the answer.
+        Ctrl-C / EOF at the choice prompt is treated as a denial.
+        Returns:
+            User's consent choice
+        """
+        print("\n" + "="*60)
+        print("🚨 Crawl4AI Error Detection")
+        print("="*60)
+        print("\nWe noticed an error occurred. Help improve Crawl4AI by")
+        print("sending anonymous crash reports?")
+        print("\n[1] Yes, send this error only")
+        print("[2] Yes, always send errors")
+        print("[3] No, don't send")
+        print("\n" + "-"*60)
+
+        # Get choice (re-ask until the answer is one of 1/2/3)
+        while True:
+            try:
+                choice = input("Your choice (1/2/3): ").strip()
+                if choice == '1':
+                    consent = TelemetryConsent.ONCE
+                    break
+                elif choice == '2':
+                    consent = TelemetryConsent.ALWAYS
+                    break
+                elif choice == '3':
+                    consent = TelemetryConsent.DENIED
+                    break
+                else:
+                    print("Please enter 1, 2, or 3")
+            except (KeyboardInterrupt, EOFError):
+                # User cancelled - treat as denial
+                consent = TelemetryConsent.DENIED
+                break
+
+        # Optional email (only kept when it contains an '@')
+        email = None
+        if consent != TelemetryConsent.DENIED:
+            print("\nOptional: Enter email for follow-up (or press Enter to skip):")
+            try:
+                email_input = input("Email: ").strip()
+                if email_input and '@' in email_input:
+                    email = email_input
+            except (KeyboardInterrupt, EOFError):
+                pass
+
+        # Save choice
+        self.config.set_consent(consent, email)
+
+        if consent != TelemetryConsent.DENIED:
+            print("\n✅ Thank you for helping improve Crawl4AI!")
+        else:
+            print("\n✅ Telemetry disabled. You can enable it anytime with:")
+            print(" crwl telemetry enable")
+
+        print("="*60 + "\n")
+
+        return consent
+
+ def _notebook_prompt(self) -> TelemetryConsent:
+ """
+ Show notebook prompt for consent.
+ Uses widgets if available, falls back to print + code.
+
+ Returns:
+ User's consent choice
+ """
+ if EnvironmentDetector.supports_widgets():
+ return self._widget_prompt()
+ else:
+ return self._notebook_fallback_prompt()
+
+    def _widget_prompt(self) -> TelemetryConsent:
+        """
+        Show interactive widget prompt in Jupyter/Colab.
+
+        Returns:
+            Choice made so far; NOT_SET while no button has been clicked
+        """
+        try:
+            import ipywidgets as widgets
+            from IPython.display import display, HTML
+
+            # Create styled HTML
+            html = HTML("""
+
+
🚨 Crawl4AI Error Detected
+
Help us improve by sending anonymous crash reports?
+
+            """)
+            display(html)
+
+            # Create buttons
+            btn_once = widgets.Button(
+                description='Send this error',
+                button_style='info',
+                icon='check'
+            )
+            btn_always = widgets.Button(
+                description='Always send',
+                button_style='success',
+                icon='check-circle'
+            )
+            btn_never = widgets.Button(
+                description='Don\'t send',
+                button_style='danger',
+                icon='times'
+            )
+
+            # Email input
+            email_input = widgets.Text(
+                placeholder='Optional: your@email.com',
+                description='Email:',
+                style={'description_width': 'initial'}
+            )
+
+            # Output area for feedback
+            output = widgets.Output()
+
+            # Container
+            button_box = widgets.HBox([btn_once, btn_always, btn_never])
+            container = widgets.VBox([button_box, email_input, output])
+
+            # Variable to store choice (None until a button is clicked)
+            consent_choice = {'value': None}
+
+            def on_button_click(btn):
+                """Record the clicked choice, persist it, and lock the controls."""
+                with output:
+                    output.clear_output()
+
+                    if btn == btn_once:
+                        consent_choice['value'] = TelemetryConsent.ONCE
+                        print("✅ Sending this error only")
+                    elif btn == btn_always:
+                        consent_choice['value'] = TelemetryConsent.ALWAYS
+                        print("✅ Always sending errors")
+                    else:
+                        consent_choice['value'] = TelemetryConsent.DENIED
+                        print("✅ Telemetry disabled")
+
+                    # Save with email if provided (blank/whitespace-only counts as none)
+                    email = (email_input.value or '').strip() or None
+                    self.config.set_consent(consent_choice['value'], email)
+
+                    # Disable buttons after choice
+                    btn_once.disabled = True
+                    btn_always.disabled = True
+                    btn_never.disabled = True
+                    email_input.disabled = True
+
+            # Attach handlers
+            btn_once.on_click(on_button_click)
+            btn_always.on_click(on_button_click)
+            btn_never.on_click(on_button_click)
+
+            # Display widget
+            display(container)
+
+            # display() does not block, so usually no button was clicked yet: report
+            # NOT_SET rather than the None placeholder; the callback saves the choice
+            return consent_choice['value'] or TelemetryConsent.NOT_SET
+
+        except Exception:
+            # Fallback if widgets fail
+            return self._notebook_fallback_prompt()
+
+ def _notebook_fallback_prompt(self) -> TelemetryConsent:
+ """
+ Fallback prompt for notebooks without widget support.
+
+ Returns:
+ User's consent choice (defaults to DENIED)
+ """
+ try:
+ from IPython.display import display, Markdown
+
+ markdown_content = """
+### 🚨 Crawl4AI Error Detected
+
+Help us improve by sending anonymous crash reports.
+
+**Telemetry is currently OFF.** To enable, run:
+
+```python
+import crawl4ai
+crawl4ai.telemetry.enable(email="your@email.com", always=True)
+```
+
+To send just this error:
+```python
+crawl4ai.telemetry.enable(once=True)
+```
+
+To keep telemetry disabled:
+```python
+crawl4ai.telemetry.disable()
+```
+ """
+
+ display(Markdown(markdown_content))
+
+ except ImportError:
+ # Pure print fallback
+ print("\n" + "="*60)
+ print("🚨 Crawl4AI Error Detected")
+ print("="*60)
+ print("\nTelemetry is OFF. To enable, run:")
+ print("\nimport crawl4ai")
+ print('crawl4ai.telemetry.enable(email="you@example.com", always=True)')
+ print("\n" + "="*60)
+
+ # Default to disabled in fallback mode
+ return TelemetryConsent.DENIED
+
+ def force_prompt(self) -> Tuple[TelemetryConsent, Optional[str]]:
+ """
+ Force a consent prompt regardless of current settings.
+ Used for manual telemetry configuration.
+
+ Returns:
+ Tuple of (consent choice, optional email)
+ """
+ # Temporarily reset consent to force prompt
+ original_consent = self.config.get_consent()
+ self.config.set_consent(TelemetryConsent.NOT_SET)
+
+ try:
+ new_consent = self._prompt_for_consent()
+ email = self.config.get_email()
+ return new_consent, email
+ except Exception:
+ # Restore original on error
+ self.config.set_consent(original_consent)
+ raise
\ No newline at end of file
diff --git a/crawl4ai/telemetry/environment.py b/crawl4ai/telemetry/environment.py
new file mode 100644
index 00000000..816ae028
--- /dev/null
+++ b/crawl4ai/telemetry/environment.py
@@ -0,0 +1,199 @@
+"""
+Environment detection for Crawl4AI telemetry.
+Detects whether we're running in CLI, Docker, Jupyter, etc.
+"""
+
+import os
+import sys
+from enum import Enum
+from typing import Optional
+
+
+class Environment(Enum):
+    """Runtime environments that telemetry distinguishes between."""
+    CLI = "cli"
+    DOCKER = "docker"
+    JUPYTER = "jupyter"
+    COLAB = "colab"
+    API_SERVER = "api_server"
+    UNKNOWN = "unknown"
+
+
+class EnvironmentDetector:
+    """Detects the current runtime environment from process-level signals."""
+
+    @staticmethod
+    def detect() -> Environment:
+        """
+        Detect current runtime environment.
+
+        Checks run in a fixed order and the first match wins: Docker (refined
+        to API_SERVER when the API server indicators are present), Colab,
+        Jupyter, then CLI.
+
+        Returns:
+            Environment enum value (UNKNOWN when nothing matches)
+        """
+        if EnvironmentDetector._is_docker():
+            if EnvironmentDetector._is_api_server():
+                return Environment.API_SERVER
+            return Environment.DOCKER
+
+        # Colab is checked before Jupyter so that it is reported as COLAB
+        if EnvironmentDetector._is_colab():
+            return Environment.COLAB
+
+        if EnvironmentDetector._is_jupyter():
+            return Environment.JUPYTER
+
+        if EnvironmentDetector._is_cli():
+            return Environment.CLI
+
+        return Environment.UNKNOWN
+
+    @staticmethod
+    def _stdin_is_tty() -> bool:
+        """
+        Return True when stdin is attached to a terminal.
+
+        sys.stdin can be None (for example under pythonw or a detached
+        service) and isatty() raises ValueError on a closed stream; both
+        cases are reported as "not a terminal" instead of raising.
+        """
+        stdin = sys.stdin
+        try:
+            return stdin is not None and bool(stdin.isatty())
+        except (AttributeError, ValueError):
+            return False
+
+    @staticmethod
+    def _is_docker() -> bool:
+        """
+        Check if running inside Docker container.
+
+        Any one of three signals is sufficient: the /.dockerenv marker file,
+        the word 'docker' in /proc/1/cgroup, or CRAWL4AI_DOCKER=true.
+        """
+        if os.path.exists('/.dockerenv'):
+            return True
+
+        try:
+            with open('/proc/1/cgroup', 'r') as f:
+                # Only a positive match is conclusive; a readable cgroup file
+                # without the signature must still fall through to the
+                # environment-variable check below.
+                if 'docker' in f.read():
+                    return True
+        except OSError:
+            pass
+
+        # Explicit opt-in (e.g. set in the Dockerfile)
+        return os.environ.get('CRAWL4AI_DOCKER', '').lower() == 'true'
+
+    @staticmethod
+    def _is_api_server() -> bool:
+        """
+        Check if running as API server.
+
+        True when CRAWL4AI_API_SERVER=true or when the command line mentions
+        one of the deploy/docker entry-point scripts.
+        """
+        if os.environ.get('CRAWL4AI_API_SERVER', '').lower() == 'true':
+            return True
+
+        command_line = ' '.join(sys.argv)
+        entry_points = ('deploy/docker/server.py', 'deploy/docker/api.py')
+        return any(script in command_line for script in entry_points)
+
+    @staticmethod
+    def _is_jupyter() -> bool:
+        """Check if running in Jupyter notebook (an IPython shell with a kernel)."""
+        try:
+            from IPython import get_ipython
+            ipython = get_ipython()
+
+            if ipython is None:
+                return False
+
+            # Kernel-backed sessions carry an IPKernelApp config section
+            if 'IPKernelApp' in ipython.config:
+                return True
+
+            if hasattr(ipython, 'kernel'):
+                return True
+
+        except (ImportError, AttributeError):
+            pass
+
+        return False
+
+    @staticmethod
+    def _is_colab() -> bool:
+        """Check if running in Google Colab."""
+        try:
+            import google.colab  # noqa: F401 - imported only to test availability
+            return True
+        except ImportError:
+            pass
+
+        # Alternative check via Colab-specific environment variables
+        return 'COLAB_GPU' in os.environ or 'COLAB_TPU_ADDR' in os.environ
+
+    @staticmethod
+    def _is_cli() -> bool:
+        """
+        Check if running from command line.
+
+        True for an interactive interpreter (sys.ps1 is defined), a terminal
+        on stdin, or a non-empty TERM variable.
+        """
+        return (
+            hasattr(sys, 'ps1') or
+            EnvironmentDetector._stdin_is_tty() or
+            bool(os.environ.get('TERM'))
+        )
+
+    @staticmethod
+    def is_interactive() -> bool:
+        """
+        Check if environment supports interactive prompts.
+
+        Returns:
+            True if interactive prompts are supported
+        """
+        env = EnvironmentDetector.detect()
+
+        # Docker/API server are non-interactive
+        if env in (Environment.DOCKER, Environment.API_SERVER):
+            return False
+
+        # CLI is interactive only when stdin is a real terminal
+        if env == Environment.CLI:
+            return EnvironmentDetector._stdin_is_tty()
+
+        # Jupyter/Colab are treated as interactive (widgets or snippets);
+        # anything else (UNKNOWN) is not
+        return env in (Environment.JUPYTER, Environment.COLAB)
+
+    @staticmethod
+    def supports_widgets() -> bool:
+        """
+        Check if environment supports IPython widgets.
+
+        Returns:
+            True if running in Jupyter/Colab and ipywidgets is importable
+        """
+        env = EnvironmentDetector.detect()
+
+        if env not in (Environment.JUPYTER, Environment.COLAB):
+            return False
+
+        try:
+            import ipywidgets  # noqa: F401 - imported only to test availability
+            from IPython.display import display  # noqa: F401
+            return True
+        except ImportError:
+            return False
+
+    @staticmethod
+    def get_environment_context() -> dict:
+        """
+        Get environment context for telemetry.
+
+        Returns:
+            Dictionary with 'environment_type', 'python_version' and
+            'platform', plus a few keys specific to Docker, Colab or Jupyter
+        """
+        env = EnvironmentDetector.detect()
+
+        context = {
+            'environment_type': env.value,
+            'python_version': f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
+            'platform': sys.platform,
+        }
+
+        if env == Environment.DOCKER:
+            context['docker'] = True
+            context['container_id'] = os.environ.get('HOSTNAME', 'unknown')
+
+        elif env == Environment.COLAB:
+            context['colab'] = True
+            context['gpu'] = bool(os.environ.get('COLAB_GPU'))
+
+        elif env == Environment.JUPYTER:
+            context['jupyter'] = True
+
+        return context
\ No newline at end of file
diff --git a/crawl4ai/telemetry/providers/__init__.py b/crawl4ai/telemetry/providers/__init__.py
new file mode 100644
index 00000000..0d332bb7
--- /dev/null
+++ b/crawl4ai/telemetry/providers/__init__.py
@@ -0,0 +1,15 @@
+"""
+Telemetry providers for Crawl4AI.
+"""
+
+from ..base import TelemetryProvider, NullProvider
+
+# Public names of this package; extended below when the Sentry provider
+# module can be imported.
+__all__ = ['TelemetryProvider', 'NullProvider']
+
+try:
+    from .sentry import SentryProvider
+except ImportError:
+    # Sentry provider unavailable: export only the base providers
+    pass
+else:
+    __all__.append('SentryProvider')
\ No newline at end of file
diff --git a/crawl4ai/telemetry/providers/sentry.py b/crawl4ai/telemetry/providers/sentry.py
new file mode 100644
index 00000000..34c90f2f
--- /dev/null
+++ b/crawl4ai/telemetry/providers/sentry.py
@@ -0,0 +1,234 @@
+"""
+Sentry telemetry provider for Crawl4AI.
+"""
+
+import os
+from typing import Dict, Any, Optional
+from ..base import TelemetryProvider
+
+# Hardcoded DSN for the Crawl4AI Sentry project.
+# Embedding it is safe because a DSN only carries the public key.
+# TODO: Replace with actual Crawl4AI Sentry project DSN before release;
+# the value below is a placeholder.
+# Format: "https://<public-key>@<org>.ingest.sentry.io/<project-id>"
+DEFAULT_SENTRY_DSN = "https://your-public-key@sentry.io/your-project-id"
+
+
+class SentryProvider(TelemetryProvider):
+    """Telemetry provider that reports exceptions and events through sentry_sdk."""
+
+    def __init__(self, dsn: Optional[str] = None, **kwargs):
+        """
+        Initialize Sentry provider.
+
+        Args:
+            dsn: Optional DSN override (for testing/development)
+            **kwargs: Forwarded to the base provider; 'environment' and
+                'release' are additionally read here for the Sentry client
+        """
+        super().__init__(**kwargs)
+
+        # Precedence: explicit argument, CRAWL4AI_SENTRY_DSN, built-in default
+        self.dsn = (
+            dsn or
+            os.environ.get('CRAWL4AI_SENTRY_DSN') or
+            DEFAULT_SENTRY_DSN
+        )
+
+        # The SDK module is imported lazily in initialize()
+        self._sentry_sdk = None
+        self.environment = kwargs.get('environment', 'production')
+        self.release = kwargs.get('release', None)
+
+    def initialize(self) -> bool:
+        """
+        Initialize Sentry SDK.
+
+        Returns:
+            True when the SDK was imported and configured; False when it is
+            not installed or configuration raised
+        """
+        try:
+            import sentry_sdk
+            from sentry_sdk.integrations.stdlib import StdlibIntegration
+            from sentry_sdk.integrations.excepthook import ExcepthookIntegration
+
+            # NOTE(review): passing `integrations` adds to the SDK's default
+            # integrations rather than replacing them; if a truly minimal set
+            # is intended, default_integrations=False is also needed - confirm.
+            sentry_sdk.init(
+                dsn=self.dsn,
+                environment=self.environment,
+                release=self.release,
+                # Errors only: no performance transactions
+                traces_sample_rate=0.0,
+                integrations=[
+                    StdlibIntegration(),
+                    ExcepthookIntegration(always_run=False),
+                ],
+                # Privacy settings
+                send_default_pii=False,
+                attach_stacktrace=True,
+                # Extra sanitization right before an event leaves the process
+                before_send=self._before_send,
+                # No automatic breadcrumbs
+                max_breadcrumbs=0,
+            )
+
+            self._sentry_sdk = sentry_sdk
+            self._initialized = True
+            return True
+
+        except ImportError:
+            # Sentry SDK not installed
+            return False
+        except Exception:
+            # Telemetry must never break the host application
+            return False
+
+    def _before_send(self, event: Dict[str, Any], hint: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Process event before sending to Sentry.
+
+        Reduces request data to safe fields, drops frame-local variables
+        from exception stack traces and then applies the provider's general
+        sanitize_data() pass.
+
+        Returns:
+            The sanitized event
+        """
+        if 'request' in event:
+            event['request'] = self._sanitize_request(event['request'])
+
+        # Local variables may hold crawled content or credentials
+        if 'exception' in event and 'values' in event['exception']:
+            for exc in event['exception']['values']:
+                if 'stacktrace' in exc and 'frames' in exc['stacktrace']:
+                    for frame in exc['stacktrace']['frames']:
+                        frame.pop('vars', None)
+
+        event = self.sanitize_data(event)
+
+        return event
+
+    def _sanitize_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Sanitize request data to remove sensitive information.
+
+        Only 'method' and 'url' survive; every other field (cookies, headers,
+        data, query_string, env, ...) is dropped. The query string and
+        fragment are also cut from 'url', since they carry the same data as
+        the dropped 'query_string' field.
+        """
+        safe_fields = ('method', 'url')
+        sanitized = {k: v for k, v in request_data.items() if k in safe_fields}
+
+        url = sanitized.get('url')
+        if isinstance(url, str):
+            sanitized['url'] = url.split('#', 1)[0].split('?', 1)[0]
+
+        return sanitized
+
+    def _new_scope(self):
+        """Return the SDK's isolated-scope context manager (new_scope, else push_scope)."""
+        sdk = self._sentry_sdk
+        # push_scope is deprecated in sentry-sdk 2.x; keep it as a fallback
+        factory = getattr(sdk, 'new_scope', None) or sdk.push_scope
+        return factory()
+
+    def send_exception(
+        self,
+        exc: Exception,
+        context: Optional[Dict[str, Any]] = None
+    ) -> bool:
+        """
+        Send exception to Sentry.
+
+        Args:
+            exc: Exception to report
+            context: Optional context (email, environment info)
+
+        Returns:
+            True if the exception was handed to the SDK; False if the SDK is
+            unavailable or reporting raised
+        """
+        if not self._initialized and not self.initialize():
+            return False
+
+        if not self._sentry_sdk:
+            return False
+
+        # A missing context must not prevent the report from being sent
+        context = context or {}
+
+        try:
+            with self._new_scope() as scope:
+                email = context.get('email')
+                if email:
+                    scope.set_user({'email': email})
+
+                for key, value in context.items():
+                    if key == 'email':
+                        continue
+                    # Sentry contexts are dictionaries; wrap scalar values
+                    if not isinstance(value, dict):
+                        value = {'value': value}
+                    scope.set_context(key, value)
+
+                # Tags for filtering
+                scope.set_tag('source', context.get('source', 'unknown'))
+                scope.set_tag('environment_type', context.get('environment_type', 'unknown'))
+
+                self._sentry_sdk.capture_exception(exc)
+
+            return True
+
+        except Exception:
+            # Silently fail - telemetry should never crash the app
+            return False
+
+    def send_event(
+        self,
+        event_name: str,
+        payload: Optional[Dict[str, Any]] = None
+    ) -> bool:
+        """
+        Send custom event to Sentry.
+
+        Args:
+            event_name: Name of the event
+            payload: Event data; passed through sanitize_data() before sending
+
+        Returns:
+            True if the message was handed to the SDK; False otherwise
+        """
+        if not self._initialized and not self.initialize():
+            return False
+
+        if not self._sentry_sdk:
+            return False
+
+        try:
+            safe_payload = self.sanitize_data(payload) if payload else {}
+
+            # Sent as an info-level message with the payload as extra data
+            self._sentry_sdk.capture_message(
+                event_name,
+                level='info',
+                extras=safe_payload
+            )
+            return True
+
+        except Exception:
+            return False
+
+    def flush(self) -> None:
+        """Flush pending events to Sentry, waiting at most two seconds."""
+        if self._initialized and self._sentry_sdk:
+            try:
+                self._sentry_sdk.flush(timeout=2.0)
+            except Exception:
+                # Best effort only
+                pass
+
+    def shutdown(self) -> None:
+        """Flush pending events and mark the provider as uninitialized."""
+        if self._initialized and self._sentry_sdk:
+            try:
+                # No dedicated SDK shutdown call is used here; a final flush
+                # is the cleanup
+                self.flush()
+            finally:
+                self._initialized = False
\ No newline at end of file
diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt
index d463c641..144c440a 100644
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -15,3 +15,4 @@ PyJWT==2.10.1
mcp>=1.6.0
websockets>=15.0.1
httpx[http2]>=0.27.2
+sentry-sdk>=2.0.0
diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index 57fd3d6d..6298e301 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -74,6 +74,32 @@ setup_logging(config)
__version__ = "0.5.1-d1"
+# ───────────────────── telemetry setup ────────────────────────
+# Docker/API server telemetry: enabled by default unless CRAWL4AI_TELEMETRY=0
+import logging
+import os as _os
+
+telemetry_logger = logging.getLogger("telemetry")
+
+
+def _noop_capture_exception(exc, context=None):
+    """Stand-in for capture_exception when telemetry is off or unavailable."""
+
+
+if _os.environ.get('CRAWL4AI_TELEMETRY') != '0':
+    # Set environment variable to indicate we're in API server mode
+    _os.environ['CRAWL4AI_API_SERVER'] = 'true'
+
+    try:
+        # Import and enable telemetry for Docker/API environment
+        from crawl4ai.telemetry import enable as enable_telemetry
+        from crawl4ai.telemetry import capture_exception
+
+        # Enable telemetry automatically in Docker mode
+        enable_telemetry(always=True)
+        telemetry_logger.info("✅ Telemetry enabled for Docker/API server")
+    except Exception as _telemetry_error:
+        # A telemetry failure must not stop the server from starting
+        capture_exception = _noop_capture_exception
+        telemetry_logger.warning(
+            "Telemetry setup failed; continuing without it: %s", _telemetry_error
+        )
+else:
+    # Telemetry explicitly disabled: callers still get a callable
+    capture_exception = _noop_capture_exception
+    telemetry_logger.info("❌ Telemetry disabled via CRAWL4AI_TELEMETRY=0")
+
# ── global page semaphore (hard cap) ─────────────────────────
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
diff --git a/docs/md_v2/core/telemetry.md b/docs/md_v2/core/telemetry.md
new file mode 100644
index 00000000..2afedf36
--- /dev/null
+++ b/docs/md_v2/core/telemetry.md
@@ -0,0 +1,242 @@
+# Telemetry
+
+Crawl4AI includes **opt-in telemetry** to help improve stability by capturing anonymous crash reports. No personal data or crawled content is ever collected.
+
+!!! info "Privacy First"
+    Telemetry is completely optional and respects your privacy. Only exception information is collected - no URLs, no personal data, no crawled content.
+
+## Overview
+
+- **Privacy-first**: Only exceptions and crashes are reported
+- **Opt-in outside Docker**: You decide when telemetry is enabled; in Docker it is on by default and can be switched off with `CRAWL4AI_TELEMETRY=0`
+- **No PII**: No URLs, request data, or personal information is collected
+- **Provider-agnostic**: Currently uses Sentry, but designed to support multiple backends
+
+## Installation
+
+Telemetry requires the optional Sentry SDK:
+
+```bash
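+# Install with telemetry support (quoted so the shell does not expand the brackets)
+pip install "crawl4ai[telemetry]"
+
+# Or install Sentry SDK separately (quoted so ">" is not treated as a redirect)
+pip install "sentry-sdk>=2.0.0"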
+```
+
+## Environments
+
+### 1. Python Library & CLI
+
+On first exception, you'll see an interactive prompt:
+
+```
+🚨 Crawl4AI Error Detection
+==============================================================
+We noticed an error occurred. Help improve Crawl4AI by
+sending anonymous crash reports?
+
+[1] Yes, send this error only
+[2] Yes, always send errors
+[3] No, don't send
+
+Your choice (1/2/3):
+```
+
+Control via CLI:
+```bash
+# Enable telemetry
+crwl telemetry enable
+crwl telemetry enable --email you@example.com
+
+# Disable telemetry
+crwl telemetry disable
+
+# Check status
+crwl telemetry status
+```
+
+### 2. Docker / API Server
+
+!!! warning "Default Enabled in Docker"
+    Telemetry is **enabled by default** in Docker environments to help identify container-specific issues. This is different from the CLI, where it is opt-in.
+
+To disable:
+```bash
+# Via environment variable
+docker run -e CRAWL4AI_TELEMETRY=0 ...
+
+# In docker-compose.yml
+environment:
+ - CRAWL4AI_TELEMETRY=0
+```
+
+### 3. Jupyter / Google Colab
+
+In notebooks, you'll see an interactive widget (if available) or a code snippet:
+
+```python
+import crawl4ai
+
+# Enable telemetry
+crawl4ai.telemetry.enable(email="you@example.com", always=True)
+
+# Send only next error
+crawl4ai.telemetry.enable(once=True)
+
+# Disable telemetry
+crawl4ai.telemetry.disable()
+
+# Check status
+crawl4ai.telemetry.status()
+```
+
+## Python API
+
+### Basic Usage
+
+```python
+from crawl4ai import telemetry
+
+# Enable/disable telemetry
+telemetry.enable(email="optional@email.com", always=True)
+telemetry.disable()
+
+# Check current status
+status = telemetry.status()
+print(f"Telemetry enabled: {status['enabled']}")
+print(f"Consent: {status['consent']}")
+```
+
+### Manual Exception Capture
+
+```python
+from crawl4ai.telemetry import capture_exception
+
+try:
+    # Your code here
+    risky_operation()
+except Exception as e:
+    # Manually capture exception with context
+    capture_exception(e, {
+        'operation': 'custom_crawler',
+        'url': 'https://example.com'  # Will be sanitized
+    })
+    raise
+```
+
+### Decorator Pattern
+
+```python
+from crawl4ai.telemetry import telemetry_decorator
+
+@telemetry_decorator
+def my_crawler_function():
+    # Exceptions will be automatically captured
+    pass
+```
+
+### Context Manager
+
+```python
+from crawl4ai.telemetry import telemetry_context
+
+with telemetry_context("data_extraction"):
+    # Any exceptions in this block will be captured
+    result = extract_data(html)
+```
+
+## Configuration
+
+Settings are stored in `~/.crawl4ai/config.json`:
+
+```json
+{
+ "telemetry": {
+ "consent": "always",
+ "email": "user@example.com"
+ }
+}
+```
+
+Consent levels:
+- `"not_set"` - No decision made yet
+- `"denied"` - Telemetry disabled
+- `"once"` - Send current error only
+- `"always"` - Always send errors
+
+## Environment Variables
+
+- `CRAWL4AI_TELEMETRY=0` - Disable telemetry (overrides config)
+- `CRAWL4AI_TELEMETRY_EMAIL=email@example.com` - Set email for follow-up
+- `CRAWL4AI_SENTRY_DSN=https://...` - Override default DSN (for maintainers)
+
+## What's Collected
+
+### Collected ✅
+- Exception type and traceback
+- Crawl4AI version
+- Python version
+- Operating system
+- Environment type (CLI, Docker, Jupyter)
+- Optional email (if provided)
+
+### NOT Collected ❌
+- URLs being crawled
+- HTML content
+- Request/response data
+- Cookies or authentication tokens
+- IP addresses
+- Any personally identifiable information other than the email address you optionally provide
+
+## Provider Architecture
+
+Telemetry is designed to be provider-agnostic:
+
+```python
+from crawl4ai.telemetry.base import TelemetryProvider
+
+class CustomProvider(TelemetryProvider):
+    def send_exception(self, exc, context=None):
+        # Your implementation
+        pass
+```
+
+## FAQ
+
+### Q: Can I completely disable telemetry?
+A: Yes! Use `crwl telemetry disable` or set `CRAWL4AI_TELEMETRY=0`
+
+### Q: Is telemetry required?
+A: No. It is opt-in everywhere except Docker, where it is enabled by default and can be turned off with `CRAWL4AI_TELEMETRY=0`
+
+### Q: What if I don't install sentry-sdk?
+A: Telemetry will gracefully degrade to a no-op state
+
+### Q: Can I see what's being sent?
+A: Yes, check the source code in `crawl4ai/telemetry/`
+
+### Q: How do I remove my email?
+A: Delete `~/.crawl4ai/config.json` or edit it to remove the email field
+
+## Privacy Commitment
+
+1. **Transparency**: All telemetry code is open source
+2. **Control**: You can enable/disable at any time
+3. **Minimal**: Only crash data, no user content
+4. **Secure**: Data transmitted over HTTPS to Sentry
+5. **Anonymous**: No tracking; reports are not linked to you unless you choose to provide an email address
+
+## Contributing
+
+Help improve telemetry:
+- Report issues with telemetry itself
+- Suggest privacy improvements
+- Add new provider backends
+
+## Support
+
+If you have concerns about telemetry:
+- Open an issue on GitHub
+- Email the maintainers
+- Review the code in `crawl4ai/telemetry/`
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index ff148547..bb725776 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -35,6 +35,7 @@ nav:
- "Page Interaction": "core/page-interaction.md"
- "Content Selection": "core/content-selection.md"
- "Cache Modes": "core/cache-modes.md"
+ - "Telemetry": "core/telemetry.md"
- "Local Files & Raw HTML": "core/local-files.md"
- "Link & Media": "core/link-media.md"
- Advanced:
diff --git a/pyproject.toml b/pyproject.toml
index 9b00bd28..1abc1832 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,6 +64,7 @@ torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"]
+telemetry = ["sentry-sdk>=2.0.0", "ipywidgets>=8.0.0"]
all = [
"PyPDF2",
"torch",
@@ -72,7 +73,9 @@ all = [
"transformers",
"tokenizers",
"sentence-transformers",
- "selenium"
+ "selenium",
+ "sentry-sdk>=2.0.0",
+ "ipywidgets>=8.0.0"
]
[project.scripts]
diff --git a/tests/telemetry/test_telemetry.py b/tests/telemetry/test_telemetry.py
new file mode 100644
index 00000000..f15ea346
--- /dev/null
+++ b/tests/telemetry/test_telemetry.py
@@ -0,0 +1,237 @@
+"""
+Tests for Crawl4AI telemetry functionality.
+"""
+
+import pytest
+import os
+import tempfile
+from pathlib import Path
+import json
+from unittest.mock import Mock, patch, MagicMock
+
+from crawl4ai.telemetry import (
+ TelemetryManager,
+ capture_exception,
+ enable,
+ disable,
+ status
+)
+from crawl4ai.telemetry.config import TelemetryConfig, TelemetryConsent
+from crawl4ai.telemetry.environment import Environment, EnvironmentDetector
+from crawl4ai.telemetry.base import NullProvider
+from crawl4ai.telemetry.consent import ConsentManager
+
+
+class TestTelemetryConfig:
+    """Test telemetry configuration management."""
+
+    def test_config_initialization(self):
+        """A fresh config uses the given directory and has no consent recorded."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = TelemetryConfig(config_dir=Path(tmpdir))
+            assert config.config_dir == Path(tmpdir)
+            assert config.get_consent() == TelemetryConsent.NOT_SET
+
+    def test_consent_persistence(self):
+        """Consent and email written by one instance are read back by another."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = TelemetryConfig(config_dir=Path(tmpdir))
+
+            config.set_consent(TelemetryConsent.ALWAYS, email="test@example.com")
+
+            # A second instance on the same directory must see the saved values
+            config2 = TelemetryConfig(config_dir=Path(tmpdir))
+            assert config2.get_consent() == TelemetryConsent.ALWAYS
+            assert config2.get_email() == "test@example.com"
+
+    def test_environment_variable_override(self):
+        """CRAWL4AI_TELEMETRY=0 turns a stored ALWAYS consent into DENIED."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = TelemetryConfig(config_dir=Path(tmpdir))
+            config.set_consent(TelemetryConsent.ALWAYS)
+
+            # patch.dict restores the previous environment on exit, including
+            # a CRAWL4AI_TELEMETRY value that was set before the test ran
+            with patch.dict(os.environ, {'CRAWL4AI_TELEMETRY': '0'}):
+                config.update_from_env()
+                assert config.get_consent() == TelemetryConsent.DENIED
+
+
+class TestEnvironmentDetection:
+    """Checks for EnvironmentDetector.detect() under patched process signals."""
+
+    def test_cli_detection(self):
+        """With stdin reported as a TTY, detection yields CLI or UNKNOWN."""
+        acceptable = [Environment.CLI, Environment.UNKNOWN]
+        with patch('sys.stdin.isatty', return_value=True):
+            detected = EnvironmentDetector.detect()
+            # Should detect as CLI in most test environments
+            assert detected in acceptable
+
+    def test_docker_detection(self):
+        """CRAWL4AI_DOCKER=true is expected to be reported as DOCKER."""
+        docker_env = {'CRAWL4AI_DOCKER': 'true'}
+        with patch.dict(os.environ, docker_env):
+            detected = EnvironmentDetector.detect()
+            assert detected == Environment.DOCKER
+
+    def test_api_server_detection(self):
+        """Docker plus CRAWL4AI_API_SERVER=true is reported as API_SERVER."""
+        api_env = {'CRAWL4AI_API_SERVER': 'true', 'CRAWL4AI_DOCKER': 'true'}
+        with patch.dict(os.environ, api_env):
+            detected = EnvironmentDetector.detect()
+            assert detected == Environment.API_SERVER
+
+
+class TestTelemetryManager:
+    """Test the main telemetry manager."""
+
+    def test_singleton_pattern(self):
+        """get_instance() returns the same object on every call."""
+        manager1 = TelemetryManager.get_instance()
+        manager2 = TelemetryManager.get_instance()
+        assert manager1 is manager2
+
+    def test_exception_capture(self):
+        """capture_exception() consults the config's should_send_current()."""
+        mock_config = Mock()
+        mock_config.get_consent.return_value = TelemetryConsent.ALWAYS
+        mock_config.is_enabled.return_value = True
+        mock_config.should_send_current.return_value = True
+        mock_config.get_email.return_value = "test@example.com"
+        mock_config.update_from_env.return_value = None
+
+        mock_provider = Mock()
+        mock_provider.initialize.return_value = True
+        mock_provider.send_exception.return_value = True
+
+        # Config is patched first, then the provider, as in the nested form
+        with patch('crawl4ai.telemetry.TelemetryConfig', return_value=mock_config), \
+                patch('crawl4ai.telemetry.providers.sentry.SentryProvider', return_value=mock_provider):
+            manager = TelemetryManager()
+
+            manager.capture_exception(ValueError("Test error"), {'test': 'context'})
+
+            # Verify the exception was processed
+            assert mock_config.should_send_current.called
+
+    def test_null_provider_when_disabled(self):
+        """Test that NullProvider is used when telemetry is disabled."""
+        mock_config = Mock()
+        mock_config.get_consent.return_value = TelemetryConsent.DENIED
+        mock_config.is_enabled.return_value = False
+
+        with patch('crawl4ai.telemetry.TelemetryConfig', return_value=mock_config):
+            manager = TelemetryManager()
+            assert isinstance(manager._provider, NullProvider)
+
+
+class TestConsentManager:
+    """Consent decisions that ConsentManager makes for a Docker environment."""
+
+    def test_docker_default_enabled(self):
+        """With no stored consent, Docker records ALWAYS."""
+        detect_target = 'crawl4ai.telemetry.consent.EnvironmentDetector.detect'
+        with patch(detect_target, return_value=Environment.DOCKER):
+            stored = Mock()
+            stored.get_consent.return_value = TelemetryConsent.NOT_SET
+
+            ConsentManager(stored).check_and_prompt()
+
+            # Should be enabled by default in Docker
+            assert stored.set_consent.called
+            positional_args = stored.set_consent.call_args[0]
+            assert positional_args[0] == TelemetryConsent.ALWAYS
+
+    def test_docker_disabled_by_env(self):
+        """With CRAWL4AI_TELEMETRY=0, Docker records DENIED."""
+        detect_target = 'crawl4ai.telemetry.consent.EnvironmentDetector.detect'
+        with patch(detect_target, return_value=Environment.DOCKER):
+            with patch.dict(os.environ, {'CRAWL4AI_TELEMETRY': '0'}):
+                stored = Mock()
+                stored.get_consent.return_value = TelemetryConsent.NOT_SET
+
+                ConsentManager(stored).check_and_prompt()
+
+                # Should be disabled
+                assert stored.set_consent.called
+                positional_args = stored.set_consent.call_args[0]
+                assert positional_args[0] == TelemetryConsent.DENIED
+
+
+class TestPublicAPI:
+    """Module-level enable/disable/status delegate to the telemetry manager."""
+
+    @patch('crawl4ai.telemetry.get_telemetry')
+    def test_enable_function(self, mock_get_telemetry):
+        """enable() forwards email/always and passes once=False."""
+        manager = Mock()
+        mock_get_telemetry.return_value = manager
+
+        enable(email="test@example.com", always=True)
+
+        expected_kwargs = {
+            'email': "test@example.com",
+            'always': True,
+            'once': False,
+        }
+        manager.enable.assert_called_once_with(**expected_kwargs)
+
+    @patch('crawl4ai.telemetry.get_telemetry')
+    def test_disable_function(self, mock_get_telemetry):
+        """disable() calls the manager's disable() exactly once."""
+        manager = Mock()
+        mock_get_telemetry.return_value = manager
+
+        disable()
+
+        manager.disable.assert_called_once()
+
+    @patch('crawl4ai.telemetry.get_telemetry')
+    def test_status_function(self, mock_get_telemetry):
+        """status() returns what the manager's status() reports."""
+        reported = {
+            'enabled': True,
+            'consent': 'always',
+            'email': 'test@example.com'
+        }
+        manager = Mock()
+        manager.status.return_value = reported
+        mock_get_telemetry.return_value = manager
+
+        result = status()
+
+        assert result['enabled'] is True
+        assert result['consent'] == 'always'
+        assert result['email'] == 'test@example.com'
+
+
+class TestIntegration:
+    """Integration tests for telemetry with AsyncWebCrawler."""
+
+    @pytest.mark.asyncio
+    async def test_crawler_exception_capture(self):
+        """Smoke test: crawl an invalid URL while capture_exception is patched."""
+        from crawl4ai import AsyncWebCrawler
+
+        with patch('crawl4ai.telemetry.capture_exception'):
+            async with AsyncWebCrawler() as crawler:
+                try:
+                    # An invalid URL is used to provoke an error path
+                    await crawler.arun(url="not-a-valid-url")
+                except Exception:
+                    # Any crawl error is tolerated here
+                    pass
+
+            # Nothing is asserted about the patched function: the error may be
+            # handled internally, so this only checks the integration runs
+
+
+if __name__ == "__main__":
+    # Propagate pytest's exit status so a failing run is visible to the caller
+    raise SystemExit(pytest.main([__file__, "-v"]))
\ No newline at end of file