- Wrapped unquoted @scope/pkg values in double quotes across 19 SKILL.md files. - Added 'package' to ALLOWED_FIELDS in JS validator. - Added YAML validity regression test to test suite. - Updated package-lock.json. Fixes #79 Closes #80
466 lines
13 KiB
Markdown
466 lines
13 KiB
Markdown
---
|
|
name: azure-ai-voicelive-ts
|
|
description: |
|
|
Azure AI Voice Live SDK for JavaScript/TypeScript. Build real-time voice AI applications with bidirectional WebSocket communication. Use for voice assistants, conversational AI, real-time speech-to-speech, and voice-enabled chatbots in Node.js or browser environments. Triggers: "voice live", "real-time voice", "VoiceLiveClient", "VoiceLiveSession", "voice assistant TypeScript", "bidirectional audio", "speech-to-speech JavaScript".
|
|
package: "@azure/ai-voicelive"
|
|
---
|
|
|
|
# @azure/ai-voicelive (JavaScript/TypeScript)
|
|
|
|
Real-time voice AI SDK for building bidirectional voice assistants with Azure AI in Node.js and browser environments.
|
|
|
|
## Installation
|
|
|
|
```bash
|
|
npm install @azure/ai-voicelive @azure/identity
|
|
# TypeScript users
|
|
npm install @types/node
|
|
```
|
|
|
|
**Current Version**: 1.0.0-beta.3
|
|
|
|
**Supported Environments**:
|
|
- Node.js LTS versions (20+)
|
|
- Modern browsers (Chrome, Firefox, Safari, Edge)
|
|
|
|
## Environment Variables
|
|
|
|
```bash
|
|
AZURE_VOICELIVE_ENDPOINT=https://<resource>.cognitiveservices.azure.com
|
|
# Optional: API key if not using Entra ID
|
|
AZURE_VOICELIVE_API_KEY=<your-api-key>
|
|
# Optional: Logging
|
|
AZURE_LOG_LEVEL=info
|
|
```
|
|
|
|
## Authentication
|
|
|
|
### Microsoft Entra ID (Recommended)
|
|
|
|
```typescript
|
|
import { DefaultAzureCredential } from "@azure/identity";
|
|
import { VoiceLiveClient } from "@azure/ai-voicelive";
|
|
|
|
const credential = new DefaultAzureCredential();
|
|
const endpoint = "https://your-resource.cognitiveservices.azure.com";
|
|
|
|
const client = new VoiceLiveClient(endpoint, credential);
|
|
```
|
|
|
|
### API Key
|
|
|
|
```typescript
|
|
import { AzureKeyCredential } from "@azure/core-auth";
|
|
import { VoiceLiveClient } from "@azure/ai-voicelive";
|
|
|
|
const endpoint = "https://your-resource.cognitiveservices.azure.com";
|
|
const credential = new AzureKeyCredential("your-api-key");
|
|
|
|
const client = new VoiceLiveClient(endpoint, credential);
|
|
```
|
|
|
|
## Client Hierarchy
|
|
|
|
```
|
|
VoiceLiveClient
|
|
└── VoiceLiveSession (WebSocket connection)
|
|
    ├── updateSession() → Configure session options
|
|
    ├── subscribe() → Event handlers (Azure SDK pattern)
|
|
    ├── sendAudio() → Stream audio input
|
|
    ├── addConversationItem() → Add messages/function outputs
|
|
    └── sendEvent() → Send raw protocol events
|
|
```
|
|
|
|
## Quick Start
|
|
|
|
```typescript
|
|
import { DefaultAzureCredential } from "@azure/identity";
|
|
import { VoiceLiveClient } from "@azure/ai-voicelive";
|
|
|
|
const credential = new DefaultAzureCredential();
|
|
const endpoint = process.env.AZURE_VOICELIVE_ENDPOINT!;
|
|
|
|
// Create client and start session
|
|
const client = new VoiceLiveClient(endpoint, credential);
|
|
const session = await client.startSession("gpt-4o-mini-realtime-preview");
|
|
|
|
// Configure session
|
|
await session.updateSession({
|
|
modalities: ["text", "audio"],
|
|
instructions: "You are a helpful AI assistant. Respond naturally.",
|
|
voice: {
|
|
type: "azure-standard",
|
|
name: "en-US-AvaNeural",
|
|
},
|
|
turnDetection: {
|
|
type: "server_vad",
|
|
threshold: 0.5,
|
|
prefixPaddingMs: 300,
|
|
silenceDurationMs: 500,
|
|
},
|
|
inputAudioFormat: "pcm16",
|
|
outputAudioFormat: "pcm16",
|
|
});
|
|
|
|
// Subscribe to events
|
|
const subscription = session.subscribe({
|
|
onResponseAudioDelta: async (event, context) => {
|
|
// Handle streaming audio output
|
|
const audioData = event.delta;
|
|
playAudioChunk(audioData);
|
|
},
|
|
onResponseTextDelta: async (event, context) => {
|
|
// Handle streaming text
|
|
process.stdout.write(event.delta);
|
|
},
|
|
onConversationItemInputAudioTranscriptionCompleted: async (event, context) => {
|
|
console.log("User said:", event.transcript);
|
|
},
|
|
});
|
|
|
|
// Send audio from microphone
|
|
function sendAudioChunk(audioBuffer: ArrayBuffer) {
|
|
session.sendAudio(audioBuffer);
|
|
}
|
|
```
|
|
|
|
## Session Configuration
|
|
|
|
```typescript
|
|
await session.updateSession({
|
|
// Modalities
|
|
modalities: ["audio", "text"],
|
|
|
|
// System instructions
|
|
instructions: "You are a customer service representative.",
|
|
|
|
// Voice selection
|
|
voice: {
|
|
type: "azure-standard", // or "azure-custom", "openai"
|
|
name: "en-US-AvaNeural",
|
|
},
|
|
|
|
// Turn detection (VAD)
|
|
turnDetection: {
|
|
type: "server_vad", // or "azure_semantic_vad"
|
|
threshold: 0.5,
|
|
prefixPaddingMs: 300,
|
|
silenceDurationMs: 500,
|
|
},
|
|
|
|
// Audio formats
|
|
inputAudioFormat: "pcm16",
|
|
outputAudioFormat: "pcm16",
|
|
|
|
// Tools (function calling)
|
|
tools: [
|
|
{
|
|
type: "function",
|
|
name: "get_weather",
|
|
description: "Get current weather",
|
|
parameters: {
|
|
type: "object",
|
|
properties: {
|
|
location: { type: "string" }
|
|
},
|
|
required: ["location"]
|
|
}
|
|
}
|
|
],
|
|
toolChoice: "auto",
|
|
});
|
|
```
|
|
|
|
## Event Handling (Azure SDK Pattern)
|
|
|
|
The SDK uses a subscription-based event handling pattern:
|
|
|
|
```typescript
|
|
const subscription = session.subscribe({
|
|
// Connection lifecycle
|
|
onConnected: async (args, context) => {
|
|
console.log("Connected:", args.connectionId);
|
|
},
|
|
onDisconnected: async (args, context) => {
|
|
console.log("Disconnected:", args.code, args.reason);
|
|
},
|
|
onError: async (args, context) => {
|
|
console.error("Error:", args.error.message);
|
|
},
|
|
|
|
// Session events
|
|
onSessionCreated: async (event, context) => {
|
|
console.log("Session created:", context.sessionId);
|
|
},
|
|
onSessionUpdated: async (event, context) => {
|
|
console.log("Session updated");
|
|
},
|
|
|
|
// Audio input events (VAD)
|
|
onInputAudioBufferSpeechStarted: async (event, context) => {
|
|
console.log("Speech started at:", event.audioStartMs);
|
|
},
|
|
onInputAudioBufferSpeechStopped: async (event, context) => {
|
|
console.log("Speech stopped at:", event.audioEndMs);
|
|
},
|
|
|
|
// Transcription events
|
|
onConversationItemInputAudioTranscriptionCompleted: async (event, context) => {
|
|
console.log("User said:", event.transcript);
|
|
},
|
|
onConversationItemInputAudioTranscriptionDelta: async (event, context) => {
|
|
process.stdout.write(event.delta);
|
|
},
|
|
|
|
// Response events
|
|
onResponseCreated: async (event, context) => {
|
|
console.log("Response started");
|
|
},
|
|
onResponseDone: async (event, context) => {
|
|
console.log("Response complete");
|
|
},
|
|
|
|
// Streaming text
|
|
onResponseTextDelta: async (event, context) => {
|
|
process.stdout.write(event.delta);
|
|
},
|
|
onResponseTextDone: async (event, context) => {
|
|
console.log("\n--- Text complete ---");
|
|
},
|
|
|
|
// Streaming audio
|
|
onResponseAudioDelta: async (event, context) => {
|
|
const audioData = event.delta;
|
|
playAudioChunk(audioData);
|
|
},
|
|
onResponseAudioDone: async (event, context) => {
|
|
console.log("Audio complete");
|
|
},
|
|
|
|
// Audio transcript (what assistant said)
|
|
onResponseAudioTranscriptDelta: async (event, context) => {
|
|
process.stdout.write(event.delta);
|
|
},
|
|
|
|
// Function calling
|
|
onResponseFunctionCallArgumentsDone: async (event, context) => {
|
|
if (event.name === "get_weather") {
|
|
const args = JSON.parse(event.arguments);
|
|
const result = await getWeather(args.location);
|
|
|
|
await session.addConversationItem({
|
|
type: "function_call_output",
|
|
callId: event.callId,
|
|
output: JSON.stringify(result),
|
|
});
|
|
|
|
await session.sendEvent({ type: "response.create" });
|
|
}
|
|
},
|
|
|
|
// Catch-all for debugging
|
|
onServerEvent: async (event, context) => {
|
|
console.log("Event:", event.type);
|
|
},
|
|
});
|
|
|
|
// Clean up when done
|
|
await subscription.close();
|
|
```
|
|
|
|
## Function Calling
|
|
|
|
```typescript
|
|
// Define tools in session config
|
|
await session.updateSession({
|
|
modalities: ["audio", "text"],
|
|
instructions: "Help users with weather information.",
|
|
tools: [
|
|
{
|
|
type: "function",
|
|
name: "get_weather",
|
|
description: "Get current weather for a location",
|
|
parameters: {
|
|
type: "object",
|
|
properties: {
|
|
location: {
|
|
type: "string",
|
|
description: "City and state or country",
|
|
},
|
|
},
|
|
required: ["location"],
|
|
},
|
|
},
|
|
],
|
|
toolChoice: "auto",
|
|
});
|
|
|
|
// Handle function calls
|
|
const subscription = session.subscribe({
|
|
onResponseFunctionCallArgumentsDone: async (event, context) => {
|
|
if (event.name === "get_weather") {
|
|
const args = JSON.parse(event.arguments);
|
|
const weatherData = await fetchWeather(args.location);
|
|
|
|
// Send function result
|
|
await session.addConversationItem({
|
|
type: "function_call_output",
|
|
callId: event.callId,
|
|
output: JSON.stringify(weatherData),
|
|
});
|
|
|
|
// Trigger response generation
|
|
await session.sendEvent({ type: "response.create" });
|
|
}
|
|
},
|
|
});
|
|
```
|
|
|
|
## Voice Options
|
|
|
|
| Voice Type | Config | Example |
|
|
|------------|--------|---------|
|
|
| Azure Standard | `{ type: "azure-standard", name: "..." }` | `"en-US-AvaNeural"` |
|
|
| Azure Custom | `{ type: "azure-custom", name: "...", endpointId: "..." }` | Custom voice endpoint |
|
|
| Azure Personal | `{ type: "azure-personal", speakerProfileId: "..." }` | Personal voice clone |
|
|
| OpenAI | `{ type: "openai", name: "..." }` | `"alloy"`, `"echo"`, `"shimmer"` |
|
|
|
|
## Supported Models
|
|
|
|
| Model | Description | Use Case |
|
|
|-------|-------------|----------|
|
|
| `gpt-4o-realtime-preview` | GPT-4o with real-time audio | High-quality conversational AI |
|
|
| `gpt-4o-mini-realtime-preview` | Lightweight GPT-4o | Fast, efficient interactions |
|
|
| `phi4-mm-realtime` | Phi multimodal | Cost-effective applications |
|
|
|
|
## Turn Detection Options
|
|
|
|
```typescript
|
|
// Server VAD (default)
|
|
turnDetection: {
|
|
type: "server_vad",
|
|
threshold: 0.5,
|
|
prefixPaddingMs: 300,
|
|
silenceDurationMs: 500,
|
|
}
|
|
|
|
// Azure Semantic VAD (smarter detection)
|
|
turnDetection: {
|
|
type: "azure_semantic_vad",
|
|
}
|
|
|
|
// Azure Semantic VAD (English optimized)
|
|
turnDetection: {
|
|
type: "azure_semantic_vad_en",
|
|
}
|
|
|
|
// Azure Semantic VAD (Multilingual)
|
|
turnDetection: {
|
|
type: "azure_semantic_vad_multilingual",
|
|
}
|
|
```
|
|
|
|
## Audio Formats
|
|
|
|
| Format | Sample Rate | Use Case |
|
|
|--------|-------------|----------|
|
|
| `pcm16` | 24kHz | Default, high quality |
|
|
| `pcm16-8000hz` | 8kHz | Telephony |
|
|
| `pcm16-16000hz` | 16kHz | Voice assistants |
|
|
| `g711_ulaw` | 8kHz | Telephony (US) |
|
|
| `g711_alaw` | 8kHz | Telephony (EU) |
|
|
|
|
## Key Types Reference
|
|
|
|
| Type | Purpose |
|
|
|------|---------|
|
|
| `VoiceLiveClient` | Main client for creating sessions |
|
|
| `VoiceLiveSession` | Active WebSocket session |
|
|
| `VoiceLiveSessionHandlers` | Event handler interface |
|
|
| `VoiceLiveSubscription` | Active event subscription |
|
|
| `ConnectionContext` | Context for connection events |
|
|
| `SessionContext` | Context for session events |
|
|
| `ServerEventUnion` | Union of all server events |
|
|
|
|
## Error Handling
|
|
|
|
```typescript
|
|
import {
|
|
VoiceLiveError,
|
|
VoiceLiveConnectionError,
|
|
VoiceLiveAuthenticationError,
|
|
VoiceLiveProtocolError,
|
|
} from "@azure/ai-voicelive";
|
|
|
|
const subscription = session.subscribe({
|
|
onError: async (args, context) => {
|
|
const { error } = args;
|
|
|
|
if (error instanceof VoiceLiveConnectionError) {
|
|
console.error("Connection error:", error.message);
|
|
} else if (error instanceof VoiceLiveAuthenticationError) {
|
|
console.error("Auth error:", error.message);
|
|
} else if (error instanceof VoiceLiveProtocolError) {
|
|
console.error("Protocol error:", error.message);
|
|
}
|
|
},
|
|
|
|
onServerError: async (event, context) => {
|
|
console.error("Server error:", event.error?.message);
|
|
},
|
|
});
|
|
```
|
|
|
|
## Logging
|
|
|
|
```typescript
|
|
import { setLogLevel } from "@azure/logger";
|
|
|
|
// Enable info-level logging (available levels: "verbose", "info", "warning", "error")
|
|
setLogLevel("info");
|
|
|
|
// Or via environment variable
|
|
// AZURE_LOG_LEVEL=info
|
|
```
|
|
|
|
## Browser Usage
|
|
|
|
```typescript
|
|
// Browser usage requires a bundler (Vite, webpack, etc.)
|
|
import { VoiceLiveClient } from "@azure/ai-voicelive";
|
|
import { InteractiveBrowserCredential } from "@azure/identity";
|
|
|
|
// Use browser-compatible credential
|
|
const credential = new InteractiveBrowserCredential({
|
|
clientId: "your-client-id",
|
|
tenantId: "your-tenant-id",
|
|
});
|
|
|
|
const client = new VoiceLiveClient(endpoint, credential);
|
|
|
|
// Request microphone access
|
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
|
const audioContext = new AudioContext({ sampleRate: 24000 });
|
|
|
|
// Process audio and send to session
|
|
// ... (see samples for full implementation)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
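1. **Prefer Microsoft Entra ID credentials** — Use `DefaultAzureCredential` (or `InteractiveBrowserCredential` in browsers); if you must use an API key, load it from configuration such as `AZURE_VOICELIVE_API_KEY` and never hardcode it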
|
|
2. **Set both modalities** — Include `["text", "audio"]` for voice assistants
|
|
3. **Use Azure Semantic VAD** — Better turn detection than basic server VAD
|
|
4. **Handle all error types** — Connection, auth, and protocol errors
|
|
5. **Clean up subscriptions** — Call `subscription.close()` when done
|
|
6. **Use appropriate audio format** — PCM16 at 24kHz for best quality
|
|
|
|
## Reference Links
|
|
|
|
| Resource | URL |
|
|
|----------|-----|
|
|
| npm Package | https://www.npmjs.com/package/@azure/ai-voicelive |
|
|
| GitHub Source | https://github.com/Azure/azure-sdk-for-js/tree/main/sdk/ai/ai-voicelive |
|
|
| Samples | https://github.com/Azure/azure-sdk-for-js/tree/main/sdk/ai/ai-voicelive/samples |
|
|
| API Reference | https://learn.microsoft.com/javascript/api/@azure/ai-voicelive |
|