feat: add voice-ai-engine-development skill for building real-time conversational AI

2026-01-27 07:24:06 +02:00
parent e9783892c1
commit d972c4fa3a
9 changed files with 3360 additions and 0 deletions
--- a/skills/voice-ai-engine-development/examples/gemini_agent_example.py
+++ b/skills/voice-ai-engine-development/examples/gemini_agent_example.py
@@ -0,0 +1,239 @@
+"""
+Example: Gemini Agent Implementation with Streaming
+
+This example shows how to implement a Gemini-powered agent
+that properly buffers responses to prevent audio jumping.
+"""
+
+import asyncio
+from typing import AsyncGenerator, List, Dict
+from dataclasses import dataclass
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Message:
+    role: str  # "user" or "assistant"
+    content: str
+
+
+@dataclass
+class GeneratedResponse:
+    message: str
+    is_interruptible: bool = True
+
+
+class GeminiAgent:
+    """
+    LLM-powered conversational agent using Google Gemini
+    
+    Key Features:
+    - Maintains conversation history
+    - Streams responses from Gemini API
+    - Buffers entire response before yielding (prevents audio jumping)
+    - Handles interrupts gracefully
+    """
+    
+    def __init__(self, config: Dict):
+        self.config = config
+        self.conversation_history: List[Message] = []
+        self.system_prompt = config.get("prompt", "You are a helpful AI assistant.")
+        self.current_task = None
+    
+    async def generate_response(
+        self,
+        user_input: str,
+        is_interrupt: bool = False
+    ) -> AsyncGenerator[GeneratedResponse, None]:
+        """
+        Generate streaming response from Gemini
+        
+        IMPORTANT: This buffers the entire LLM response before yielding
+        to prevent audio jumping/cutting off.
+        
+        Args:
+            user_input: The user's message
+            is_interrupt: Whether this is an interrupt
+            
+        Yields:
+            GeneratedResponse with complete buffered message
+        """
+        
+        # Add user message to history
+        self.conversation_history.append(
+            Message(role="user", content=user_input)
+        )
+        
+        logger.info(f"🤖 [AGENT] Generating response for: '{user_input}'")
+        
+        # Build conversation context for Gemini
+        contents = self._build_gemini_contents()
+        
+        # Stream response from Gemini and buffer it
+        full_response = ""
+        
+        try:
+            # In a real implementation, this would call Gemini API
+            # async for chunk in self._create_gemini_stream(contents):
+            #     if isinstance(chunk, str):
+            #         full_response += chunk
+            
+            # For this example, simulate streaming
+            async for chunk in self._simulate_gemini_stream(user_input):
+                full_response += chunk
+                
+                # Log progress (optional)
+                if len(full_response) % 50 == 0:
+                    logger.debug(f"🤖 [AGENT] Buffered {len(full_response)} chars...")
+        
+        except Exception as e:
+            logger.error(f"❌ [AGENT] Error generating response: {e}")
+            full_response = "I apologize, but I encountered an error. Could you please try again?"
+        
+        # CRITICAL: Only yield after buffering the ENTIRE response
+        # This prevents multiple TTS calls that cause audio jumping
+        if full_response.strip():
+            # Add to conversation history
+            self.conversation_history.append(
+                Message(role="assistant", content=full_response)
+            )
+            
+            logger.info(f"✅ [AGENT] Generated complete response ({len(full_response)} chars)")
+            
+            yield GeneratedResponse(
+                message=full_response.strip(),
+                is_interruptible=True
+            )
+    
+    def _build_gemini_contents(self) -> List[Dict]:
+        """
+        Build conversation contents for Gemini API
+        
+        Format:
+        [
+            {"role": "user", "parts": [{"text": "System: ..."}]},
+            {"role": "model", "parts": [{"text": "Understood."}]},
+            {"role": "user", "parts": [{"text": "Hello"}]},
+            {"role": "model", "parts": [{"text": "Hi there!"}]},
+            ...
+        ]
+        """
+        contents = []
+        
+        # Add system prompt as first user message
+        if self.system_prompt:
+            contents.append({
+                "role": "user",
+                "parts": [{"text": f"System Instruction: {self.system_prompt}"}]
+            })
+            contents.append({
+                "role": "model",
+                "parts": [{"text": "Understood."}]
+            })
+        
+        # Add conversation history
+        for message in self.conversation_history:
+            role = "user" if message.role == "user" else "model"
+            contents.append({
+                "role": role,
+                "parts": [{"text": message.content}]
+            })
+        
+        return contents
+    
+    async def _simulate_gemini_stream(self, user_input: str) -> AsyncGenerator[str, None]:
+        """
+        Simulate Gemini streaming response
+        
+        In a real implementation, this would be:
+        
+        async def _create_gemini_stream(self, contents):
+            response = await genai.GenerativeModel('gemini-pro').generate_content_async(
+                contents,
+                stream=True
+            )
+            async for chunk in response:
+                if chunk.text:
+                    yield chunk.text
+        """
+        # Simulate response
+        response = f"I understand you said: {user_input}. How can I assist you further?"
+        
+        # Simulate streaming by yielding chunks
+        chunk_size = 10
+        for i in range(0, len(response), chunk_size):
+            chunk = response[i:i + chunk_size]
+            await asyncio.sleep(0.05)  # Simulate network delay
+            yield chunk
+    
+    def update_last_bot_message_on_cut_off(self, partial_message: str):
+        """
+        Update conversation history when bot is interrupted
+        
+        This ensures the conversation history reflects what was actually spoken,
+        not what was planned to be spoken.
+        
+        Args:
+            partial_message: The partial message that was actually spoken
+        """
+        if self.conversation_history and self.conversation_history[-1].role == "assistant":
+            # Update the last bot message with the partial message
+            self.conversation_history[-1].content = partial_message
+            logger.info(f"📝 [AGENT] Updated history with partial message: '{partial_message}'")
+    
+    def cancel_current_task(self):
+        """Cancel the current generation task (for interrupts)"""
+        if self.current_task and not self.current_task.done():
+            self.current_task.cancel()
+            logger.info("🛑 [AGENT] Cancelled current generation task")
+    
+    def get_conversation_history(self) -> List[Message]:
+        """Get the full conversation history"""
+        return self.conversation_history.copy()
+    
+    def clear_conversation_history(self):
+        """Clear the conversation history"""
+        self.conversation_history.clear()
+        logger.info("🗑️ [AGENT] Cleared conversation history")
+
+
+# ============================================================================
+# Example Usage
+# ============================================================================
+
+async def example_usage():
+    """Example of how to use the GeminiAgent"""
+    
+    # Configure agent
+    config = {
+        "prompt": "You are a helpful AI assistant specializing in voice conversations.",
+        "llmProvider": "gemini"
+    }
+    
+    # Create agent
+    agent = GeminiAgent(config)
+    
+    # Simulate conversation
+    user_messages = [
+        "Hello, how are you?",
+        "What's the weather like today?",
+        "Thank you!"
+    ]
+    
+    for user_message in user_messages:
+        print(f"\n👤 User: {user_message}")
+        
+        # Generate response
+        async for response in agent.generate_response(user_message):
+            print(f"🤖 Bot: {response.message}")
+    
+    # Print conversation history
+    print("\n📜 Conversation History:")
+    for i, message in enumerate(agent.get_conversation_history(), 1):
+        print(f"{i}. {message.role}: {message.content}")
+
+
+if __name__ == "__main__":
+    asyncio.run(example_usage())