3:I[4126,[],""] 4:I[9630,[],""] 5:I[4278,["9856","static/chunks/9856-3b185291364d9bef.js","8172","static/chunks/8172-b3a2d6fe4ae10d40.js","3185","static/chunks/app/layout-2814fa5d15b84fe4.js"],"HeadingProvider"] 6:I[1476,["9856","static/chunks/9856-3b185291364d9bef.js","8172","static/chunks/8172-b3a2d6fe4ae10d40.js","3185","static/chunks/app/layout-2814fa5d15b84fe4.js"],"Header"] 7:I[3167,["9856","static/chunks/9856-3b185291364d9bef.js","8172","static/chunks/8172-b3a2d6fe4ae10d40.js","3185","static/chunks/app/layout-2814fa5d15b84fe4.js"],"Sidebar"] 8:I[7409,["9856","static/chunks/9856-3b185291364d9bef.js","8172","static/chunks/8172-b3a2d6fe4ae10d40.js","3185","static/chunks/app/layout-2814fa5d15b84fe4.js"],"PageFrame"] 0:["X7oMT3VrOffzp0qvbeOas",[[["",{"children":["voice",{"children":["architecture",{"children":["__PAGE__",{}]}]}]},"$undefined","$undefined",true],["",{"children":["voice",{"children":["architecture",{"children":["__PAGE__",{},[["$L1","$L2",null],null],null]},[null,["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children","voice","children","architecture","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L4",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[null,["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children","voice","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L4",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/7f586cdbbaa33ff7.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","className":"h-full","children":["$","body",null,{"className":"__className_f367f3 h-full bg-white dark:bg-gray-900","children":[["$","a",null,{"href":"#main-content","className":"skip-to-content","children":"Skip to main content"}],["$","$L5",null,{"children":[["$","$L6",null,{}],["$","$L7",null,{}],["$","main",null,{"id":"main-content","className":"lg:pl-64","role":"main","aria-label":"Documentation content","children":["$","$L8",null,{"children":["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L4",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]]}]]}]}]],null],null],["$L9",null]]]] 9:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Voice Mode Architecture | VoiceAssist Docs"}],["$","meta","3",{"name":"description","content":"Comprehensive documentation of VoiceAssist Voice Mode implementation, covering the end-to-end pipeline, STT/LLM/TTS stack, streaming behavior, multilingual support, and medical intelligence."}],["$","meta","4",{"name":"keywords","content":"VoiceAssist,documentation,medical AI,voice assistant,healthcare,HIPAA,API"}],["$","meta","5",{"name":"robots","content":"index, follow"}],["$","meta","6",{"name":"googlebot","content":"index, follow"}],["$","link","7",{"rel":"canonical","href":"https://assistdocs.asimo.io"}],["$","meta","8",{"property":"og:title","content":"VoiceAssist Documentation"}],["$","meta","9",{"property":"og:description","content":"Comprehensive documentation for VoiceAssist - Enterprise Medical AI Assistant"}],["$","meta","10",{"property":"og:url","content":"https://assistdocs.asimo.io"}],["$","meta","11",{"property":"og:site_name","content":"VoiceAssist Docs"}],["$","meta","12",{"property":"og:type","content":"website"}],["$","meta","13",{"name":"twitter:card","content":"summary"}],["$","meta","14",{"name":"twitter:title","content":"VoiceAssist Documentation"}],["$","meta","15",{"name":"twitter:description","content":"Comprehensive documentation for VoiceAssist - Enterprise Medical AI Assistant"}],["$","meta","16",{"name":"next-size-adjust"}]] 1:null a:I[3590,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"Mermaid"] b:I[4206,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"Tabs"] c:I[4206,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"TabList"] d:I[4206,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"Tab"] e:I[4206,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"TabPanels"] f:I[4206,["345","static/chunks/5a5db59f-927469db231ddb1e.js","9940","static/chunks/9940-4c47bdec68cc60d5.js","3723","static/chunks/app/voice/architecture/page-09b3f977aa0e2e9a.js"],"TabPanel"] 2:["$","div",null,{"children":[["$","div",null,{"className":"mb-8","children":[["$","h1",null,{"className":"text-3xl font-bold text-gray-900 dark:text-white mb-4","children":"Voice Mode Architecture"}],["$","p",null,{"className":"text-lg text-gray-600 dark:text-gray-400","children":"Comprehensive technical reference for the VoiceAssist Voice Mode implementation, covering the Thinker/Talker pipeline, providers, and latency characteristics."}]]}],["$","div",null,{"className":"bg-slate-50 dark:bg-slate-800 rounded-lg p-6 mb-8","children":[["$","h3",null,{"className":"font-semibold text-slate-800 dark:text-slate-200 mb-4","children":"Voice Mode Stack"}],["$","div",null,{"className":"grid grid-cols-1 md:grid-cols-4 gap-4","children":[["$","div",null,{"children":[["$","h4",null,{"className":"font-medium text-slate-700 dark:text-slate-300 mb-2","children":"STT Provider"}],["$","ul",null,{"className":"text-sm text-slate-600 dark:text-slate-400 space-y-1","children":[["$","li",null,{"children":"• Deepgram (Primary)"}],["$","li",null,{"children":"• Whisper (Fallback)"}],["$","li",null,{"children":"• 100-150ms latency"}]]}]]}],["$","div",null,{"children":[["$","h4",null,{"className":"font-medium text-slate-700 dark:text-slate-300 mb-2","children":"LLM Layer"}],["$","ul",null,{"className":"text-sm text-slate-600 dark:text-slate-400 space-y-1","children":[["$","li",null,{"children":"• GPT-4o (Cloud)"}],["$","li",null,{"children":"• Llama (Local/PHI)"}],["$","li",null,{"children":"• Streaming tokens"}]]}]]}],["$","div",null,{"children":[["$","h4",null,{"className":"font-medium text-slate-700 dark:text-slate-300 mb-2","children":"TTS Provider"}],["$","ul",null,{"className":"text-sm text-slate-600 dark:text-slate-400 space-y-1","children":[["$","li",null,{"children":"• ElevenLabs (Primary)"}],["$","li",null,{"children":"• OpenAI TTS (Fallback)"}],["$","li",null,{"children":"• 28+ languages"}]]}]]}],["$","div",null,{"children":[["$","h4",null,{"className":"font-medium text-slate-700 dark:text-slate-300 mb-2","children":"Latency Target"}],["$","ul",null,{"className":"text-sm text-slate-600 dark:text-slate-400 space-y-1","children":[["$","li",null,{"children":"• <500ms end-to-end"}],["$","li",null,{"children":"• Streaming at all stages"}],["$","li",null,{"children":"• Barge-in support"}]]}]]}]]}]]}],["$","div",null,{"className":"space-y-12","children":[["$","div",null,{"className":"border-t border-gray-200 dark:border-gray-800 pt-8","children":["$","article",null,{"className":"prose prose-slate max-w-none dark:prose-invert","children":[["$","span",null,{"className":"inline-flex items-center rounded-full px-3 py-1 text-sm font-medium bg-blue-100 text-blue-800 dark:bg-blue-900/50 dark:text-blue-200","children":"Voice Mode v2.0 - Thinker/Talker Pipeline"}],"\n",["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-blue-200 bg-blue-50 text-blue-900 dark:border-blue-900/40 dark:bg-blue-900/20 dark:text-blue-100","children":[["$","div",null,{"className":"font-semibold mb-2","children":"Document Purpose"}],["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"This document provides a comprehensive technical reference for the VoiceAssist Voice Mode implementation. It covers the current architecture, identifies known limitations, and serves as the authoritative source for understanding how voice interactions work in the system."}]}]]}],"\n",["$","h2",null,{"id":"overview","children":["$","a",null,{"href":"#overview","children":"Overview"}]}],"\n",["$","p",null,{"children":["VoiceAssist implements a sophisticated voice-first interface for healthcare professionals, enabling natural spoken interactions with an AI medical assistant. The system uses a ",["$","strong",null,{"children":"Thinker/Talker pipeline architecture"}]," that decouples speech recognition, language model reasoning, and speech synthesis for maximum flexibility and low latency."]}],"\n",["$","h3",null,{"id":"high-level-architecture","children":["$","a",null,{"href":"#high-level-architecture","children":"High-Level Architecture"}]}],"\n",["$","$La",null,{"chart":"flowchart LR\n subgraph Client[\"Frontend (React)\"]\n MIC[Microphone] --> VAD[Voice Activity Detection]\n VAD --> WS[WebSocket Client]\n WS --> PLAYER[Audio Player]\n end\n\n subgraph Gateway[\"API Gateway (FastAPI)\"]\n WSH[WebSocket Handler] --> STT[STT Service]\n STT --> THINKER[Thinker Service]\n THINKER --> TALKER[Talker Service]\n TALKER --> WSH\n end\n\n subgraph External[\"External APIs\"]\n DG[Deepgram STT]\n OAI[OpenAI GPT-4o]\n EL[ElevenLabs TTS]\n LLAMA[Local Llama]\n end\n\n WS <--> WSH\n STT --> DG\n THINKER --> OAI\n THINKER -.->|PHI queries| LLAMA\n TALKER --> EL"}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"current-implementation-of-voice-mode","children":["$","a",null,{"href":"#current-implementation-of-voice-mode","children":"Current Implementation of Voice Mode"}]}],"\n",["$","h3",null,{"id":"end-to-end-pipeline","children":["$","a",null,{"href":"#end-to-end-pipeline","children":"End-to-End Pipeline"}]}],"\n",["$","p",null,{"children":"The voice interaction follows this sequence:"}],"\n",["$","$La",null,{"chart":"sequenceDiagram\n autonumber\n participant User\n participant Frontend\n participant Gateway\n participant Deepgram\n participant Thinker\n participant ElevenLabs\n\n User->>Frontend: Speaks into microphone\n Frontend->>Frontend: MediaRecorder captures 16kHz PCM\n Frontend->>Gateway: WebSocket audio chunks\n Gateway->>Deepgram: Stream audio (WebSocket)\n Deepgram-->>Gateway: Interim transcripts\n Gateway-->>Frontend: transcript.interim\n Deepgram-->>Gateway: Final transcript + VAD events\n Gateway->>Thinker: Process query (GPT-4o/Llama)\n Thinker-->>Gateway: Streaming text response\n Gateway->>ElevenLabs: Text chunks for synthesis\n ElevenLabs-->>Gateway: Streaming audio (24kHz PCM)\n Gateway-->>Frontend: audio.chunk events\n Frontend->>User: Play audio response"}],"\n",["$","h3",null,{"id":"audio-capture","children":["$","a",null,{"href":"#audio-capture","children":"Audio Capture"}]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"Frontend Implementation"}],["$","$Ld",null,{"index":1,"children":"Audio Format"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":"Audio is captured using the Web Audio API and MediaRecorder:"}],["$","ul",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"Hook:"}]," ",["$","code",null,{"children":"useThinkerTalkerSession.ts"}]," manages the voice session"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Component:"}]," ",["$","code",null,{"children":"ThinkerTalkerVoicePanel.tsx"}]," provides the UI"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Capture:"}]," MediaRecorder API with ",["$","code",null,{"children":"audio/webm;codecs=opus"}]," encoding"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Sample Rate:"}]," 16kHz mono (resampled for Deepgram)"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Chunk Size:"}]," 250ms intervals for streaming"]}],"\n"]}]]}],["$","$Lf",null,{"index":1,"children":["$","pre",null,{"children":["$","code",null,{"children":"Format: PCM 16-bit mono\nSample Rate: 16000 Hz (input) / 24000 Hz (output)\nEncoding: Linear16 for STT, PCM for TTS playback\nChunk Duration: 250ms (configurable)\n"}]}]}]]}]]}],"\n",["$","h3",null,{"id":"speech-to-text-stt-providers","children":["$","a",null,{"href":"#speech-to-text-stt-providers","children":"Speech-to-Text (STT) Providers"}]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"Deepgram (Primary)"}],["$","$Ld",null,{"index":1,"children":"Whisper (Fallback)"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"Deepgram"}]," is the primary STT provider, chosen for its low-latency streaming capabilities."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"WebSocket streaming"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Latency"}]}],["$","td",null,{"children":"100-150ms to first transcript"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Features"}]}],["$","td",null,{"children":"Interim results, VAD events, punctuation, diarization"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Languages"}]}],["$","td",null,{"children":"English (primary), multilingual support"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Config Key"}]}],["$","td",null,{"children":["$","code",null,{"children":"DEEPGRAM_API_KEY"}]}]]}]]}]]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-emerald-200 bg-emerald-50 text-emerald-900 dark:border-emerald-900/40 dark:bg-emerald-900/20 dark:text-emerald-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"Deepgram provides real-time VAD (Voice Activity Detection) events, enabling accurate end-of-utterance detection without client-side inference."}]}]]}]]}],["$","$Lf",null,{"index":1,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"OpenAI Whisper"}]," serves as a fallback when Deepgram is unavailable."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"Batch (non-streaming)"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Latency"}]}],["$","td",null,{"children":"500-1500ms (depends on audio length)"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Features"}]}],["$","td",null,{"children":"High accuracy, language detection"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Config Key"}]}],["$","td",null,{"children":["$","code",null,{"children":"OPENAI_API_KEY"}]}]]}]]}]]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-900/40 dark:bg-amber-900/20 dark:text-amber-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"Whisper operates in batch mode, introducing higher latency. It should only be used when streaming STT is not available."}]}]]}]]}]]}]]}],"\n",["$","h3",null,{"id":"llm--assistant-layer","children":["$","a",null,{"href":"#llm--assistant-layer","children":"LLM / Assistant Layer"}]}],"\n",["$","p",null,{"children":["The ",["$","strong",null,{"children":"Thinker Service"}]," (",["$","code",null,{"children":"thinker_service.py"}],") handles language model reasoning with intelligent routing:"]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"Cloud (GPT-4o)"}],["$","$Ld",null,{"index":1,"children":"Local (Llama)"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"OpenAI GPT-4o"}]," is the primary LLM for general queries."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Model"}]}],["$","td",null,{"children":["$","code",null,{"children":"gpt-4o"}]}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"Streaming"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Latency"}]}],["$","td",null,{"children":"200-500ms to first token"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Features"}]}],["$","td",null,{"children":"Tool calling, RAG integration, citations"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Use Case"}]}],["$","td",null,{"children":"General medical queries, clinical decision support"}]]}]]}]]}],["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"python","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"python","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# Query classification determines urgency"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"URGENT"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" → prioritized, faster response"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"SIMPLE"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" → direct answer, minimal context"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"COMPLEX"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" → multi"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"-"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"hop reasoning, "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"RAG"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" retrieval"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"CLARIFICATION"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" → follow"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"-"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"up questions"}]]}]]}]}]}]]}],["$","$Lf",null,{"index":1,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"Local Llama"}]," handles PHI-sensitive queries to avoid sending protected health information to cloud services."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Model"}]}],["$","td",null,{"children":"Llama 3.1 (8B or 70B)"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"Local inference"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Latency"}]}],["$","td",null,{"children":"300-800ms (hardware dependent)"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Use Case"}]}],["$","td",null,{"children":"PHI-present queries, patient-specific data"}]]}]]}]]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-blue-200 bg-blue-50 text-blue-900 dark:border-blue-900/40 dark:bg-blue-900/20 dark:text-blue-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"PHI detection uses Presidio and custom regex patterns. When PHI is detected, queries are automatically routed to the local model."}]}]]}]]}]]}]]}],"\n",["$","h3",null,{"id":"text-to-speech-tts-providers","children":["$","a",null,{"href":"#text-to-speech-tts-providers","children":"Text-to-Speech (TTS) Providers"}]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"ElevenLabs (Primary)"}],["$","$Ld",null,{"index":1,"children":"OpenAI TTS (Fallback)"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"ElevenLabs"}]," provides premium neural TTS with emotional expressiveness."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Models"}]}],["$","td",null,{"children":[["$","code",null,{"children":"eleven_multilingual_v2"}],", ",["$","code",null,{"children":"eleven_turbo_v2_5"}]]}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"HTTP streaming"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Latency"}]}],["$","td",null,{"children":"50-100ms TTFA (time to first audio)"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Languages"}]}],["$","td",null,{"children":"28+ languages"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Voices"}]}],["$","td",null,{"children":"Custom voice IDs, professional cloning"}]]}]]}]]}],["$","p",null,{"children":["$","strong",null,{"children":"Voice Parameters:"}]}],["$","ul",null,{"children":["\n",["$","li",null,{"children":"Stability: 0.0-1.0 (consistency vs. expressiveness)"}],"\n",["$","li",null,{"children":"Clarity: 0.0-1.0 (pronunciation precision)"}],"\n",["$","li",null,{"children":"Style: 0.0-1.0 (emotional intensity)"}],"\n"]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-emerald-200 bg-emerald-50 text-emerald-900 dark:border-emerald-900/40 dark:bg-emerald-900/20 dark:text-emerald-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"ElevenLabs supports SSML tags for prosody control (emphasis, pauses, rate), enabling natural-sounding medical terminology pronunciation."}]}]]}]]}],["$","$Lf",null,{"index":1,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"OpenAI TTS"}]," serves as a fallback with standard voices."]}],["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Property"}],["$","th",null,{"children":"Value"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Voices"}]}],["$","td",null,{"children":"alloy, echo, fable, onyx, nova, shimmer"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mode"}]}],["$","td",null,{"children":"Streaming"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Quality"}]}],["$","td",null,{"children":"HD audio output"}]]}]]}]]}],["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"typescript","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"typescript","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Voice selection in settings"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"const"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" voices"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" ="}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" ["}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'alloy'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'echo'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'fable'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'onyx'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'nova'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'shimmer'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"];"}]]}]]}]}]}]]}]]}]]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"streaming-and-latency-behavior","children":["$","a",null,{"href":"#streaming-and-latency-behavior","children":"Streaming and Latency Behavior"}]}],"\n",["$","h3",null,{"id":"streaming-architecture","children":["$","a",null,{"href":"#streaming-architecture","children":"Streaming Architecture"}]}],"\n",["$","p",null,{"children":"All pipeline components support streaming to minimize perceived latency:"}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Component"}],["$","th",null,{"children":"Streaming Mode"}],["$","th",null,{"children":"Chunk Size"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":"STT (Deepgram)"}],["$","td",null,{"children":"WebSocket bidirectional"}],["$","td",null,{"children":"Continuous"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"LLM (GPT-4o)"}],["$","td",null,{"children":"Server-sent events"}],["$","td",null,{"children":"Token-by-token"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"TTS (ElevenLabs)"}],["$","td",null,{"children":"HTTP chunked"}],["$","td",null,{"children":"256 samples (24kHz)"}]]}]]}]]}],"\n",["$","h3",null,{"id":"latency-targets","children":["$","a",null,{"href":"#latency-targets","children":"Latency Targets"}]}],"\n",["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-blue-200 bg-blue-50 text-blue-900 dark:border-blue-900/40 dark:bg-blue-900/20 dark:text-blue-100","children":[["$","div",null,{"className":"font-semibold mb-2","children":"Performance Goals"}],["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"VoiceAssist targets sub-500ms end-to-end latency for optimal conversational UX."}]}]]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Stage"}],["$","th",null,{"children":"Target Latency"}],["$","th",null,{"children":"Actual (P95)"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":"Audio capture → STT"}],["$","td",null,{"children":"100-150ms"}],["$","td",null,{"children":"~120ms"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"STT → LLM first token"}],["$","td",null,{"children":"200-300ms"}],["$","td",null,{"children":"~250ms"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"LLM → TTS first audio"}],["$","td",null,{"children":"50-100ms"}],["$","td",null,{"children":"~80ms"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Total (speech-to-audio)"}]}],["$","td",null,{"children":["$","strong",null,{"children":"under 500ms"}]}],["$","td",null,{"children":["$","strong",null,{"children":"~450ms"}]}]]}]]}]]}],"\n",["$","h3",null,{"id":"voice-quality-presets","children":["$","a",null,{"href":"#voice-quality-presets","children":"Voice Quality Presets"}]}],"\n",["$","p",null,{"children":"Users can select latency vs. quality trade-offs:"}],"\n",["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"typescript","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"typescript","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// voiceSettingsStore.ts"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"type"}],["$","span",null,{"style":{"--shiki-light":"#6F42C1","--shiki-dark":"#B392F0"},"children":" VoiceQualityPreset"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" ="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'speed'"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" |"}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'balanced'"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" |"}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'natural'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":";"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"const"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" presets"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" ="}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" {"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" speed: { ttfa: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'100-150ms'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", description: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'Fastest response'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" },"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" balanced: { ttfa: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'200-250ms'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", description: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'Recommended default'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" },"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" natural: { ttfa: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'300-400ms'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", description: "}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"'Most natural prosody'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" }"}]]}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"};"}]}]]}]}]}],"\n",["$","h3",null,{"id":"vad-and-end-of-utterance-detection","children":["$","a",null,{"href":"#vad-and-end-of-utterance-detection","children":"VAD and End-of-Utterance Detection"}]}],"\n",["$","p",null,{"children":"The system determines when the user has finished speaking using:"}],"\n",["$","ol",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"Deepgram VAD Events:"}]," Server-side voice activity detection"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Silence Threshold:"}]," 800ms of silence triggers end-of-utterance"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"VAD Sensitivity:"}]," 200ms minimum speech duration to avoid false triggers"]}],"\n"]}],"\n",["$","$La",null,{"chart":"stateDiagram-v2\n [*] --> Listening\n Listening --> SpeechDetected: VAD speech_start\n SpeechDetected --> Processing: VAD speech_end (800ms silence)\n SpeechDetected --> SpeechDetected: Continued speech\n Processing --> Responding: LLM response ready\n Responding --> Listening: Audio playback complete\n Responding --> Interrupted: User barge-in\n Interrupted --> Processing: New utterance"}],"\n",["$","h3",null,{"id":"barge-in-support","children":["$","a",null,{"href":"#barge-in-support","children":"Barge-In Support"}]}],"\n",["$","p",null,{"children":"Users can interrupt the AI's response mid-playback:"}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"Detection:"}]," ",["$","code",null,{"children":"barge_in_classifier.py"}]," monitors for new speech during playback"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Action:"}]," Current audio playback stops, new utterance is processed"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"UI:"}]," ",["$","code",null,{"children":"VoiceBargeInIndicator.tsx"}]," provides visual feedback"]}],"\n"]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"multilingual-and-pronunciation-behavior","children":["$","a",null,{"href":"#multilingual-and-pronunciation-behavior","children":"Multilingual and Pronunciation Behavior"}]}],"\n",["$","h3",null,{"id":"supported-languages","children":["$","a",null,{"href":"#supported-languages","children":"Supported Languages"}]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"STT Languages"}],["$","$Ld",null,{"index":1,"children":"TTS Languages"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"Deepgram STT"}]," supports multiple languages, but the system is primarily configured for:"]}],["$","ul",null,{"children":["\n",["$","li",null,{"children":"English (US) - Primary"}],"\n",["$","li",null,{"children":"Spanish"}],"\n",["$","li",null,{"children":"French"}],"\n",["$","li",null,{"children":"German"}],"\n",["$","li",null,{"children":"Italian"}],"\n",["$","li",null,{"children":"Portuguese"}],"\n"]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-900/40 dark:bg-amber-900/20 dark:text-amber-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":["Automatic language detection is ",["$","strong",null,{"children":"not currently implemented"}]," in STT. The language must be pre-configured or selected by the user."]}]}]]}]]}],["$","$Lf",null,{"index":1,"children":[["$","p",null,{"children":[["$","strong",null,{"children":"ElevenLabs"}]," ",["$","code",null,{"children":"eleven_multilingual_v2"}]," supports 28+ languages with native pronunciation."]}],["$","p",null,{"children":"High-quality support includes:"}],["$","ul",null,{"children":["\n",["$","li",null,{"children":"English (multiple accents)"}],"\n",["$","li",null,{"children":"Spanish (Castilian, Latin American)"}],"\n",["$","li",null,{"children":"French"}],"\n",["$","li",null,{"children":"German"}],"\n",["$","li",null,{"children":"Italian"}],"\n",["$","li",null,{"children":"Portuguese (Brazilian, European)"}],"\n",["$","li",null,{"children":"Arabic"}],"\n",["$","li",null,{"children":"Hindi"}],"\n",["$","li",null,{"children":"Japanese"}],"\n",["$","li",null,{"children":"Korean"}],"\n",["$","li",null,{"children":"Mandarin Chinese"}],"\n"]}]]}]]}]]}],"\n",["$","h3",null,{"id":"mixed-language-support","children":["$","a",null,{"href":"#mixed-language-support","children":"Mixed-Language Support"}]}],"\n",["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-900/40 dark:bg-amber-900/20 dark:text-amber-100","children":[["$","div",null,{"className":"font-semibold mb-2","children":"Current Limitation"}],["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":["Mixed-language utterances (e.g., English with Arabic terms) are ",["$","strong",null,{"children":"not fully supported"}],". The STT provider may fail to accurately transcribe code-switched speech."]}]}]]}],"\n",["$","p",null,{"children":["$","strong",null,{"children":"Workarounds:"}]}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":"Configure STT for the dominant language"}],"\n",["$","li",null,{"children":"Use medical terminology in the configured language"}],"\n",["$","li",null,{"children":"Rely on TTS's multilingual model for pronunciation"}],"\n"]}],"\n",["$","h3",null,{"id":"pronunciation-handling","children":["$","a",null,{"href":"#pronunciation-handling","children":"Pronunciation Handling"}]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Feature"}],["$","th",null,{"children":"Status"}],["$","th",null,{"children":"Notes"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":"Custom lexicons"}],["$","td",null,{"children":"Not implemented"}],["$","td",null,{"children":"No phoneme dictionaries"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"Medical terminology"}],["$","td",null,{"children":"Partial"}],["$","td",null,{"children":"ElevenLabs handles common terms"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"SSML pronunciation"}],["$","td",null,{"children":"Supported"}],["$","td",null,{"children":["Via ",["$","code",null,{"children":"ssml_processor.py"}]]}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"Per-language tuning"}],["$","td",null,{"children":"Not implemented"}],["$","td",null,{"children":"Single-language configuration"}]]}]]}]]}],"\n",["$","p",null,{"children":["$","strong",null,{"children":"Known Issues:"}]}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":"Uncommon drug names may be mispronounced"}],"\n",["$","li",null,{"children":"Eponyms (e.g., \"Parkinson's\", \"Alzheimer's\") generally work well"}],"\n",["$","li",null,{"children":"Abbreviations (e.g., \"mg\", \"mL\") require SSML hints"}],"\n"]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"architecture-and-module-integration","children":["$","a",null,{"href":"#architecture-and-module-integration","children":"Architecture and Module Integration"}]}],"\n",["$","h3",null,{"id":"backend-service-structure","children":["$","a",null,{"href":"#backend-service-structure","children":"Backend Service Structure"}]}],"\n",["$","p",null,{"children":["The voice pipeline is implemented across multiple services in ",["$","code",null,{"children":"services/api-gateway/app/services/"}],":"]}],"\n",["$","pre",null,{"children":["$","code",null,{"children":"services/\n├── voice_pipeline_service.py # Main orchestrator\n├── streaming_stt_service.py # Deepgram/Whisper STT\n├── thinker_service.py # LLM reasoning\n├── talker_service.py # TTS orchestration\n├── voice_websocket_handler.py # WebSocket management\n├── thinker_talker_websocket_handler.py # T/T protocol\n├── voice_activity_detector.py # VAD logic\n├── barge_in_classifier.py # Interrupt detection\n├── elevenlabs_service.py # ElevenLabs client\n├── openai_tts_service.py # OpenAI TTS client\n├── ssml_processor.py # SSML generation\n├── emotion_detection_service.py # User emotion analysis\n├── prosody_analysis_service.py # Speech prosody\n├── backchannel_service.py # Conversational cues\n└── dictation_service.py # Medical dictation\n"}]}],"\n",["$","h3",null,{"id":"frontend-hook-structure","children":["$","a",null,{"href":"#frontend-hook-structure","children":"Frontend Hook Structure"}]}],"\n",["$","p",null,{"children":["Voice features are exposed via React hooks in ",["$","code",null,{"children":"apps/web-app/src/hooks/"}],":"]}],"\n",["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"typescript","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"typescript","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Primary hooks (current production)"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useThinkerTalkerSession.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Session management"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useThinkerTalkerVoiceMode.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Combined session + playback"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useTTAudioPlayback.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Audio streaming playback"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Supporting hooks"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useVoiceMetrics.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Latency tracking"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useVoiceModeStateMachine.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// State management"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useStreamingAudio.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Audio stream handling"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useBackchannelAudio.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// AI conversational cues"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useVoicePreferencesSync.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Settings persistence"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Legacy (deprecated)"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"useRealtimeVoiceSession.ts "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// OpenAI Realtime API (deprecated)"}]]}]]}]}]}],"\n",["$","h3",null,{"id":"pipeline-modes","children":["$","a",null,{"href":"#pipeline-modes","children":"Pipeline Modes"}]}],"\n",["$","p",null,{"children":"The voice pipeline supports multiple operating modes:"}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Mode"}],["$","th",null,{"children":"Description"}],["$","th",null,{"children":"Use Case"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","code",null,{"children":"CONVERSATION"}]}],["$","td",null,{"children":"Full Thinker/Talker pipeline"}],["$","td",null,{"children":"Normal voice chat"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","code",null,{"children":"DICTATION"}]}],["$","td",null,{"children":"Speech-to-text with formatting"}],["$","td",null,{"children":"Medical note dictation"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","code",null,{"children":"COMMAND"}]}],["$","td",null,{"children":"Voice command processing"}],["$","td",null,{"children":"Quick actions"}]]}]]}]]}],"\n",["$","h3",null,{"id":"error-handling-and-retries","children":["$","a",null,{"href":"#error-handling-and-retries","children":"Error Handling and Retries"}]}],"\n",["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"typescript","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"typescript","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Circuit breaker pattern for external APIs"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"const"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" circuitBreaker"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" ="}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" {"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" failureThreshold: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"5"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":","}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" recoveryTimeout: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"30000"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// 30 seconds"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" halfOpenRequests: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"3"}]]}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"};"}]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// Retry strategy"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"const"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" retryPolicy"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" ="}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" {"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" maxRetries: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"3"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":","}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" baseDelay: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"1000"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":","}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" maxDelay: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"10000"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":","}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" backoffMultiplier: "}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"2"}]]}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"};"}]}]]}]}]}],"\n",["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-blue-200 bg-blue-50 text-blue-900 dark:border-blue-900/40 dark:bg-blue-900/20 dark:text-blue-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"When ElevenLabs fails, the system automatically falls back to OpenAI TTS. When Deepgram fails, batch Whisper transcription is used."}]}]]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"medical-intelligence-and-data-sources","children":["$","a",null,{"href":"#medical-intelligence-and-data-sources","children":"Medical Intelligence and Data Sources"}]}],"\n",["$","h3",null,{"id":"currently-integrated-sources","children":["$","a",null,{"href":"#currently-integrated-sources","children":"Currently Integrated Sources"}]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Source"}],["$","th",null,{"children":"Type"}],["$","th",null,{"children":"Integration"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"PubMed (NCBI)"}]}],["$","td",null,{"children":"Research articles"}],["$","td",null,{"children":"E-utilities API"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"OpenEvidence"}]}],["$","td",null,{"children":"Clinical evidence"}],["$","td",null,{"children":"REST API"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Medical Guidelines"}]}],["$","td",null,{"children":"Curated guidelines"}],["$","td",null,{"children":"Local vector DB"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Epic FHIR"}]}],["$","td",null,{"children":"EHR data"}],["$","td",null,{"children":"FHIR R4 API"}]]}]]}]]}],"\n",["$","h3",null,{"id":"rag-architecture","children":["$","a",null,{"href":"#rag-architecture","children":"RAG Architecture"}]}],"\n",["$","p",null,{"children":"The system uses Retrieval-Augmented Generation for evidence-based responses:"}],"\n",["$","$La",null,{"chart":"flowchart TB\n QUERY[User Query] --> CLASSIFY[Query Classifier]\n CLASSIFY --> EXPAND[Query Expansion]\n EXPAND --> HYBRID[Hybrid Search]\n\n subgraph Search[\"Search Layer\"]\n HYBRID --> BM25[BM25 Keyword]\n HYBRID --> VECTOR[Vector Similarity]\n BM25 --> RRF[Reciprocal Rank Fusion]\n VECTOR --> RRF\n end\n\n RRF --> RERANK[Re-ranker]\n RERANK --> CONTEXT[Context Assembly]\n CONTEXT --> LLM[LLM Generation]\n LLM --> CITATIONS[Citation Formatting]"}],"\n",["$","h3",null,{"id":"medical-embedding-models","children":["$","a",null,{"href":"#medical-embedding-models","children":"Medical Embedding Models"}]}],"\n",["$","p",null,{"children":"Multiple embedding models are available for semantic search:"}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Model"}],["$","th",null,{"children":"Dimensions"}],["$","th",null,{"children":"Best For"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":"OpenAI text-embedding-3-large"}],["$","td",null,{"children":"3072"}],["$","td",null,{"children":"General queries"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"PubMedBERT"}],["$","td",null,{"children":"768"}],["$","td",null,{"children":"Research literature"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"BioBERT"}],["$","td",null,{"children":"768"}],["$","td",null,{"children":"Biomedical text"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":"MedCPT"}],["$","td",null,{"children":"768"}],["$","td",null,{"children":"Clinical queries"}]]}]]}]]}],"\n",["$","h3",null,{"id":"fhir-integration","children":["$","a",null,{"href":"#fhir-integration","children":"FHIR Integration"}]}],"\n",["$","$Lb",null,{"children":[["$","$Lc",null,{"children":[["$","$Ld",null,{"index":0,"children":"Read Operations"}],["$","$Ld",null,{"index":1,"children":"Write Operations"}]]}],["$","$Le",null,{"children":[["$","$Lf",null,{"index":0,"children":[["$","p",null,{"children":["$","strong",null,{"children":"Fully Implemented:"}]}],["$","ul",null,{"children":["\n",["$","li",null,{"children":"Patient demographics"}],"\n",["$","li",null,{"children":"MedicationRequest (active/historical)"}],"\n",["$","li",null,{"children":"Condition (diagnoses, ICD-10)"}],"\n",["$","li",null,{"children":"Observation (labs, vitals, LOINC)"}],"\n",["$","li",null,{"children":"AllergyIntolerance"}],"\n",["$","li",null,{"children":"Procedure (CPT codes)"}],"\n",["$","li",null,{"children":"Encounter history"}],"\n"]}]]}],["$","$Lf",null,{"index":1,"children":[["$","p",null,{"children":["$","strong",null,{"children":"Partially Implemented:"}]}],["$","ul",null,{"children":["\n",["$","li",null,{"children":"MedicationRequest (prescribing)"}],"\n",["$","li",null,{"children":"ServiceRequest (lab orders)"}],"\n",["$","li",null,{"children":"DocumentReference (notes)"}],"\n"]}],["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-900/40 dark:bg-amber-900/20 dark:text-amber-100","children":["$undefined",["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":"Write operations require Epic adapter configuration and are not fully functional in all deployments."}]}]]}]]}]]}]]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"known-gaps-and-todos","children":["$","a",null,{"href":"#known-gaps-and-todos","children":"Known Gaps and TODOs"}]}],"\n",["$","h3",null,{"id":"voice-pipeline-gaps","children":["$","a",null,{"href":"#voice-pipeline-gaps","children":"Voice Pipeline Gaps"}]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Gap"}],["$","th",null,{"children":"Description"}],["$","th",null,{"children":"Priority"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Language Detection"}]}],["$","td",null,{"children":"No automatic STT language detection"}],["$","td",null,{"children":"High"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Mixed Language"}]}],["$","td",null,{"children":"Code-switched speech not supported"}],["$","td",null,{"children":"Medium"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Custom Lexicons"}]}],["$","td",null,{"children":"No phoneme/pronunciation dictionaries"}],["$","td",null,{"children":"Medium"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Speaker ID"}]}],["$","td",null,{"children":"No multi-speaker diarization"}],["$","td",null,{"children":"Low"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Noise Suppression"}]}],["$","td",null,{"children":"Limited background noise handling"}],["$","td",null,{"children":"Medium"}]]}]]}]]}],"\n",["$","h3",null,{"id":"medical-intelligence-gaps","children":["$","a",null,{"href":"#medical-intelligence-gaps","children":"Medical Intelligence Gaps"}]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Gap"}],["$","th",null,{"children":"Description"}],["$","th",null,{"children":"Priority"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Drug Interactions"}]}],["$","td",null,{"children":"No PharmGKB integration"}],["$","td",null,{"children":"High"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Real-time EHR"}]}],["$","td",null,{"children":"No streaming vital signs"}],["$","td",null,{"children":"Medium"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Clinical NER"}]}],["$","td",null,{"children":"No medication/condition extraction from text"}],["$","td",null,{"children":"High"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"SNOMED CT"}]}],["$","td",null,{"children":"No ontology mapping"}],["$","td",null,{"children":"Medium"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Evidence Grading"}]}],["$","td",null,{"children":"Limited quality assessment"}],["$","td",null,{"children":"Medium"}]]}]]}]]}],"\n",["$","h3",null,{"id":"documentation-gaps","children":["$","a",null,{"href":"#documentation-gaps","children":"Documentation Gaps"}]}],"\n",["$","table",null,{"children":[["$","thead",null,{"children":["$","tr",null,{"children":[["$","th",null,{"children":"Gap"}],["$","th",null,{"children":"Information Needed"}]]}]}],["$","tbody",null,{"children":[["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Exact VAD thresholds"}]}],["$","td",null,{"children":"Configurable silence duration and sensitivity"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"ElevenLabs voice IDs"}]}],["$","td",null,{"children":"Complete list of available voices and characteristics"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"PHI detection rules"}]}],["$","td",null,{"children":"Full regex patterns and Presidio configuration"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"Fallback behavior"}]}],["$","td",null,{"children":"Exact conditions triggering provider fallbacks"}]]}],["$","tr",null,{"children":[["$","td",null,{"children":["$","strong",null,{"children":"WebSocket protocol"}]}],["$","td",null,{"children":"Complete message schema and error codes"}]]}]]}]]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"configuration-reference","children":["$","a",null,{"href":"#configuration-reference","children":"Configuration Reference"}]}],"\n",["$","h3",null,{"id":"environment-variables","children":["$","a",null,{"href":"#environment-variables","children":"Environment Variables"}]}],"\n",["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"bash","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"bash","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# STT Configuration"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"DEEPGRAM_API_KEY"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"your-deepgram-key"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VOICE_PIPELINE_STT_PRIMARY"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"deepgram"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VOICE_PIPELINE_STT_FALLBACK"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"whisper"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# TTS Configuration"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"ELEVENLABS_API_KEY"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"your-elevenlabs-key"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VOICE_PIPELINE_TTS_PROVIDER"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"elevenlabs"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"TTS_VOICE"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"default-voice-id"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# LLM Configuration"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"OPENAI_API_KEY"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"your-openai-key"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"LOCAL_LLM_ENDPOINT"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"http://localhost:11434"}]]}],"\n",["$","span",null,{"data-line":"","children":" "}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"# Voice Pipeline"}]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VOICE_WS_MAX_INFLIGHT"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"10"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VAD_SILENCE_THRESHOLD_MS"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"800"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"VAD_SENSITIVITY_MS"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"="}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":"200"}]]}]]}]}]}],"\n",["$","h3",null,{"id":"user-preferences-voicesettingsstore","children":["$","a",null,{"href":"#user-preferences-voicesettingsstore","children":"User Preferences (voiceSettingsStore)"}]}],"\n",["$","figure",null,{"data-rehype-pretty-code-figure":"","children":["$","pre",null,{"tabIndex":"0","data-language":"typescript","data-theme":"github-light github-dark","children":["$","code",null,{"data-language":"typescript","data-theme":"github-light github-dark","style":{"display":"grid"},"children":[["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"interface"}],["$","span",null,{"style":{"--shiki-light":"#6F42C1","--shiki-dark":"#B392F0"},"children":" VoiceSettings"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":" {"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" voiceId"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" string"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// ElevenLabs voice ID"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" language"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" string"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// ISO language code"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" playbackSpeed"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" number"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// 0.5-2.0x"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" stability"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" number"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// 0.0-1.0"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" clarity"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" number"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// 0.0-1.0"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" expressiveness"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" number"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"; "}],["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// 0.0-1.0"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" qualityPreset"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'speed'"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" |"}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'balanced'"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":" |"}],["$","span",null,{"style":{"--shiki-light":"#032F62","--shiki-dark":"#9ECBFF"},"children":" 'natural'"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":";"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" pushToTalk"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" boolean"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":";"}]]}],"\n",["$","span",null,{"data-line":"","children":[["$","span",null,{"style":{"--shiki-light":"#E36209","--shiki-dark":"#FFAB70"},"children":" autoPlay"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":":"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":" boolean"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":";"}]]}],"\n",["$","span",null,{"data-line":"","children":["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"}"}]}]]}]}]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"cost-philosophy","children":["$","a",null,{"href":"#cost-philosophy","children":"Cost Philosophy"}]}],"\n",["$","div",null,{"className":"my-4 rounded-lg border p-4 shadow-sm backdrop-blur border-emerald-200 bg-emerald-50 text-emerald-900 dark:border-emerald-900/40 dark:bg-emerald-900/20 dark:text-emerald-100","children":[["$","div",null,{"className":"font-semibold mb-2","children":"Important Context"}],["$","div",null,{"className":"space-y-2 text-sm leading-relaxed","children":["$","p",null,{"children":["The product team is ",["$","strong",null,{"children":"not trying to reduce costs"}]," at the expense of quality. We are willing to ",["$","strong",null,{"children":"increase costs"}]," when it demonstrably improves the voice experience. However, we aim to avoid wasteful spending and prefer solutions with strong cost-benefit ratios."]}]}]]}],"\n",["$","p",null,{"children":["$","strong",null,{"children":"Guiding Principles:"}]}],"\n",["$","ol",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"Quality First:"}]," Premium providers (ElevenLabs, Deepgram) are preferred for their superior quality"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Smart Fallbacks:"}]," Cost-effective alternatives only activate when primary providers fail"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"No Downgrades:"}]," Never propose replacing current components with cheaper, lower-quality alternatives"]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Measured Upgrades:"}]," New features should justify their cost with measurable UX improvements"]}],"\n"]}],"\n",["$","hr",null,{}],"\n",["$","h2",null,{"id":"references","children":["$","a",null,{"href":"#references","children":"References"}]}],"\n",["$","h3",null,{"id":"backend-files","children":["$","a",null,{"href":"#backend-files","children":"Backend Files"}]}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":["$","code",null,{"children":"services/api-gateway/app/services/voice_pipeline_service.py"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"services/api-gateway/app/services/streaming_stt_service.py"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"services/api-gateway/app/services/thinker_service.py"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"services/api-gateway/app/services/talker_service.py"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"services/api-gateway/app/services/elevenlabs_service.py"}]}],"\n"]}],"\n",["$","h3",null,{"id":"frontend-files","children":["$","a",null,{"href":"#frontend-files","children":"Frontend Files"}]}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":["$","code",null,{"children":"apps/web-app/src/hooks/useThinkerTalkerSession.ts"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"apps/web-app/src/hooks/useThinkerTalkerVoiceMode.ts"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"apps/web-app/src/components/voice/ThinkerTalkerVoicePanel.tsx"}]}],"\n",["$","li",null,{"children":["$","code",null,{"children":"apps/web-app/src/stores/voiceSettingsStore.ts"}]}],"\n"]}],"\n",["$","h3",null,{"id":"related-documentation","children":["$","a",null,{"href":"#related-documentation","children":"Related Documentation"}]}],"\n",["$","ul",null,{"children":["\n",["$","li",null,{"children":["$","a",null,{"href":"/backend/websocket-protocol","children":"Realtime WebSocket Protocol"}]}],"\n",["$","li",null,{"children":["$","a",null,{"href":"/operations/deployment","children":"Deployment Guide"}]}],"\n"]}]]}]}],false]}]]}]