import asyncio
import base64
import os

import httpx
from fastapi import FastAPI, WebSocket

from voice_pipeline import VoicePipeline
app = FastAPI()
@app.websocket('/twilio-stream')
async def twilio_stream(ws: WebSocket):
    """Bridge one Twilio media-stream WebSocket to a ``VoicePipeline``.

    Handles three message types from the socket:

    * ``start`` -- copies the call metadata onto the pipeline and schedules
      ``pipeline.run(ws)`` as a background task.
    * ``media`` -- base64-decodes the payload and passes it to
      ``pipeline.on_audio_in``.
    * ``stop``  -- leaves the receive loop.

    Any other event is ignored. Whatever ends the loop (``stop``, the socket
    closing, or an error), the background task is cancelled and
    ``pipeline.shutdown()`` is awaited exactly once.
    """
    await ws.accept()
    pipeline = VoicePipeline()
    # Keep a strong reference to the background task: the event loop only
    # holds weak references, so an unreferenced task may be garbage-collected
    # before it finishes.
    run_task = None

    def report_failure(task):
        # Without this, an exception inside pipeline.run() would vanish
        # silently because nothing ever awaits the task.
        if not task.cancelled() and task.exception() is not None:
            print(f'Pipeline error: {task.exception()}')

    try:
        async for msg in ws.iter_json():
            event = msg.get('event')
            if event == 'start':
                start = msg['start']
                pipeline.call_sid = start['callSid']
                # Outbound frames must carry the stream SID; the pipeline's
                # send_audio() reads it from this attribute.
                pipeline.stream_sid = start.get('streamSid') or msg.get('streamSid')
                # The caller number is optional metadata; a missing custom
                # parameter should not tear the call down.
                pipeline.from_number = (start.get('customParameters') or {}).get('from')
                run_task = asyncio.create_task(pipeline.run(ws))
                run_task.add_done_callback(report_failure)
            elif event == 'media':
                # Inbound audio chunk from caller
                audio_chunk = base64.b64decode(msg['media']['payload'])
                await pipeline.on_audio_in(audio_chunk)
            elif event == 'stop':
                break
    except Exception as e:
        print(f'Stream error: {e}')
    finally:
        # Stop the conversation loop before releasing pipeline resources so it
        # cannot keep writing to a socket that is going away.
        if run_task is not None and not run_task.done():
            run_task.cancel()
        await pipeline.shutdown()
from typing import Optional
from anthropic import AsyncAnthropic
from sarvam_client import SarvamTTS
from whisper_client import StreamingWhisper
class VoicePipeline:
    """Per-call speech loop: streaming Whisper STT -> Claude -> Sarvam TTS.

    The WebSocket handler is expected to assign ``call_sid``, ``stream_sid``
    and ``from_number`` before scheduling :meth:`run`.

    NOTE(review): :meth:`run` calls ``self.classify_intent``,
    ``self.generate_reply`` and ``self.transfer``, none of which are defined
    in this class body -- they are assumed to be provided elsewhere; confirm.
    """

    # Characters treated as end-of-sentence; the last one is the Devanagari
    # danda used in Hindi.
    SENTENCE_ENDINGS = '.!?।'

    def __init__(self):
        self.whisper = StreamingWhisper(model='medium', languages=['hi', 'ta', 'en'])
        self.tts = SarvamTTS(voice='manan')
        self.claude = AsyncAnthropic()
        self.tts_task: Optional[asyncio.Task] = None
        self.user_speaking = False
        # Call metadata, filled in by the WebSocket handler. Declared here so
        # a missing value is an explicit None rather than an AttributeError.
        self.call_sid: Optional[str] = None
        self.stream_sid: Optional[str] = None
        self.from_number: Optional[str] = None
        # True while the current TTS task has been cancelled by caller speech;
        # lets run() tell a barge-in apart from its own cancellation.
        self._barge_in = False

    async def on_audio_in(self, audio_chunk: bytes):
        """Feed one inbound audio chunk to Whisper, cancelling TTS on speech."""
        # VAD: is the user speaking?
        if self.whisper.is_speech(audio_chunk):
            self.user_speaking = True
            # BARGE-IN: cancel any in-flight TTS. This is checked on every
            # speech chunk (not only on the silence->speech edge) so speech
            # that began before the reply started playing still interrupts it.
            in_flight = self.tts_task is not None and not self.tts_task.done()
            if in_flight and not self._barge_in:
                self._barge_in = True
                self.tts_task.cancel()
                print('Bot interrupted by caller')
        await self.whisper.feed(audio_chunk)

    async def run(self, ws):
        """Greet the caller, then loop: transcribe, classify, reply via TTS.

        Returns after handing the call to a human. If this coroutine's own
        task is cancelled, the cancellation propagates to the caller.
        """
        # Greeting
        await self.speak(ws, 'Namaste, Tally Helpdesk mein swagat hai. Aapki kya samasya hai?')
        while True:
            # Wait for caller to finish speaking
            transcript = await self.whisper.get_completed_utterance()
            # The utterance is over whether or not it produced any text.
            self.user_speaking = False
            if not transcript:
                continue
            detected_lang = self.whisper.last_detected_language
            # Classify intent + generate reply
            intent = await self.classify_intent(transcript, detected_lang)
            if intent == 'transfer_to_human':
                await self.transfer(ws, summary=transcript)
                break
            reply_stream = await self.generate_reply(transcript, intent, detected_lang)
            # Stream TTS while Claude generates
            self._barge_in = False
            self.tts_task = asyncio.create_task(self.stream_tts(ws, reply_stream, detected_lang))
            try:
                await self.tts_task
            except asyncio.CancelledError:
                if not self._barge_in:
                    # run() itself is being cancelled (e.g. the call ended);
                    # swallowing this would keep the loop alive forever.
                    raise
                print('TTS cancelled by barge-in')
                # Cancelling the task only stops *new* audio; tell Twilio to
                # drop what it has already buffered as well.
                await self.clear_audio(ws)

    @staticmethod
    def _split_at_sentence_end(text: str):
        """Split ``text`` after its last finished sentence.

        A sentence counts as finished when an end-of-sentence character is
        followed by whitespace, so ``2.10`` is not split and a trailing ``.``
        is held until the next chunk shows what follows it.

        Returns ``(complete, remainder)``; ``complete`` is ``''`` when no
        finished sentence is present.
        """
        for i in range(len(text) - 2, -1, -1):
            if text[i] in VoicePipeline.SENTENCE_ENDINGS and text[i + 1].isspace():
                return text[:i + 1], text[i + 1:]
        return '', text

    async def stream_tts(self, ws, text_stream, language: str):
        """Synthesize and send ``text_stream`` sentence by sentence.

        ``text_stream`` is an async iterable of text chunks. Only finished
        sentences are flushed while streaming; whatever is left when the
        stream ends is flushed last.
        """
        buffer = ''
        async for chunk in text_stream:
            buffer += chunk
            # Flush at sentence boundary for natural cadence
            sentence, buffer = self._split_at_sentence_end(buffer)
            sentence = sentence.strip()
            if sentence:
                audio = await self.tts.synthesize(sentence, language=language)
                await self.send_audio(ws, audio)
        tail = buffer.strip()
        if tail:
            audio = await self.tts.synthesize(tail, language=language)
            await self.send_audio(ws, audio)

    async def speak(self, ws, text: str, language: str = 'hi'):
        """Synthesize ``text`` in one piece and send it to the caller."""
        audio = await self.tts.synthesize(text, language=language)
        await self.send_audio(ws, audio)

    async def clear_audio(self, ws):
        """Ask Twilio to discard audio it has buffered but not yet played."""
        await ws.send_json({
            'event': 'clear',
            'streamSid': self.stream_sid,
        })

    async def send_audio(self, ws, audio_bytes: bytes):
        """Send ``audio_bytes`` to the caller as one base64 ``media`` message.

        Raises:
            RuntimeError: if ``stream_sid`` has not been set yet.
        """
        import base64
        if self.stream_sid is None:
            raise RuntimeError('stream_sid is not set; cannot send audio to Twilio')
        # NOTE(review): Twilio expects base64 mu-law 8 kHz audio here --
        # confirm that SarvamTTS.synthesize returns that format.
        payload = base64.b64encode(audio_bytes).decode()
        await ws.send_json({
            'event': 'media',
            'streamSid': self.stream_sid,
            'media': {'payload': payload},
        })


# । for Hindi punctuation) — buffering until the full Claude response would add 1-2 seconds of perceived delay. The language detection runs continuously through Whisper, so a caller switching from Hindi to English mid-call gets the right TTS voice.
## The intent classifier (Claude prompt)
# Prompt template for the single-label intent classifier.
# It is filled in with str.format(); the two placeholders are {transcript}
# (what the caller said) and {language} (the caller's language).
# The model is instructed to answer with exactly one of the nine intent names
# listed below and nothing else.
INTENT_PROMPT = """You are an intent classifier for a Tally Helpdesk voice bot.
The caller said: "{transcript}"
Caller's language: {language}
Classify into ONE of these intents:
- license_renewal: questions about Tally license renewal, expiry, activation
- gst_query: GST return preparation, GSTR filing, GST configuration in Tally
- data_corrupt: Tally data file corruption, "data not opening", recovery requests
- password_reset: forgotten password, lock/unlock issues
- install_help: installation problems, version upgrade, setup
- e_invoice: e-invoice generation, IRN issues, GSTN integration
- training: requests for Tally training, "kaise sikhayenge"
- transfer_to_human: explicit request for human, "agent", "manager", complex multi-issue
- other: doesn't fit above categories
Respond with ONLY the intent name. No explanation.
"""
async def classify_intent(self, transcript: str, language: str) -> str:
    """Ask Claude to label ``transcript`` with one of ``VALID_INTENTS``.

    Args:
        transcript: What the caller said.
        language: The caller's language, passed through to the prompt.

    Returns:
        A member of ``VALID_INTENTS``, or ``'other'`` when the reply is empty
        or names no known intent.
    """
    response = await self.claude.messages.create(
        model='claude-sonnet-4-5',
        max_tokens=20,
        messages=[{'role': 'user', 'content': INTENT_PROMPT.format(
            transcript=transcript, language=language,
        )}],
    )
    # Join whatever text blocks came back; an empty reply yields '' instead
    # of an IndexError on content[0].
    raw = ''.join(getattr(block, 'text', '') or '' for block in (response.content or []))
    return _normalize_intent(raw, VALID_INTENTS)


def _normalize_intent(raw: str, valid_intents) -> str:
    """Map raw model output onto a known intent name, else ``'other'``."""
    exact = raw.strip()
    if exact in valid_intents:
        return exact
    # Tolerate case differences, trailing punctuation, quoting, or a short
    # lead-in such as "Intent: gst_query".
    for token in raw.lower().split():
        candidate = token.strip('.,:;!?"\'`*()[]')
        if candidate in valid_intents:
            return candidate
    return 'other'
async def create_tally_ticket(self, intent: str, transcript: str, language: str):
    """Create a helpdesk ticket for the current call and return its id.

    Looks the caller up by ``self.from_number``, then POSTs the ticket to the
    helpdesk API, authenticated with the ``TALLY_TOKEN`` environment variable.

    Args:
        intent: Intent category; ``'data_corrupt'`` is filed as high priority.
        transcript: Caller's words, stored as the ticket description.
        language: Caller's language.

    Returns:
        The ``ticket_id`` field of the API's JSON response.

    Raises:
        KeyError: if ``TALLY_TOKEN`` is unset or the response has no
            ``ticket_id``.
        httpx.HTTPStatusError: if the API answers with a 4xx/5xx status.
    """
    # Lookup caller from CRM. Done before opening the HTTP client so the
    # connection is only held for the POST itself.
    customer = await self.lookup_customer_by_phone(self.from_number)
    payload = {
        'customer_id': customer['id'] if customer else None,
        'phone': self.from_number,
        'intent_category': intent,
        'description': transcript,
        'language': language,
        'source': 'voice_ivr',
        'priority': 'high' if intent == 'data_corrupt' else 'normal',
        'call_sid': self.call_sid,
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(
            'https://tally-helpdesk.softechinfra.com/api/tickets',
            headers={'Authorization': f'Bearer {os.environ["TALLY_TOKEN"]}'},
            json=payload,
        )
        # Fail on HTTP errors here rather than later with a misleading
        # KeyError/JSON error on an error body.
        response.raise_for_status()
        ticket = response.json()
    return ticket['ticket_id']


# - Twilio India number with Mumbai SIP termination requested (free, 24h)
- HTTPS + valid SSL on the WebSocket endpoint (mandatory for Twilio Streams)
- Whisper VAD threshold tuned to local accent (380-450ms silence works for India)
- TTS streaming flushes at sentence boundary, not full-response
- Barge-in cancels TTS task at first detected user speech
- Intent classifier prompt includes "transfer_to_human" as explicit option
- Customer lookup by phone before ticket creation
- Recording disclosure played at call start (TRAI compliance)
- Warm-transfer plays context summary to staff before bridging
- Smoke test from at least 5 carrier+device combinations before go-live
<Gather> primitive handles DTMF and we fall back to a 4-option menu.
### What's the resolution rate floor for this approach?
In our experience across 6 client builds, intent-based IVRs land at 60-78% resolution depending on the FAQ coverage and the callers' familiarity with the bot. The CA practice hit 73% with 4 FAQ-resolved intents. A practice with 8 FAQ-resolved intents would likely hit 80%+.
### Can I deploy this on-prem instead of Hetzner cloud?
Yes. Replace Hetzner with any Linux box that has 4GB RAM and a public-routable HTTPS endpoint. The L4 GPU for Whisper is the only hard requirement — and you can swap Whisper for OpenAI's hosted STT (slightly slower, 200ms more latency) if you can't run a GPU.
### How do you handle accents the bot consistently mishears?
Add the misheard transcripts as training data for prompt-level disambiguation. We track Whisper outputs that human staff later corrected, then add a "common variations" section to the intent prompt: "Note: 'jeem-essay-tee' usually means GST." Improves classification by 3-5pp without retraining the STT model.
### What does the recording disclosure say?
"Yeh call recording aur quality monitoring ke liye record ho rahi hai. Agar aap ise opt out karna chahte hain, kripya 9 dabayein." TRAI requires the disclosure; the opt-out path is best practice. Recordings stored in S3 ap-south-1 with 90-day retention.
Want a Voice IVR for Your CA Practice or Accounting Software Helpdesk?
We ship voice IVRs for Indian SMB helpdesks (Tally, Zoho Books, BUSY, custom billing apps) in 7-14 working days. Trilingual routing, intent classification, automatic ticket creation, warm transfer to your team. Typical project: ₹65,000-₹1,40,000 fixed scope. Per-call run cost from ₹2.10. First call is technical — with the engineer who would lead your build.
Book a 20-min Call
