mkdir voice-ivr && cd voice-ivr
npm init -y
npm install fastify @fastify/websocket @fastify/formbody \
@anthropic-ai/sdk dotenv twilio
// server.js
import Fastify from 'fastify';
import websocket from '@fastify/websocket';
import formbody from '@fastify/formbody';
import Anthropic from '@anthropic-ai/sdk';
import 'dotenv/config';
// HTTP server with request logging enabled. The WebSocket and form-body
// plugins are registered one after the other, each awaited before the next.
const app = Fastify({ logger: true });
for (const plugin of [websocket, formbody]) {
  await app.register(plugin);
}

// Anthropic API client; the key is read from the ANTHROPIC_API_KEY env var.
const anthropic = new Anthropic({
  apiKey: process.env.ANTHROPIC_API_KEY,
});
// Step 1: Twilio hits this when a call comes in
/**
 * Voice webhook. Responds with TwiML that connects the call to the
 * ConversationRelay WebSocket served by this same host at /ws.
 *
 * The WebSocket URL is derived from the request's Host header, so the
 * server must be reachable over TLS (wss://) at that host name.
 */
app.post('/twiml', async (req, reply) => {
  const wsUrl = `wss://${req.headers.host}/ws`;
  // Minimal ConversationRelay document: only the required `url` attribute is
  // set. Optional attributes (e.g. welcomeGreeting, voice) can be added here.
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <ConversationRelay url="${wsUrl}" />
  </Connect>
</Response>`;
  return reply.type('text/xml').send(twiml);
});
// continued in server.js
// System prompt passed as `system` on every model request. It is a multi-line
// template literal, so the line breaks below are part of the prompt text.
const SYSTEM_PROMPT = `You are an appointment booking assistant for Sunrise Dental.
Hours: Mon-Sat, 9am-7pm. 8 clinics in Hyderabad.
Always speak in 1-2 short sentences. This is a phone call, not chat.
Confirm clinic name, date, time, and patient phone before booking.
If asked something outside booking, politely say you'll transfer to staff.`;
// JSON Schema for the arguments of the book_appointment tool. Every field is
// a string and every field is required.
const bookAppointmentSchema = {
  type: 'object',
  properties: {
    clinic: { type: 'string' },
    date: { type: 'string', description: 'ISO 8601 date' },
    time: { type: 'string', description: 'HH:MM 24-hour' },
    patient_name: { type: 'string' },
    patient_phone: { type: 'string' },
  },
  required: ['clinic', 'date', 'time', 'patient_name', 'patient_phone'],
};

// Tool definitions handed to the model on each request.
const tools = [
  {
    name: 'book_appointment',
    description: 'Books a confirmed appointment after collecting all details',
    input_schema: bookAppointmentSchema,
  },
];
/**
 * ConversationRelay WebSocket endpoint (/ws).
 *
 * One connection corresponds to one phone call. Incoming JSON events:
 *   - `prompt`:    the caller's transcribed speech (`voicePrompt`). Starts a
 *                  model turn whose text is streamed back token by token.
 *   - `interrupt`: the caller barged in. The in-flight model turn is aborted
 *                  so no further tokens are sent.
 *
 * Turns are serialized through a promise queue so that the shared `messages`
 * history is never modified by two turns at once. A turn may involve several
 * model requests when the model calls a tool: the assistant message (with its
 * tool_use blocks) is stored, each tool is executed, the tool_result blocks
 * are appended, and the model is asked to continue.
 */
app.register(async function (app) {
  app.get('/ws', { websocket: true }, (socket, req) => {
    // Conversation history for this call, in Anthropic Messages format.
    const messages = [];
    // Bookkeeping for the turn currently running, or null when idle.
    let activeTurn = null;
    // Tail of the turn queue; each new turn is chained onto it.
    let turnQueue = Promise.resolve();
    let socketClosed = false;

    // Sends one text token to Twilio; `last` marks the end of the reply.
    const sendToken = (token, last) => {
      if (socketClosed) {
        return;
      }
      socket.send(JSON.stringify({ type: 'text', token, last }));
    };

    // Marks the running turn as cancelled and aborts its model stream.
    const cancelActiveTurn = () => {
      if (activeTurn === null) {
        return;
      }
      activeTurn.cancelled = true;
      activeTurn.stream?.abort();
    };

    // Executes one tool_use block and returns the matching tool_result block.
    // `isComplete` is false when the reply was cut off before the model
    // finished emitting the call, in which case the input cannot be trusted.
    const runToolCall = async (block, isComplete) => {
      const result = { type: 'tool_result', tool_use_id: block.id };
      if (!isComplete) {
        return { ...result, content: 'Tool call was incomplete.', is_error: true };
      }
      if (block.name !== 'book_appointment') {
        return { ...result, content: `Unknown tool: ${block.name}`, is_error: true };
      }
      try {
        await saveAppointment(block.input);
        return { ...result, content: 'Booked successfully.' };
      } catch (err) {
        app.log.error(err, 'saveAppointment failed');
        return { ...result, content: 'Booking failed.', is_error: true };
      }
    };

    // Runs one caller turn: records the prompt, streams the model's reply and
    // resolves any tool calls before signalling the end of the reply.
    const runTurn = async (promptText) => {
      const turn = { cancelled: false, stream: null };
      activeTurn = turn;
      messages.push({ role: 'user', content: promptText });
      try {
        while (!turn.cancelled) {
          const stream = anthropic.messages.stream({
            model: 'claude-haiku-4-5',
            max_tokens: 200,
            system: SYSTEM_PROMPT,
            tools,
            messages,
          });
          turn.stream = stream;
          // Stream tokens to Twilio for ElevenLabs to speak immediately
          stream.on('text', (chunk) => {
            if (!turn.cancelled) {
              sendToken(chunk, false);
            }
          });
          // Rejects if the request fails or the stream is aborted.
          const reply = await stream.finalMessage();
          turn.stream = null;

          // Store the full content (text and tool_use blocks). An empty
          // assistant message would be rejected on the next request.
          if (reply.content.length > 0) {
            messages.push({ role: 'assistant', content: reply.content });
          }

          const toolCalls = reply.content.filter((block) => block.type === 'tool_use');
          if (toolCalls.length === 0) {
            break;
          }

          // Every stored tool_use needs a tool_result in the next user
          // message, even if the turn was cancelled in the meantime.
          const isComplete = reply.stop_reason === 'tool_use';
          const toolResults = [];
          for (const block of toolCalls) {
            toolResults.push(await runToolCall(block, isComplete));
          }
          messages.push({ role: 'user', content: toolResults });
        }
      } catch (err) {
        // An abort caused by barge-in or socket close is expected.
        if (!turn.cancelled) {
          app.log.error(err, 'model turn failed');
        }
      } finally {
        if (activeTurn === turn) {
          activeTurn = null;
        }
        if (!turn.cancelled) {
          sendToken('', true);
        }
      }
    };

    socket.on('message', (raw) => {
      let event;
      try {
        event = JSON.parse(raw.toString());
      } catch (err) {
        app.log.warn(err, 'ignoring non-JSON WebSocket message');
        return;
      }

      if (event.type === 'prompt') {
        // Caller said something — we get the transcribed text
        const promptText = event.voicePrompt;
        if (typeof promptText !== 'string' || promptText.trim() === '') {
          return;
        }
        // A new utterance supersedes whatever the bot was still saying.
        cancelActiveTurn();
        turnQueue = turnQueue
          .then(() => runTurn(promptText))
          .catch((err) => app.log.error(err, 'turn queue failure'));
        return;
      }

      if (event.type === 'interrupt') {
        // Caller barged in — stop generating
        cancelActiveTurn();
        app.log.info(`barge-in detected at ${event.utteranceUntilInterrupt}`);
      }
    });

    // Stop any in-flight model request once the call is gone.
    socket.on('close', () => {
      socketClosed = true;
      cancelActiveTurn();
    });
  });
});
/**
 * Placeholder for persisting a booking. At present it only prints the
 * details to the console; replace the body with a real integration.
 *
 * @param {object} details - Booking fields as supplied by the caller.
 * @returns {Promise<void>}
 */
async function saveAppointment(details) {
  // Hit your booking API / Google Calendar / DB
  const logArgs = ['Booking:', details];
  console.log(...logArgs);
}
await app.listen({ port: 3000, host: '0.0.0.0' });
{
"stt": {
"provider": "Deepgram",
"model": "nova-3-phonecall",
"endpointing": 300,
"punctuate": true
},
"tts": {
"provider": "ElevenLabs",
"model": "eleven_flash_v2_5",
"voice": "Rachel",
"optimize_streaming_latency": 4
},
"llm": {
"model": "claude-haiku-4-5",
"max_tokens": 200,
"stream": true
}
}
(1) Deepgram endpointing: 300 — finalizes the transcript once the caller pauses for 300ms (default is 700ms, too slow for natural conversation). (2) ElevenLabs optimize_streaming_latency: 4 — drops to lowest quality preset, saves ~150ms. (3) Claude max_tokens: 200 — short replies start playing faster. (4) Anthropic stream: true — token-by-token, not wait-for-complete.
## Common mistakes — five we keep seeing on r/Twilio and r/voiceai
Mistake 1 — Not handling barge-in. If your bot keeps talking when the caller speaks over it, the call dies in 30 seconds. ConversationRelay sends an interrupt event — you must stop streaming Claude tokens and clear any pending TTS audio. The Twilio Anthropic ConversationRelay tutorial covers this in detail.
Mistake 2 — Sending full sentences to TTS instead of tokens. Waiting for a complete Claude sentence adds 600–900ms to perceived latency. Stream tokens to ElevenLabs as they arrive — Flash v2.5's streaming endpoint accepts incremental input.
Mistake 3 — Using Claude Sonnet for voice. Sonnet's first-token latency runs 600–900ms versus Haiku's 380ms. For voice, that gap of roughly 200–500ms is the difference between "feels human" and "feels broken". Reserve Sonnet for the function-call result step, not the conversational turn.
Mistake 4 — Default Deepgram endpointing. Default 700ms means a half-second of dead air after every caller utterance. Drop to 300ms for natural turn-taking; bump back to 500ms only if your callers tend to pause mid-sentence (older demographics).
Mistake 5 — No fallback to a human number. If Claude returns nonsense, callers need an out. Add a fallback that triggers on three failed turn-rounds or on the keyword "human" / "agent" / "person". Every voice IVR we ship has this.
- You handle ≥1,000 inbound calls/month (below this, hire a person)
- Your call flow is task-shaped: booking, status check, payment confirmation, FAQ
- You have a Twilio number purchased and a credit card on file
- You set up barge-in handling and tested with rapid interruptions
- You added a "transfer to human" fallback path on three failed turns
- You logged the first 100 calls and reviewed transcripts before scaling
- You have explicit recording consent if you record calls
- You set Deepgram endpointing to 300ms for natural turn-taking
- You used Claude Haiku, not Sonnet, for the conversational turn
Set transcriptionLanguage="hi-IN" on the ConversationRelay element, or set it per call. Quality on Tamil and Telugu agglutinative speech is the weakest link — test with native speakers.
### How do I add function calling for "check availability" or "send SMS confirmation"?
Claude's tools API works inside the ConversationRelay flow. Define your tool with input schema, list it in the messages.stream call, and execute the function when Claude emits a tool_use block. Append the tool result back into the message history and let Claude continue. The Twilio function-calling blog post has working examples.
### What happens if my server crashes mid-call?
Twilio retries the WebSocket connection once, then plays a fallback TwiML if you defined one. Always set a fallback URL on your TwiML app pointing to a static "We're having a glitch — please call back" message. Add health checks that bounce the server before traffic dies.
### How do I bill calls to clients accurately?
Twilio bills you per minute on inbound. ElevenLabs bills per character generated. Claude bills per token. Build an in-call counter that tracks all three and exports per-call cost to your DB. We bill clients monthly with itemized usage; some clients prefer flat per-minute pricing where we eat variance.
### Where can I read more on real-world voice AI latency benchmarks?
The [VEXYL AI 2026 latency tests](https://vexyl.ai/elevenlabs-tts-latency-test-2026-real-world-results/) and [Picovoice TTS Latency Benchmark](https://github.com/Picovoice/tts-latency-benchmark) on GitHub are the most credible third-party data. ElevenLabs' own latency docs cover the difference between model latency (75ms) and real-world TTFB (478ms) — both matter, depending on where you measure.
Want a Voice IVR for Your Business?
We ship production voice IVRs (booking, support, payment, FAQ) on Twilio + Claude + ElevenLabs in 5 working days. Typical project ₹95,000–₹2,40,000 depending on integrations. Per-call run cost from ₹12. Multilingual ready out of the box. You own the code and the Twilio account.
Book a 20-min Call
