From fc5c3d6aefabff530bc5a4116d3b16c749e91f4a Mon Sep 17 00:00:00 2001 From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com> Date: Fri, 5 Dec 2025 12:30:06 -0500 Subject: [PATCH 1/4] Add Working with OAI Realtime API blogpost --- blogpost/assets/realtime-flow.svg | 75 +++++ blogpost/working-with-oai-realtime-api.mdx | 346 +++++++++++++++++++++ 2 files changed, 421 insertions(+) create mode 100644 blogpost/assets/realtime-flow.svg create mode 100644 blogpost/working-with-oai-realtime-api.mdx diff --git a/blogpost/assets/realtime-flow.svg b/blogpost/assets/realtime-flow.svg new file mode 100644 index 0000000..815956c --- /dev/null +++ b/blogpost/assets/realtime-flow.svg @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + Browser client + • Mic → WebRTC audio track + • DataChannel: JSON events + • UI: custom instructions + • Handles interruptions + + + RTCPeerConnection + + + RTCDataChannel: oai-events + + + Rubric server + • Holds real API key + • Mints client secrets + • Zod-validated boundary + • Optional tools/business logic + + + POST /v1/realtime/client_secrets + + + OpenAI Realtime API + • WebRTC call creation + • Speech-to-speech model + • Server events stream + • Turn detection / VAD + + + POST /v1/realtime/calls (SDP) + + + Events: session.update / response.* + + + + fetch secret + + + mint + + + SDP offer → answer + + + audio + events + + + + Tip: keep the client secret ephemeral; keep tools/business logic on the server; drive behaviors with session.update. + + diff --git a/blogpost/working-with-oai-realtime-api.mdx b/blogpost/working-with-oai-realtime-api.mdx new file mode 100644 index 0000000..82a661e --- /dev/null +++ b/blogpost/working-with-oai-realtime-api.mdx @@ -0,0 +1,346 @@ +--- +title: Working with OAI Realtime API +description: Lilac — an experimental conversational translator + language playground +--- + +# Working with OAI Realtime API + +Lilac started as a side quest: **make conversational translation feel like an actual conversation**—not an awkward “speak… wait… read… repeat” loop. + +It’s experimental. It’s a little chaotic. It’s also *weirdly* good. + +## TL;DR + +- **Realtime is a transport**, not a chatbot UI: WebRTC (browser), WebSocket (servers), SIP (phone calls). +- You mint a short-lived **client secret** server-side, then use it in the browser to create a **WebRTC call**. +- After the SDP handshake, you drive everything with JSON **events** (session updates, responses, interruptions). +- Rubric scaffolding (create-rubric-app) keeps the whole thing **type-safe and shippable** without turning it into a Big Project™. + +--- + +## Why we ditched “traditional translation” + +Google Translate is fine for menus. In conversation, it’s… not. + +What we kept running into: + +- **Back-channels get lost** (“mm”, “ah”, “wait wait”, “yeah yeah”) so turns feel cold. +- **Code-switching** (English + a dialect word + Mandarin filler) comes back mangled. +- **Long turns become summaries** when you wanted a faithful, *human* rendition. +- **Timing matters**: the pause before a sentence, the self-correction mid-phrase—those are meaning. + +Lilac’s bet is simple: if we can do *speech-to-speech* with low latency, the translation stops feeling like “input → output” and starts feeling like “talking”. + +--- + +## Three real-world workflows (how Lilac gets used) + +### 1) Dexter + a friend (taking turns) +They pass the phone back and forth. 
Dexter speaks English, the model answers in Teochew (or Teochew-ish), and the vibe stays intact because the model is responding *as the other person*, not as a translator bot. + +What matters here: +- low-latency turn handoff +- voice that doesn’t sound like it’s reading a caption + +### 2) Ted, the self-learner +Ted uses Lilac like a private tutor: +- he speaks in a target language, +- asks for corrections, +- gets rephrases inline, +- keeps talking without “ending the session” to do grammar homework. + +What matters here: +- visible corrections (data channel events) +- consistent “teacher persona” prompts +- quick mid-session instruction tweaks + +### 3) Dexter’s friend, custom dialect support (Teochew from a Mandarin base) +She uses **custom instructions** like a dialect adapter: +- start from Mandarin-ish defaults, +- add a doc describing pronunciation + word choices, +- steer output toward Teochew without rebuilding the app. + +What matters here: +- instructions that live at the session boundary (so you can change them on the fly) +- “prompt as config” instead of “prompt baked into code” + +--- + +## The Realtime mental model (what you’re actually building) + +Here’s the shape we keep in our heads: + +![Realtime data flow](./assets/realtime-flow.svg) + +1. **Server minting**: your server creates a short-lived client secret (do *not* ship your real API key). +2. **WebRTC handshake**: browser creates an SDP offer; OpenAI returns an SDP answer. +3. **Media + events**: audio flows as WebRTC media; control plane flows as JSON events in a data channel. +4. **Session steering**: you can update instructions, tools, turn detection, etc. mid-call. +5. **Interruptions are a feature**: you’ll want to cancel, truncate, and clear output audio. + +--- + +## GA vs Beta (and why you should care) + +Realtime has had multiple “shapes” over time. Today there’s a **GA interface** and an older **beta interface**. + +Our advice: +- build against GA, +- keep a mental map of beta, because you’ll see older examples floating around. + +Lilac is small enough that swapping shapes is feasible, and Rubric’s schema-first boundaries make it harder to accidentally break the client when OpenAI shifts an object shape. + +--- + +## Using Realtime in practice: the small, correct happy path + +### Step 1 — Server: create a client secret (typed) + +You want a server-only endpoint that: +- validates input with Zod, +- calls OpenAI to mint a short-lived client secret, +- returns only what the browser needs. + +```ts +// app/api/realtime/secret/route.ts +import { NextResponse } from 'next/server' +import { z } from 'zod' + +const CreateSecretInputSchema = z.object({ + model: z.string().default('gpt-realtime'), + instructions: z.string().optional(), + voice: z.string().optional(), + // keep this short: it’s a browser token, not your auth system + ttlSeconds: z.number().int().min(30).max(60 * 30).default(60 * 10), +}) + +const ClientSecretResponseSchema = z.object({ + value: z.string(), // looks like ek_... 
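+  // NOTE: ephemeral browser credential — short-lived by design; never log or persist it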
+ expires_at: z.number(), // epoch seconds + session: z.object({ + id: z.string(), + model: z.string(), + type: z.string(), + }).passthrough(), +}) + +export async function POST(request: Request) { + const input = CreateSecretInputSchema.parse(await request.json().catch(() => ({}))) + + const response = await fetch('https://api.openai.com/v1/realtime/client_secrets', { + method: 'POST', + headers: { + Authorization: `Bearer ${process.env.OPENAI_API_KEY}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + expires_after: { anchor: 'created_at', seconds: input.ttlSeconds }, + session: { + type: 'realtime', + model: input.model, + ...(input.instructions ? { instructions: input.instructions } : {}), + ...(input.voice ? { audio: { output: { voice: input.voice } } } : {}), + }, + }), + }) + + if (!response.ok) { + return NextResponse.json({ error: await response.text() }, { status: 500 }) + } + + const json = await response.json() + const parsed = ClientSecretResponseSchema.parse(json) + + return NextResponse.json({ + clientSecret: parsed.value, + expiresAt: parsed.expires_at, + session: parsed.session, + }) +} +``` + +Rubric pattern: **schemas at boundaries**, and the browser only gets the ephemeral secret. + +--- + +### Step 2 — Client: WebRTC handshake + data channel + +This is the part everyone overcomplicates. Keep it boring. + +```ts +async function connectRealtime() { + // 1) Ask our server for a short-lived secret + const secret = await fetch('/api/realtime/secret', { method: 'POST' }).then(r => r.json()) + const clientSecret = secret.clientSecret as string + + // 2) WebRTC peer + mic + const peer = new RTCPeerConnection() + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) + + for (const track of stream.getTracks()) peer.addTrack(track, stream) + + // 3) Data channel for events + const events = peer.createDataChannel('oai-events') + + events.addEventListener('message', (ev) => { + // server events arrive as JSON + try { + const event = JSON.parse(ev.data) + // route into your typed event handler here + console.log('realtime:event', event.type, event) + } catch { + // ignore + } + }) + + // 4) Create SDP offer + const offer = await peer.createOffer() + await peer.setLocalDescription(offer) + + // 5) Send offer to OpenAI, get SDP answer back + const form = new FormData() + form.append('sdp', new Blob([offer.sdp ?? ''], { type: 'application/sdp' })) + + const answerResponse = await fetch('https://api.openai.com/v1/realtime/calls', { + method: 'POST', + headers: { Authorization: `Bearer ${clientSecret}` }, + body: form, + }) + + if (!answerResponse.ok) throw new Error(await answerResponse.text()) + + const answerSdp = await answerResponse.text() + await peer.setRemoteDescription({ type: 'answer', sdp: answerSdp }) + + return { peer, events } +} +``` + +Once `setRemoteDescription` succeeds, you’re “in the call”: audio track in, audio track out, plus an event stream in the data channel. + +--- + +## Session steering: “custom instructions” is just session.update + +Lilac’s custom instructions UI maps directly to one idea: +**change the session config without restarting the call**. + +```ts +function sessionUpdate(events: RTCDataChannel, patch: unknown) { + events.send(JSON.stringify({ type: 'session.update', session: patch })) +} + +// Example: translation-first persona +sessionUpdate(events, { + type: 'realtime', + instructions: [ + 'You are a real-time conversational translator.', + 'Keep the cadence and intent. 
Don’t over-explain.', + 'Prefer Teochew. If uncertain, fall back to Mandarin.', + 'Return short turns. Match the speaker’s tone.', + ].join('\n'), +}) +``` + +A practical trick: keep a few “personas” on the client and flip between them mid-session: + +* translation-first +* teacher-first +* roleplay / immersion mode + +--- + +## Turning model inference on (response.create) + +In many setups, the server will create responses automatically when turn detection is enabled. But it’s still worth understanding the explicit control: + +```ts +// simplest form: “please respond now” +events.send(JSON.stringify({ type: 'response.create' })) + +// out-of-band response (e.g. summarization) that doesn’t write to the main conversation +events.send(JSON.stringify({ + type: 'response.create', + response: { + conversation: 'none', + output_modalities: ['text'], + instructions: 'Summarize in one sentence.', + metadata: { purpose: 'summarization' }, + input: [ + { type: 'message', role: 'user', content: [{ type: 'input_text', text: 'Summarize what we said.' }] }, + ], + }, +})) +``` + +This is the “you can do more than phone-call UX” moment: you can run side tasks next to the live conversation. + +--- + +## Interruptions: cancelling, truncating, and cutting off audio + +Real speech is messy: people interrupt, laugh, restart, bail mid-sentence. + +A few “must-have” controls: + +* `response.cancel` — stop the model response +* `conversation.item.truncate` — tell the server what audio you actually played (important when the user interrupts) +* `output_audio_buffer.clear` — **WebRTC-only** cutoff for audio output + +That’s how you get “natural” barge-in behavior instead of two voices yelling over each other. + +--- + +## How Rubric infra makes this fast (and boring) + +Lilac is built on create-rubric-app energy: + +* opinionated app scaffolding (Next.js + TS + Biome + Zod) +* schemas everywhere, so your client doesn’t guess shapes +* event streaming patterns that don’t degrade into “JSON.parse() spaghetti” +* quick deploy defaults, so a weekend prototype can become “send a link” + +The point is not that Lilac is “enterprise ready.” +The point is that your experiments can still have **good bones**. + +--- + +## How we used Codex (the teammate who never sleeps) + +We leaned on Codex in a very specific way: + +1. **Bootstrap**: generated the app with `create-rubric-app`. +2. **Deploy**: used the same Rubric workflow to bring up infra fast (so we could test on real phones, not just localhost). +3. **Codex environment**: configured Codex with + + * the repo commands to install/run/test, + * the environment variables it needs to actually start the app, + * custom instructions describing Lilac’s architecture and “don’t touch the secret key boundaries”. + +The best part wasn’t “Codex wrote code for us.” +It was that we could say: “stand this up, reproduce the bug, and fix it *without changing the shape of the types*.” +That’s a very Rubric way to move. + +--- + +## Presentation-ready nits (if you want Lilac to read like a real OSS project) + +If you’re polishing the repo, these changes are high leverage: + +* Add a README section called **“Realtime API shape (GA vs beta)”** with a one-paragraph migration note. +* Centralize the Realtime event types into a single `realtimeEvents.ts` with a Zod discriminated union. +* Wrap the data channel in a tiny typed adapter: `sendEvent(event)` + `onEvent(handler)`. 
+* Add a tiny “debug overlay” showing: + + * connection state + * last server event type + * current persona / instructions hash +* Document the “custom instructions” format (especially for dialect docs) and include one example. + +--- + +Lilac is a side project, but it’s also a pretty honest answer to a real problem: +**translation isn’t a string function, it’s a conversation.** + +Peace nerds (: + From cd3779fe19c2a98ce5d2a563c3e5e8ff6feb1575 Mon Sep 17 00:00:00 2001 From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com> Date: Fri, 5 Dec 2025 12:44:17 -0500 Subject: [PATCH 2/4] Surface realtime blog post --- .../images}/realtime-flow.svg | 0 src/app/icon.tsx | 1 + .../posts}/working-with-oai-realtime-api.mdx | 100 +++++++++--------- src/ui/video/video.tsx | 12 +-- 4 files changed, 56 insertions(+), 57 deletions(-) rename {blogpost/assets => public/images}/realtime-flow.svg (100%) rename {blogpost => src/lib/posts}/working-with-oai-realtime-api.mdx (87%) diff --git a/blogpost/assets/realtime-flow.svg b/public/images/realtime-flow.svg similarity index 100% rename from blogpost/assets/realtime-flow.svg rename to public/images/realtime-flow.svg diff --git a/src/app/icon.tsx b/src/app/icon.tsx index f884e8a..d857430 100644 --- a/src/app/icon.tsx +++ b/src/app/icon.tsx @@ -11,6 +11,7 @@ export const size = { export default async function Icon() { return new ImageResponse( + Rubric icon r.json()) const clientSecret = secret.clientSecret as string - // 2) WebRTC peer + mic const peer = new RTCPeerConnection() const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) for (const track of stream.getTracks()) peer.addTrack(track, stream) - // 3) Data channel for events const events = peer.createDataChannel('oai-events') - events.addEventListener('message', (ev) => { - // server events arrive as JSON + events.addEventListener('message', ev => { try { const event = JSON.parse(ev.data) - // route into your typed event handler here console.log('realtime:event', event.type, event) - } catch { - // ignore - } + } catch {} }) - // 4) Create SDP offer const offer = await peer.createOffer() await peer.setLocalDescription(offer) - // 5) Send offer to OpenAI, get SDP answer back const form = new FormData() form.append('sdp', new Blob([offer.sdp ?? ''], { type: 'application/sdp' })) const answerResponse = await fetch('https://api.openai.com/v1/realtime/calls', { method: 'POST', headers: { Authorization: `Bearer ${clientSecret}` }, - body: form, + body: form }) if (!answerResponse.ok) throw new Error(await answerResponse.text()) @@ -230,15 +230,14 @@ function sessionUpdate(events: RTCDataChannel, patch: unknown) { events.send(JSON.stringify({ type: 'session.update', session: patch })) } -// Example: translation-first persona sessionUpdate(events, { type: 'realtime', instructions: [ 'You are a real-time conversational translator.', 'Keep the cadence and intent. Don’t over-explain.', 'Prefer Teochew. If uncertain, fall back to Mandarin.', - 'Return short turns. Match the speaker’s tone.', - ].join('\n'), + 'Return short turns. Match the speaker’s tone.' + ].join('\n') }) ``` @@ -255,22 +254,26 @@ A practical trick: keep a few “personas” on the client and flip between them In many setups, the server will create responses automatically when turn detection is enabled. But it’s still worth understanding the explicit control: ```ts -// simplest form: “please respond now” events.send(JSON.stringify({ type: 'response.create' })) -// out-of-band response (e.g. 
summarization) that doesn’t write to the main conversation -events.send(JSON.stringify({ - type: 'response.create', - response: { - conversation: 'none', - output_modalities: ['text'], - instructions: 'Summarize in one sentence.', - metadata: { purpose: 'summarization' }, - input: [ - { type: 'message', role: 'user', content: [{ type: 'input_text', text: 'Summarize what we said.' }] }, - ], - }, -})) +events.send( + JSON.stringify({ + type: 'response.create', + response: { + conversation: 'none', + output_modalities: ['text'], + instructions: 'Summarize in one sentence.', + metadata: { purpose: 'summarization' }, + input: [ + { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: 'Summarize what we said.' }] + } + ] + } + }) +) ``` This is the “you can do more than phone-call UX” moment: you can run side tasks next to the live conversation. @@ -343,4 +346,3 @@ Lilac is a side project, but it’s also a pretty honest answer to a real proble **translation isn’t a string function, it’s a conversation.** Peace nerds (: - diff --git a/src/ui/video/video.tsx b/src/ui/video/video.tsx index d26dabd..1664b04 100644 --- a/src/ui/video/video.tsx +++ b/src/ui/video/video.tsx @@ -211,24 +211,20 @@ export function Video({ hlsUrl, mp4Url, className = '', posterUrl, transcription {/* Clickable overlay for play/pause when in playing-with-sound mode */} {status === 'playing-with-sound' && ( -
e.key === 'Enter' && togglePlayPause()} - tabIndex={0} - role="button" aria-label="Play or pause video" /> )} {/* Clickable overlay for the entire video when not in playing-with-sound mode */} {showPlayButton && ( -
e.key === 'Enter' && playWithSound()} - tabIndex={0} - role="button" aria-label="Play with sound" /> )} From 562c7e3f0cb131c81480f4f5ff91c715ba50b868 Mon Sep 17 00:00:00 2001 From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:22:18 -0500 Subject: [PATCH 3/4] rewrite --- .../posts/working-with-oai-realtime-api.mdx | 751 +++++++++++------- 1 file changed, 477 insertions(+), 274 deletions(-) diff --git a/src/lib/posts/working-with-oai-realtime-api.mdx b/src/lib/posts/working-with-oai-realtime-api.mdx index e5f065f..13b4071 100644 --- a/src/lib/posts/working-with-oai-realtime-api.mdx +++ b/src/lib/posts/working-with-oai-realtime-api.mdx @@ -1,348 +1,551 @@ -import { AUTHORS, CATEGORIES } from '~/lib/constants/blog' - -export const metadata = { - title: 'Working with OAI Realtime API', - subtitle: 'Lilac — an experimental conversational translator + language playground', - date: '2025-03-10', - author: AUTHORS.DEXTER_STOREY, - bannerImageUrl: '/images/realtime-flow.svg', - category: CATEGORIES.AI, - description: - 'Hands-on guide to building Lilac’s Realtime translation workflow with client secrets, WebRTC, and session steering.' -} - -# Working with OAI Realtime API - -Lilac started as a side quest: **make conversational translation feel like an actual conversation**—not an awkward “speak… wait… read… repeat” loop. - -It’s experimental. It’s a little chaotic. It’s also *weirdly* good. - -## TL;DR - -- **Realtime is a transport**, not a chatbot UI: WebRTC (browser), WebSocket (servers), SIP (phone calls). -- You mint a short-lived **client secret** server-side, then use it in the browser to create a **WebRTC call**. -- After the SDP handshake, you drive everything with JSON **events** (session updates, responses, interruptions). -- Rubric scaffolding (create-rubric-app) keeps the whole thing **type-safe and shippable** without turning it into a Big Project™. - --- - -## Why we ditched “traditional translation” - -Google Translate is fine for menus. In conversation, it’s… not. - -What we kept running into: - -- **Back-channels get lost** (“mm”, “ah”, “wait wait”, “yeah yeah”) so turns feel cold. -- **Code-switching** (English + a dialect word + Mandarin filler) comes back mangled. -- **Long turns become summaries** when you wanted a faithful, *human* rendition. -- **Timing matters**: the pause before a sentence, the self-correction mid-phrase—those are meaning. - -Lilac’s bet is simple: if we can do *speech-to-speech* with low latency, the translation stops feeling like “input → output” and starts feeling like “talking”. - +title: Building Voice Agents with OpenAI's Realtime API +description: A production-focused guide to speech-to-speech AI — architecture, typed events, and the patterns that actually matter --- -## Three real-world workflows (how Lilac gets used) +# Building Voice Agents with OpenAI's Realtime API -### 1) Dexter + a friend (taking turns) -They pass the phone back and forth. Dexter speaks English, the model answers in Teochew (or Teochew-ish), and the vibe stays intact because the model is responding *as the other person*, not as a translator bot. +OpenAI's Realtime API is the first production-grade speech-to-speech interface that doesn't feel like you're talking to a pipeline. Unlike traditional voice AI (STT → LLM → TTS), the Realtime API processes audio natively — which means latency drops from seconds to hundreds of milliseconds, and nuance like tone, hesitation, and interruption flow through without getting flattened into text. 
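
To make that contrast concrete, here's a schematic sketch — the `transcribe`, `complete`, and `synthesize` stubs are hypothetical stand-ins, not a real SDK — of where a cascaded pipeline accumulates per-turn latency and how a Realtime session differs:

```typescript
// Hypothetical stand-ins for the STT / LLM / TTS services a cascaded pipeline would call.
const transcribe = async (_audio: Blob): Promise<string> => "hello";        // audio → text (tone, pauses, overlap are lost here)
const complete = async (text: string): Promise<string> => `reply to: ${text}`;
const synthesize = async (_text: string): Promise<Blob> => new Blob();      // text → synthetic voice

// Cascaded pipeline: three sequential round trips per turn, so latency is their sum.
async function pipelineTurn(userAudio: Blob): Promise<Blob> {
  const text = await transcribe(userAudio);
  const reply = await complete(text);
  return synthesize(reply);
}

// Realtime session: audio flows continuously over a WebRTC track while JSON events
// (transcripts, turn boundaries, tool calls) stream incrementally on a data channel,
// so the model can start answering a few hundred milliseconds after the user stops.
function attachRealtimeEvents(dataChannel: RTCDataChannel) {
  dataChannel.addEventListener("message", (messageEvent) => {
    const event = JSON.parse(messageEvent.data) as { type: string };
    console.log("realtime event:", event.type);
  });
}
```
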
-What matters here: -- low-latency turn handoff -- voice that doesn’t sound like it’s reading a caption +This guide covers what you actually need to know to ship something. -### 2) Ted, the self-learner -Ted uses Lilac like a private tutor: -- he speaks in a target language, -- asks for corrections, -- gets rephrases inline, -- keeps talking without “ending the session” to do grammar homework. +> **Context**: This research came out of building [Lilac](https://lilac.chat) — a voice-to-voice translation and language learning app. The thesis: translation isn't a string function, it's a conversation. Back-channels ("mm", "ah", "wait wait"), code-switching between languages, self-corrections mid-sentence — these carry meaning that traditional translate APIs discard. The Realtime API made it possible to build something that feels like talking, not transcribing. -What matters here: -- visible corrections (data channel events) -- consistent “teacher persona” prompts -- quick mid-session instruction tweaks +## The Mental Model -### 3) Dexter’s friend, custom dialect support (Teochew from a Mandarin base) -She uses **custom instructions** like a dialect adapter: -- start from Mandarin-ish defaults, -- add a doc describing pronunciation + word choices, -- steer output toward Teochew without rebuilding the app. +The Realtime API is **not** a REST endpoint. It's a stateful, bidirectional event protocol over WebRTC (browser) or WebSocket (server-to-server). Think of it as opening a phone call to GPT-4o where: -What matters here: -- instructions that live at the session boundary (so you can change them on the fly) -- “prompt as config” instead of “prompt baked into code” +1. You stream audio in +2. It streams audio back +3. Control happens through JSON events on a data channel +4. The server manages conversation state for you ---- +![Realtime API Architecture](./assets/realtime-architecture.svg) -## The Realtime mental model (what you’re actually building) +The server maintains conversation history, handles turn detection (when the user stops talking), and manages interruptions automatically. Your job is to: -Here’s the shape we keep in our heads: +1. Mint a short-lived credential +2. Establish the WebRTC connection +3. Configure the session +4. Handle events -![Realtime data flow](/images/realtime-flow.svg) +## Why WebRTC (and When to Use WebSocket) -1. **Server minting**: your server creates a short-lived client secret (do *not* ship your real API key). -2. **WebRTC handshake**: browser creates an SDP offer; OpenAI returns an SDP answer. -3. **Media + events**: audio flows as WebRTC media; control plane flows as JSON events in a data channel. -4. **Session steering**: you can update instructions, tools, turn detection, etc. mid-call. -5. **Interruptions are a feature**: you’ll want to cancel, truncate, and clear output audio. +**WebRTC for browsers/mobile apps**. It handles network jitter, packet loss, echo cancellation, and automatic gain control. TCP-based WebSockets will accumulate latency under real network conditions. ---- +**WebSocket for server-to-server**. When you're building a phone integration (via SIP) or need a backend to handle tool calls securely, WebSocket is appropriate since you control the network path. -## GA vs Beta (and why you should care) +**Sideband connections** let you do both: browser connects via WebRTC for audio, your server connects via WebSocket to the same session for monitoring and tool handling. -Realtime has had multiple “shapes” over time. 
Today there’s a **GA interface** and an older **beta interface**. - -Our advice: -- build against GA, -- keep a mental map of beta, because you’ll see older examples floating around. - -Lilac is small enough that swapping shapes is feasible, and Rubric’s schema-first boundaries make it harder to accidentally break the client when OpenAI shifts an object shape. - ---- +## Authentication: Ephemeral Tokens -## Using Realtime in practice: the small, correct happy path +Never ship your API key to the browser. Mint short-lived client secrets server-side: -### Step 1 — Server: create a client secret (typed) +```typescript +// server/api/realtime/session.ts +import { z } from "zod/v4"; -You want a server-only endpoint that: -- validates input with Zod, -- calls OpenAI to mint a short-lived client secret, -- returns only what the browser needs. - -```ts -// app/api/realtime/secret/route.ts -import { NextResponse } from 'next/server' -import { z } from 'zod' - -const CreateSecretInputSchema = z.object({ - model: z.string().default('gpt-realtime'), +const SessionConfigSchema = z.object({ + model: z.string().default("gpt-realtime-2025-08-28"), + voice: z.enum(["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "cedar", "marin"]).default("marin"), instructions: z.string().optional(), - voice: z.string().optional(), - ttlSeconds: z.number().int().min(30).max(60 * 30).default(60 * 10) -}) - -const ClientSecretResponseSchema = z.object({ - value: z.string(), - expires_at: z.number(), - session: z - .object({ - id: z.string(), - model: z.string(), - type: z.string() - }) - .passthrough() -}) - -export async function POST(request: Request) { - const input = CreateSecretInputSchema.parse(await request.json().catch(() => ({}))) - - const response = await fetch('https://api.openai.com/v1/realtime/client_secrets', { - method: 'POST', + ttlSeconds: z.number().int().min(30).max(1800).default(600), +}); + +const ClientSecretSchema = z.object({ + client_secret: z.object({ + value: z.string(), + expires_at: z.number(), + }), + session: z.object({ + id: z.string(), + model: z.string(), + }), +}); + +type SessionConfig = z.infer; +type ClientSecretResponse = z.infer; + +async function createRealtimeSession(config: SessionConfig): Promise { + const validatedConfig = SessionConfigSchema.parse(config); + + const response = await fetch("https://api.openai.com/v1/realtime/sessions", { + method: "POST", headers: { Authorization: `Bearer ${process.env.OPENAI_API_KEY}`, - 'Content-Type': 'application/json' + "Content-Type": "application/json", }, body: JSON.stringify({ - expires_after: { anchor: 'created_at', seconds: input.ttlSeconds }, - session: { - type: 'realtime', - model: input.model, - ...(input.instructions ? { instructions: input.instructions } : {}), - ...(input.voice ? 
{ audio: { output: { voice: input.voice } } } : {}) - } - }) - }) + model: validatedConfig.model, + voice: validatedConfig.voice, + instructions: validatedConfig.instructions, + input_audio_transcription: { model: "gpt-4o-transcribe" }, + turn_detection: { + type: "server_vad", + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + }), + }); if (!response.ok) { - return NextResponse.json({ error: await response.text() }, { status: 500 }) + throw new Error(`Session creation failed: ${await response.text()}`); } - const json = await response.json() - const parsed = ClientSecretResponseSchema.parse(json) - - return NextResponse.json({ - clientSecret: parsed.value, - expiresAt: parsed.expires_at, - session: parsed.session - }) + return ClientSecretSchema.parse(await response.json()); } ``` -Rubric pattern: **schemas at boundaries**, and the browser only gets the ephemeral secret. - ---- +The `cedar` and `marin` voices are exclusive to the Realtime API and have the most natural speech quality. -### Step 2 — Client: WebRTC handshake + data channel +## WebRTC Connection (Browser) -This is the part everyone overcomplicates. Keep it boring. +The handshake is three steps: create peer, exchange SDP, open data channel. -```ts -async function connectRealtime() { - const secret = await fetch('/api/realtime/secret', { method: 'POST' }).then(r => r.json()) - const clientSecret = secret.clientSecret as string +```typescript +// client/realtime.ts +interface RealtimeConnection { + peerConnection: RTCPeerConnection; + dataChannel: RTCDataChannel; + audioElement: HTMLAudioElement; + disconnect: () => void; +} - const peer = new RTCPeerConnection() - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) +async function connectToRealtime(clientSecret: string): Promise { + const peerConnection = new RTCPeerConnection(); + + // Capture microphone + const localStream = await navigator.mediaDevices.getUserMedia({ audio: true }); + for (const track of localStream.getTracks()) { + peerConnection.addTrack(track, localStream); + } + + // Set up playback for model audio + const audioElement = document.createElement("audio"); + audioElement.autoplay = true; + + peerConnection.ontrack = (event) => { + audioElement.srcObject = event.streams[0]; + }; + + // Create data channel for events (must be named "oai-events") + const dataChannel = peerConnection.createDataChannel("oai-events"); + + // Create and send SDP offer + const offer = await peerConnection.createOffer(); + await peerConnection.setLocalDescription(offer); + + const sdpResponse = await fetch("https://api.openai.com/v1/realtime?model=gpt-realtime-2025-08-28", { + method: "POST", + headers: { + Authorization: `Bearer ${clientSecret}`, + "Content-Type": "application/sdp", + }, + body: offer.sdp, + }); + + if (!sdpResponse.ok) { + throw new Error(`SDP exchange failed: ${await sdpResponse.text()}`); + } + + const answerSdp = await sdpResponse.text(); + await peerConnection.setRemoteDescription({ type: "answer", sdp: answerSdp }); + + function disconnect() { + localStream.getTracks().forEach(track => track.stop()); + peerConnection.close(); + } + + return { peerConnection, dataChannel, audioElement, disconnect }; +} +``` - for (const track of stream.getTracks()) peer.addTrack(track, stream) +Wait for `dataChannel.onopen` before sending events. + +## Typed Event System + +The Realtime API has 9 client events and 28+ server events. 
Type them properly: + +```typescript +// shared/realtime-events.ts +import { z } from "zod/v4"; + +// ───────────────────────────────────────────────────────────── +// Client Events (what you send) +// ───────────────────────────────────────────────────────────── + +const SessionUpdateEventSchema = z.object({ + type: z.literal("session.update"), + session: z.object({ + instructions: z.string().optional(), + voice: z.string().optional(), + input_audio_transcription: z.object({ model: z.string() }).optional(), + turn_detection: z.object({ + type: z.enum(["server_vad", "none"]), + threshold: z.number().optional(), + prefix_padding_ms: z.number().optional(), + silence_duration_ms: z.number().optional(), + create_response: z.boolean().optional(), + }).optional(), + tools: z.array(z.object({ + type: z.literal("function"), + name: z.string(), + description: z.string(), + parameters: z.record(z.unknown()), + })).optional(), + }), +}); + +const ResponseCreateEventSchema = z.object({ + type: z.literal("response.create"), + response: z.object({ + modalities: z.array(z.enum(["text", "audio"])).optional(), + instructions: z.string().optional(), + conversation: z.enum(["auto", "none"]).optional(), + }).optional(), +}); + +const ResponseCancelEventSchema = z.object({ + type: z.literal("response.cancel"), +}); + +const ConversationItemTruncateSchema = z.object({ + type: z.literal("conversation.item.truncate"), + item_id: z.string(), + content_index: z.number(), + audio_end_ms: z.number(), +}); + +const ClientEventSchema = z.discriminatedUnion("type", [ + SessionUpdateEventSchema, + ResponseCreateEventSchema, + ResponseCancelEventSchema, + ConversationItemTruncateSchema, +]); + +type ClientEvent = z.infer; + +// ───────────────────────────────────────────────────────────── +// Server Events (what you receive) +// ───────────────────────────────────────────────────────────── + +const ServerEventBaseSchema = z.object({ + event_id: z.string(), +}); + +const SessionCreatedEventSchema = ServerEventBaseSchema.extend({ + type: z.literal("session.created"), + session: z.object({ + id: z.string(), + model: z.string(), + expires_at: z.number(), + }).passthrough(), +}); + +const ResponseCreatedEventSchema = ServerEventBaseSchema.extend({ + type: z.literal("response.created"), + response: z.object({ + id: z.string(), + status: z.enum(["in_progress", "completed", "cancelled", "failed", "incomplete"]), + }).passthrough(), +}); + +const ResponseDoneEventSchema = ServerEventBaseSchema.extend({ + type: z.literal("response.done"), + response: z.object({ + id: z.string(), + status: z.string(), + output: z.array(z.object({ + type: z.string(), + id: z.string(), + }).passthrough()).optional(), + }).passthrough(), +}); + +const InputAudioBufferSpeechStartedSchema = ServerEventBaseSchema.extend({ + type: z.literal("input_audio_buffer.speech_started"), + audio_start_ms: z.number(), +}); + +const InputAudioBufferSpeechStoppedSchema = ServerEventBaseSchema.extend({ + type: z.literal("input_audio_buffer.speech_stopped"), + audio_end_ms: z.number(), +}); + +const ConversationItemInputAudioTranscriptionCompletedSchema = ServerEventBaseSchema.extend({ + type: z.literal("conversation.item.input_audio_transcription.completed"), + item_id: z.string(), + transcript: z.string(), +}); + +const ResponseAudioTranscriptDeltaSchema = ServerEventBaseSchema.extend({ + type: z.literal("response.audio_transcript.delta"), + response_id: z.string(), + delta: z.string(), +}); + +const ResponseFunctionCallArgumentsDoneSchema = 
ServerEventBaseSchema.extend({ + type: z.literal("response.function_call_arguments.done"), + call_id: z.string(), + name: z.string(), + arguments: z.string(), +}); + +const ErrorEventSchema = ServerEventBaseSchema.extend({ + type: z.literal("error"), + error: z.object({ + type: z.string(), + code: z.string().optional(), + message: z.string(), + }), +}); + +const ServerEventSchema = z.discriminatedUnion("type", [ + SessionCreatedEventSchema, + ResponseCreatedEventSchema, + ResponseDoneEventSchema, + InputAudioBufferSpeechStartedSchema, + InputAudioBufferSpeechStoppedSchema, + ConversationItemInputAudioTranscriptionCompletedSchema, + ResponseAudioTranscriptDeltaSchema, + ResponseFunctionCallArgumentsDoneSchema, + ErrorEventSchema, +]).passthrough(); // Allow unknown event types to pass through + +type ServerEvent = z.infer; + +export { + ClientEventSchema, + ServerEventSchema, + type ClientEvent, + type ServerEvent +}; +``` - const events = peer.createDataChannel('oai-events') +## Event Handler Pattern - events.addEventListener('message', ev => { - try { - const event = JSON.parse(ev.data) - console.log('realtime:event', event.type, event) - } catch {} - }) +Wrap the data channel in a typed interface: - const offer = await peer.createOffer() - await peer.setLocalDescription(offer) +```typescript +// client/realtime-channel.ts +import { ClientEvent, ServerEvent, ServerEventSchema } from "./realtime-events"; - const form = new FormData() - form.append('sdp', new Blob([offer.sdp ?? ''], { type: 'application/sdp' })) +type ServerEventHandler = (event: ServerEvent) => void; - const answerResponse = await fetch('https://api.openai.com/v1/realtime/calls', { - method: 'POST', - headers: { Authorization: `Bearer ${clientSecret}` }, - body: form - }) +interface RealtimeChannel { + send: (event: ClientEvent) => void; + onEvent: (handler: ServerEventHandler) => void; + onError: (handler: (error: Error) => void) => void; +} - if (!answerResponse.ok) throw new Error(await answerResponse.text()) +function createRealtimeChannel(dataChannel: RTCDataChannel): RealtimeChannel { + const eventHandlers: ServerEventHandler[] = []; + const errorHandlers: ((error: Error) => void)[] = []; - const answerSdp = await answerResponse.text() - await peer.setRemoteDescription({ type: 'answer', sdp: answerSdp }) + dataChannel.onmessage = (messageEvent) => { + try { + const rawEvent = JSON.parse(messageEvent.data); + const parsedEvent = ServerEventSchema.parse(rawEvent); + + for (const handler of eventHandlers) { + handler(parsedEvent); + } + } catch (error) { + for (const handler of errorHandlers) { + handler(error instanceof Error ? error : new Error(String(error))); + } + } + }; - return { peer, events } + return { + send(event: ClientEvent) { + dataChannel.send(JSON.stringify(event)); + }, + onEvent(handler: ServerEventHandler) { + eventHandlers.push(handler); + }, + onError(handler: (error: Error) => void) { + errorHandlers.push(handler); + }, + }; } ``` -Once `setRemoteDescription` succeeds, you’re “in the call”: audio track in, audio track out, plus an event stream in the data channel. - ---- - -## Session steering: “custom instructions” is just session.update - -Lilac’s custom instructions UI maps directly to one idea: -**change the session config without restarting the call**. 
- -```ts -function sessionUpdate(events: RTCDataChannel, patch: unknown) { - events.send(JSON.stringify({ type: 'session.update', session: patch })) +## Session Configuration + +Configure the session after connection: + +```typescript +function configureSession(channel: RealtimeChannel, instructions: string) { + channel.send({ + type: "session.update", + session: { + instructions, + voice: "marin", + input_audio_transcription: { model: "gpt-4o-transcribe" }, + turn_detection: { + type: "server_vad", + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, // 500ms works for most cases + create_response: true, // Auto-trigger response when user stops + }, + }, + }); } - -sessionUpdate(events, { - type: 'realtime', - instructions: [ - 'You are a real-time conversational translator.', - 'Keep the cadence and intent. Don’t over-explain.', - 'Prefer Teochew. If uncertain, fall back to Mandarin.', - 'Return short turns. Match the speaker’s tone.' - ].join('\n') -}) ``` -A practical trick: keep a few “personas” on the client and flip between them mid-session: +`silence_duration_ms` controls how long the system waits after speech stops before triggering a response. 500ms is a good default. For interview or educational contexts, 800–1000ms gives users more time to think. -* translation-first -* teacher-first -* roleplay / immersion mode +## Function Calling ---- +Tools are defined in the session config and invoked via events: -## Turning model inference on (response.create) - -In many setups, the server will create responses automatically when turn detection is enabled. But it’s still worth understanding the explicit control: - -```ts -events.send(JSON.stringify({ type: 'response.create' })) - -events.send( - JSON.stringify({ - type: 'response.create', - response: { - conversation: 'none', - output_modalities: ['text'], - instructions: 'Summarize in one sentence.', - metadata: { purpose: 'summarization' }, - input: [ - { - type: 'message', - role: 'user', - content: [{ type: 'input_text', text: 'Summarize what we said.' }] - } - ] - } - }) -) +```typescript +const tools = [ + { + type: "function" as const, + name: "get_weather", + description: "Get current weather for a location", + parameters: { + type: "object", + properties: { + location: { type: "string", description: "City and state, e.g. San Francisco, CA" }, + }, + required: ["location"], + }, + }, +]; + +// Configure session with tools +channel.send({ + type: "session.update", + session: { tools }, +}); + +// Handle function calls +channel.onEvent((event) => { + if (event.type === "response.function_call_arguments.done") { + const args = JSON.parse(event.arguments); + + // Execute the function + const result = await executeFunction(event.name, args); + + // Send result back + channel.send({ + type: "conversation.item.create", + item: { + type: "function_call_output", + call_id: event.call_id, + output: JSON.stringify(result), + }, + }); + + // Trigger model to continue + channel.send({ type: "response.create" }); + } +}); ``` -This is the “you can do more than phone-call UX” moment: you can run side tasks next to the live conversation. - ---- - -## Interruptions: cancelling, truncating, and cutting off audio - -Real speech is messy: people interrupt, laugh, restart, bail mid-sentence. 
- -A few “must-have” controls: - -* `response.cancel` — stop the model response -* `conversation.item.truncate` — tell the server what audio you actually played (important when the user interrupts) -* `output_audio_buffer.clear` — **WebRTC-only** cutoff for audio output - -That’s how you get “natural” barge-in behavior instead of two voices yelling over each other. - ---- +The GA model supports **async function calling** — the model can continue speaking while waiting for function results instead of blocking. + +## Handling Interruptions + +When users interrupt, you need to: + +1. Cancel the current response +2. Truncate the conversation to what was actually heard +3. Clear the audio buffer + +```typescript +function handleInterruption( + channel: RealtimeChannel, + currentItemId: string, + audioPlayedMs: number +) { + // Stop generation + channel.send({ type: "response.cancel" }); + + // Sync server context with what user actually heard + channel.send({ + type: "conversation.item.truncate", + item_id: currentItemId, + content_index: 0, + audio_end_ms: audioPlayedMs, + }); + + // Clear any buffered audio (WebRTC only) + channel.send({ type: "output_audio_buffer.clear" }); +} +``` -## How Rubric infra makes this fast (and boring) +This is critical for natural conversation. The model generates audio faster than realtime playback, so without truncation, the conversation context includes text the user never heard. -Lilac is built on create-rubric-app energy: +## Context and Token Management -* opinionated app scaffolding (Next.js + TS + Biome + Zod) -* schemas everywhere, so your client doesn’t guess shapes -* event streaming patterns that don’t degrade into “JSON.parse() spaghetti” -* quick deploy defaults, so a weekend prototype can become “send a link” +Key limits: -The point is not that Lilac is “enterprise ready.” -The point is that your experiments can still have **good bones**. +| Constraint | Value | +|------------|-------| +| Context window | 32,768 tokens | +| Max response tokens | 4,096 tokens | +| Max instructions + tools | 16,384 tokens | +| Session duration | 60 minutes | +| Audio token rate | ~800 tokens/minute | ---- +When context fills up, the API automatically truncates (drops) oldest messages. Configure this behavior: -## How we used Codex (the teammate who never sleeps) +```typescript +channel.send({ + type: "session.update", + session: { + truncation: { + type: "retention_ratio", + retention_ratio: 0.8, // Drop 20% when truncating (better for cache hits) + }, + }, +}); +``` -We leaned on Codex in a very specific way: +Setting `retention_ratio: 0.8` means when truncation triggers, it drops more than the minimum needed. This preserves prompt caching (which requires identical prefixes) better than truncating one message at a time. -1. **Bootstrap**: generated the app with `create-rubric-app`. -2. **Deploy**: used the same Rubric workflow to bring up infra fast (so we could test on real phones, not just localhost). -3. **Codex environment**: configured Codex with +## Cost Model - * the repo commands to install/run/test, - * the environment variables it needs to actually start the app, - * custom instructions describing Lilac’s architecture and “don’t touch the secret key boundaries”. +Audio tokens are priced differently than text: -The best part wasn’t “Codex wrote code for us.” -It was that we could say: “stand this up, reproduce the bug, and fix it *without changing the shape of the types*.” -That’s a very Rubric way to move. 
+| Type | Price per 1M tokens | ~Per minute | +|------|--------------------:|------------:| +| Audio input | $40 (cached: $2.50) | ~$0.03 | +| Audio output | $80 | ~$0.06 | +| Text input | $2.50 | — | +| Text output | $10 | — | ---- +For a 10-minute conversation with ~70% talk time, expect roughly $2–3. The automatic context caching helps significantly for longer sessions. -## Presentation-ready nits (if you want Lilac to read like a real OSS project) +## Production Checklist -If you’re polishing the repo, these changes are high leverage: +1. **Never expose your API key** — mint ephemeral client secrets +2. **Handle reconnection** — sessions can drop; implement exponential backoff +3. **Track audio playback position** — needed for accurate truncation on interrupts +4. **Log events** — the `event_id` field helps correlate issues +5. **Set idle timeouts** — use `idle_timeout_ms` in VAD config to handle silent users +6. **Test with network throttling** — WebRTC handles jitter well, but test your UI +7. **Implement mute/unmute** — VAD can trigger on background noise +8. **Show connection state** — users need feedback when connecting -* Add a README section called **“Realtime API shape (GA vs beta)”** with a one-paragraph migration note. -* Centralize the Realtime event types into a single `realtimeEvents.ts` with a Zod discriminated union. -* Wrap the data channel in a tiny typed adapter: `sendEvent(event)` + `onEvent(handler)`. -* Add a tiny “debug overlay” showing: +## When Not to Use Realtime - * connection state - * last server event type - * current persona / instructions hash -* Document the “custom instructions” format (especially for dialect docs) and include one example. +The Realtime API is optimized for low-latency conversation. Consider alternatives when: ---- +- **You need deterministic output** — temperature is fixed at 0.8, no way to reduce variance +- **Latency doesn't matter** — Chat Completions API with audio is cheaper for async use cases +- **You need long context** — 32k tokens is the ceiling; for long documents, use text models +- **You're doing batch processing** — Realtime is priced for interactive use -Lilac is a side project, but it’s also a pretty honest answer to a real problem: -**translation isn’t a string function, it’s a conversation.** +## Further Reading -Peace nerds (: +- [OpenAI Realtime API Docs](https://platform.openai.com/docs/guides/realtime) +- [OpenAI Realtime API Reference](https://platform.openai.com/docs/api-reference/realtime) +- [Realtime Console (Reference Implementation)](https://github.com/openai/openai-realtime-console) +- [Pipecat (Open Source Voice AI Framework)](https://github.com/pipecat-ai/pipecat) From 1a97f0d22178a6554882174dcbd24a6e88452ac4 Mon Sep 17 00:00:00 2001 From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:23:23 -0500 Subject: [PATCH 4/4] rewrite --- public/images/realtime-architecture.svg | 114 ++++++++++++++++++++++++ public/images/realtime-flow.svg | 75 ---------------- 2 files changed, 114 insertions(+), 75 deletions(-) create mode 100644 public/images/realtime-architecture.svg delete mode 100644 public/images/realtime-flow.svg diff --git a/public/images/realtime-architecture.svg b/public/images/realtime-architecture.svg new file mode 100644 index 0000000..e8d6bf5 --- /dev/null +++ b/public/images/realtime-architecture.svg @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + Your Server + Holds API key + Mints client secrets + + + + Browser Client + RTCPeerConnection + 
Microphone track + DataChannel + "oai-events" + + + + OpenAI Realtime + gpt-realtime model + Session state + VAD / turn detection + Conversation context + Tool execution + + + + Legend + + Audio (WebRTC) + + Events (JSON) + + HTTP + + + + + client_secret + + + + SDP offer + + + SDP answer + + + + mic audio + + + model audio + + + + session.update + response.create + + + response.done + + + + POST /sessions + + + + Sideband (optional) + Server → same session + via WebSocket + For secure tool handling + + + + + + 1 + + + 2 + + + 3 + + + 1. Server mints ephemeral token 2. WebRTC handshake (SDP exchange) 3. Bidirectional audio + events + diff --git a/public/images/realtime-flow.svg b/public/images/realtime-flow.svg deleted file mode 100644 index 815956c..0000000 --- a/public/images/realtime-flow.svg +++ /dev/null @@ -1,75 +0,0 @@ - - - - - - - - - - - - - - - - Browser client - • Mic → WebRTC audio track - • DataChannel: JSON events - • UI: custom instructions - • Handles interruptions - - - RTCPeerConnection - - - RTCDataChannel: oai-events - - - Rubric server - • Holds real API key - • Mints client secrets - • Zod-validated boundary - • Optional tools/business logic - - - POST /v1/realtime/client_secrets - - - OpenAI Realtime API - • WebRTC call creation - • Speech-to-speech model - • Server events stream - • Turn detection / VAD - - - POST /v1/realtime/calls (SDP) - - - Events: session.update / response.* - - - - fetch secret - - - mint - - - SDP offer → answer - - - audio + events - - - - Tip: keep the client secret ephemeral; keep tools/business logic on the server; drive behaviors with session.update. - -