From fc5c3d6aefabff530bc5a4116d3b16c749e91f4a Mon Sep 17 00:00:00 2001
From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com>
Date: Fri, 5 Dec 2025 12:30:06 -0500
Subject: [PATCH 1/4] Add Working with OAI Realtime API blogpost
---
blogpost/assets/realtime-flow.svg | 75 +++++
blogpost/working-with-oai-realtime-api.mdx | 346 +++++++++++++++++++++
2 files changed, 421 insertions(+)
create mode 100644 blogpost/assets/realtime-flow.svg
create mode 100644 blogpost/working-with-oai-realtime-api.mdx
diff --git a/blogpost/assets/realtime-flow.svg b/blogpost/assets/realtime-flow.svg
new file mode 100644
index 0000000..815956c
--- /dev/null
+++ b/blogpost/assets/realtime-flow.svg
@@ -0,0 +1,75 @@
+
+
diff --git a/blogpost/working-with-oai-realtime-api.mdx b/blogpost/working-with-oai-realtime-api.mdx
new file mode 100644
index 0000000..82a661e
--- /dev/null
+++ b/blogpost/working-with-oai-realtime-api.mdx
@@ -0,0 +1,346 @@
+---
+title: Working with OAI Realtime API
+description: Lilac — an experimental conversational translator + language playground
+---
+
+# Working with OAI Realtime API
+
+Lilac started as a side quest: **make conversational translation feel like an actual conversation**—not an awkward “speak… wait… read… repeat” loop.
+
+It’s experimental. It’s a little chaotic. It’s also *weirdly* good.
+
+## TL;DR
+
+- **Realtime is a transport**, not a chatbot UI: WebRTC (browser), WebSocket (servers), SIP (phone calls).
+- You mint a short-lived **client secret** server-side, then use it in the browser to create a **WebRTC call**.
+- After the SDP handshake, you drive everything with JSON **events** (session updates, responses, interruptions).
+- Rubric scaffolding (create-rubric-app) keeps the whole thing **type-safe and shippable** without turning it into a Big Project™.
+
+---
+
+## Why we ditched “traditional translation”
+
+Google Translate is fine for menus. In conversation, it’s… not.
+
+What we kept running into:
+
+- **Back-channels get lost** (“mm”, “ah”, “wait wait”, “yeah yeah”) so turns feel cold.
+- **Code-switching** (English + a dialect word + Mandarin filler) comes back mangled.
+- **Long turns become summaries** when you wanted a faithful, *human* rendition.
+- **Timing matters**: the pause before a sentence, the self-correction mid-phrase—those are meaning.
+
+Lilac’s bet is simple: if we can do *speech-to-speech* with low latency, the translation stops feeling like “input → output” and starts feeling like “talking”.
+
+---
+
+## Three real-world workflows (how Lilac gets used)
+
+### 1) Dexter + a friend (taking turns)
+They pass the phone back and forth. Dexter speaks English, the model answers in Teochew (or Teochew-ish), and the vibe stays intact because the model is responding *as the other person*, not as a translator bot.
+
+What matters here:
+- low-latency turn handoff
+- voice that doesn’t sound like it’s reading a caption
+
+### 2) Ted, the self-learner
+Ted uses Lilac like a private tutor:
+- he speaks in a target language,
+- asks for corrections,
+- gets rephrases inline,
+- keeps talking without “ending the session” to do grammar homework.
+
+What matters here:
+- visible corrections (data channel events)
+- consistent “teacher persona” prompts
+- quick mid-session instruction tweaks
+
+### 3) Dexter’s friend, custom dialect support (Teochew from a Mandarin base)
+She uses **custom instructions** like a dialect adapter:
+- start from Mandarin-ish defaults,
+- add a doc describing pronunciation + word choices,
+- steer output toward Teochew without rebuilding the app.
+
+What matters here:
+- instructions that live at the session boundary (so you can change them on the fly)
+- “prompt as config” instead of “prompt baked into code”
+
+---
+
+## The Realtime mental model (what you’re actually building)
+
+Here’s the shape we keep in our heads:
+
+![Realtime flow: the server mints a client secret, the browser does the SDP handshake, then audio and JSON events flow over the call](./assets/realtime-flow.svg)
+
+1. **Server minting**: your server creates a short-lived client secret (do *not* ship your real API key).
+2. **WebRTC handshake**: browser creates an SDP offer; OpenAI returns an SDP answer.
+3. **Media + events**: audio flows as WebRTC media; control plane flows as JSON events in a data channel.
+4. **Session steering**: you can update instructions, tools, turn detection, etc. mid-call.
+5. **Interruptions are a feature**: you’ll want to cancel, truncate, and clear output audio.
+
+---
+
+## GA vs Beta (and why you should care)
+
+Realtime has had multiple “shapes” over time. Today there’s a **GA interface** and an older **beta interface**.
+
+Our advice:
+- build against GA,
+- keep a mental map of beta, because you’ll see older examples floating around.
+
+Lilac is small enough that swapping shapes is feasible, and Rubric’s schema-first boundaries make it harder to accidentally break the client when OpenAI shifts an object shape.
+
+---
+
+## Using Realtime in practice: the small, correct happy path
+
+### Step 1 — Server: create a client secret (typed)
+
+You want a server-only endpoint that:
+- validates input with Zod,
+- calls OpenAI to mint a short-lived client secret,
+- returns only what the browser needs.
+
+```ts
+// app/api/realtime/secret/route.ts
+import { NextResponse } from 'next/server'
+import { z } from 'zod'
+
+const CreateSecretInputSchema = z.object({
+ model: z.string().default('gpt-realtime'),
+ instructions: z.string().optional(),
+ voice: z.string().optional(),
+ // keep this short: it’s a browser token, not your auth system
+ ttlSeconds: z.number().int().min(30).max(60 * 30).default(60 * 10),
+})
+
+const ClientSecretResponseSchema = z.object({
+ value: z.string(), // looks like ek_...
+ expires_at: z.number(), // epoch seconds
+ session: z.object({
+ id: z.string(),
+ model: z.string(),
+ type: z.string(),
+ }).passthrough(),
+})
+
+export async function POST(request: Request) {
+ const input = CreateSecretInputSchema.parse(await request.json().catch(() => ({})))
+
+ const response = await fetch('https://api.openai.com/v1/realtime/client_secrets', {
+ method: 'POST',
+ headers: {
+ Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ expires_after: { anchor: 'created_at', seconds: input.ttlSeconds },
+ session: {
+ type: 'realtime',
+ model: input.model,
+ ...(input.instructions ? { instructions: input.instructions } : {}),
+ ...(input.voice ? { audio: { output: { voice: input.voice } } } : {}),
+ },
+ }),
+ })
+
+ if (!response.ok) {
+ return NextResponse.json({ error: await response.text() }, { status: 500 })
+ }
+
+ const json = await response.json()
+ const parsed = ClientSecretResponseSchema.parse(json)
+
+ return NextResponse.json({
+ clientSecret: parsed.value,
+ expiresAt: parsed.expires_at,
+ session: parsed.session,
+ })
+}
+```
+
+Rubric pattern: **schemas at boundaries**, and the browser only gets the ephemeral secret.
+
+---
+
+### Step 2 — Client: WebRTC handshake + data channel
+
+This is the part everyone overcomplicates. Keep it boring.
+
+```ts
+async function connectRealtime() {
+ // 1) Ask our server for a short-lived secret
+ const secret = await fetch('/api/realtime/secret', { method: 'POST' }).then(r => r.json())
+ const clientSecret = secret.clientSecret as string
+
+  // 2) WebRTC peer + mic
+  const peer = new RTCPeerConnection()
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+
+  for (const track of stream.getTracks()) peer.addTrack(track, stream)
+
+  // Play the model's audio when the remote track shows up (otherwise you hear nothing)
+  const audioElement = document.createElement('audio')
+  audioElement.autoplay = true
+  peer.ontrack = (event) => {
+    audioElement.srcObject = event.streams[0]
+  }
+
+  // 3) Data channel for events
+  const events = peer.createDataChannel('oai-events')
+
+ events.addEventListener('message', (ev) => {
+ // server events arrive as JSON
+ try {
+ const event = JSON.parse(ev.data)
+ // route into your typed event handler here
+ console.log('realtime:event', event.type, event)
+ } catch {
+ // ignore
+ }
+ })
+
+ // 4) Create SDP offer
+ const offer = await peer.createOffer()
+ await peer.setLocalDescription(offer)
+
+ // 5) Send offer to OpenAI, get SDP answer back
+ const form = new FormData()
+ form.append('sdp', new Blob([offer.sdp ?? ''], { type: 'application/sdp' }))
+
+ const answerResponse = await fetch('https://api.openai.com/v1/realtime/calls', {
+ method: 'POST',
+ headers: { Authorization: `Bearer ${clientSecret}` },
+ body: form,
+ })
+
+ if (!answerResponse.ok) throw new Error(await answerResponse.text())
+
+ const answerSdp = await answerResponse.text()
+ await peer.setRemoteDescription({ type: 'answer', sdp: answerSdp })
+
+ return { peer, events }
+}
+```
+
+Once `setRemoteDescription` succeeds, you’re “in the call”: audio track in, audio track out, plus an event stream in the data channel.
+
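+One caveat before you start steering the session: the data channel isn’t writable the instant `connectRealtime()` resolves. A minimal usage sketch that waits for it (plain WebRTC, nothing Realtime-specific):
+
+```ts
+const { peer, events } = await connectRealtime()
+
+// The data channel opens shortly after the SDP answer is applied;
+// don't send events before this fires.
+events.addEventListener('open', () => {
+  console.log('realtime: data channel open')
+})
+
+peer.addEventListener('connectionstatechange', () => {
+  console.log('realtime: connection state is', peer.connectionState)
+})
+```
+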
+---
+
+## Session steering: “custom instructions” is just session.update
+
+Lilac’s custom instructions UI maps directly to one idea:
+**change the session config without restarting the call**.
+
+```ts
+function sessionUpdate(events: RTCDataChannel, patch: unknown) {
+ events.send(JSON.stringify({ type: 'session.update', session: patch }))
+}
+
+// Example: translation-first persona
+sessionUpdate(events, {
+ type: 'realtime',
+ instructions: [
+ 'You are a real-time conversational translator.',
+ 'Keep the cadence and intent. Don’t over-explain.',
+ 'Prefer Teochew. If uncertain, fall back to Mandarin.',
+ 'Return short turns. Match the speaker’s tone.',
+ ].join('\n'),
+})
+```
+
+A practical trick: keep a few “personas” on the client and flip between them mid-session (sketch after the list):
+
+* translation-first
+* teacher-first
+* roleplay / immersion mode
+
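+A minimal version of that persona registry (the persona names and wording here are Lilac-flavored placeholders, not anything the API requires):
+
+```ts
+const personas = {
+  translator:
+    'You are a real-time conversational translator. Keep the cadence and intent. Prefer Teochew; fall back to Mandarin when unsure.',
+  teacher:
+    'You are a patient language tutor. Correct gently, rephrase inline, and keep the conversation moving.',
+  immersion:
+    'Stay fully in the target language. No English unless the speaker is clearly stuck.',
+} as const
+
+function setPersona(events: RTCDataChannel, persona: keyof typeof personas) {
+  sessionUpdate(events, { type: 'realtime', instructions: personas[persona] })
+}
+
+// e.g. flip to teacher mode when someone asks for corrections
+setPersona(events, 'teacher')
+```
+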
+---
+
+## Turning model inference on (response.create)
+
+In many setups, the server will create responses automatically when turn detection is enabled. But it’s still worth understanding the explicit control:
+
+```ts
+// simplest form: “please respond now”
+events.send(JSON.stringify({ type: 'response.create' }))
+
+// out-of-band response (e.g. summarization) that doesn’t write to the main conversation
+events.send(JSON.stringify({
+ type: 'response.create',
+ response: {
+ conversation: 'none',
+ output_modalities: ['text'],
+ instructions: 'Summarize in one sentence.',
+ metadata: { purpose: 'summarization' },
+ input: [
+ { type: 'message', role: 'user', content: [{ type: 'input_text', text: 'Summarize what we said.' }] },
+ ],
+ },
+}))
+```
+
+This is the “you can do more than phone-call UX” moment: you can run side tasks next to the live conversation.
+
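+The `metadata` you attach is how you find the side task’s result later: it comes back on the response events. A sketch of routing on it (standard `response.done` server event; field access is defensive because shapes vary by response type):
+
+```ts
+events.addEventListener('message', (ev) => {
+  const event = JSON.parse(ev.data)
+
+  // Out-of-band responses still emit response.* events; the metadata we set
+  // in response.create is echoed back, so we can route on it.
+  if (event.type === 'response.done' && event.response?.metadata?.purpose === 'summarization') {
+    console.log('side-task result:', event.response.output)
+  }
+})
+```
+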
+---
+
+## Interruptions: cancelling, truncating, and cutting off audio
+
+Real speech is messy: people interrupt, laugh, restart, bail mid-sentence.
+
+A few “must-have” controls:
+
+* `response.cancel` — stop the model response
+* `conversation.item.truncate` — tell the server what audio you actually played (important when the user interrupts)
+* `output_audio_buffer.clear` — **WebRTC-only** cutoff for audio output
+
+That’s how you get “natural” barge-in behavior instead of two voices yelling over each other.
+
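+Wired together, a barge-in handler ends up looking roughly like this. The bookkeeping (which assistant item is playing, how many milliseconds you’ve played) is yours to track; this sketch fakes it with two variables and treats `input_audio_buffer.speech_started` as the interrupt signal:
+
+```ts
+let playingItemId: string | null = null
+let playbackStartedAt = 0
+
+events.addEventListener('message', (ev) => {
+  const event = JSON.parse(ev.data)
+
+  // Remember which assistant item is (about to be) playing
+  if (event.type === 'response.output_item.added') {
+    playingItemId = event.item.id
+    playbackStartedAt = Date.now()
+  }
+
+  // The user started talking over the model: cancel, truncate, cut the audio
+  if (event.type === 'input_audio_buffer.speech_started' && playingItemId) {
+    events.send(JSON.stringify({ type: 'response.cancel' }))
+    events.send(JSON.stringify({
+      type: 'conversation.item.truncate',
+      item_id: playingItemId,
+      content_index: 0,
+      audio_end_ms: Date.now() - playbackStartedAt,
+    }))
+    events.send(JSON.stringify({ type: 'output_audio_buffer.clear' }))
+    playingItemId = null
+  }
+})
+```
+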
+---
+
+## How Rubric infra makes this fast (and boring)
+
+Lilac is built on create-rubric-app energy:
+
+* opinionated app scaffolding (Next.js + TS + Biome + Zod)
+* schemas everywhere, so your client doesn’t guess shapes
+* event streaming patterns that don’t degrade into “JSON.parse() spaghetti”
+* quick deploy defaults, so a weekend prototype can become “send a link”
+
+The point is not that Lilac is “enterprise ready.”
+The point is that your experiments can still have **good bones**.
+
+---
+
+## How we used Codex (the teammate who never sleeps)
+
+We leaned on Codex in a very specific way:
+
+1. **Bootstrap**: generated the app with `create-rubric-app`.
+2. **Deploy**: used the same Rubric workflow to bring up infra fast (so we could test on real phones, not just localhost).
+3. **Codex environment**: configured Codex with
+
+ * the repo commands to install/run/test,
+ * the environment variables it needs to actually start the app,
+ * custom instructions describing Lilac’s architecture and “don’t touch the secret key boundaries”.
+
+The best part wasn’t “Codex wrote code for us.”
+It was that we could say: “stand this up, reproduce the bug, and fix it *without changing the shape of the types*.”
+That’s a very Rubric way to move.
+
+---
+
+## Presentation-ready nits (if you want Lilac to read like a real OSS project)
+
+If you’re polishing the repo, these changes are high leverage:
+
+* Add a README section called **“Realtime API shape (GA vs beta)”** with a one-paragraph migration note.
+* Centralize the Realtime event types into a single `realtimeEvents.ts` with a Zod discriminated union.
+* Wrap the data channel in a tiny typed adapter: `sendEvent(event)` + `onEvent(handler)` (see the sketch after this list).
+* Add a tiny “debug overlay” showing:
+
+ * connection state
+ * last server event type
+ * current persona / instructions hash
+* Document the “custom instructions” format (especially for dialect docs) and include one example.
+
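+If it helps, here’s that adapter in miniature. The events in the union are just the handful this post touches; a real `realtimeEvents.ts` would enumerate more:
+
+```ts
+import { z } from 'zod'
+
+// realtimeEvents.ts: a deliberately tiny slice of the server events we care about
+const ServerEventSchema = z.discriminatedUnion('type', [
+  z.object({ type: z.literal('session.updated') }).passthrough(),
+  z.object({ type: z.literal('response.done') }).passthrough(),
+  z.object({ type: z.literal('input_audio_buffer.speech_started') }).passthrough(),
+  z.object({ type: z.literal('error'), error: z.object({ message: z.string() }).passthrough() }),
+])
+type ServerEvent = z.infer<typeof ServerEventSchema>
+
+export function createRealtimeChannel(events: RTCDataChannel) {
+  return {
+    sendEvent(event: { type: string } & Record<string, unknown>) {
+      events.send(JSON.stringify(event))
+    },
+    onEvent(handler: (event: ServerEvent) => void) {
+      events.addEventListener('message', (ev) => {
+        const parsed = ServerEventSchema.safeParse(JSON.parse(ev.data))
+        if (parsed.success) handler(parsed.data)
+        // unknown event types are dropped here; log them when debugging
+      })
+    },
+  }
+}
+```
+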
+---
+
+Lilac is a side project, but it’s also a pretty honest answer to a real problem:
+**translation isn’t a string function, it’s a conversation.**
+
+Peace nerds (:
+
From cd3779fe19c2a98ce5d2a563c3e5e8ff6feb1575 Mon Sep 17 00:00:00 2001
From: Dexter Storey <36115192+DexterStorey@users.noreply.github.com>
Date: Fri, 5 Dec 2025 12:44:17 -0500
Subject: [PATCH 2/4] Surface realtime blog post
---
.../images}/realtime-flow.svg | 0
src/app/icon.tsx | 1 +
.../posts}/working-with-oai-realtime-api.mdx | 100 +++++++++---------
src/ui/video/video.tsx | 12 +--
4 files changed, 56 insertions(+), 57 deletions(-)
rename {blogpost/assets => public/images}/realtime-flow.svg (100%)
rename {blogpost => src/lib/posts}/working-with-oai-realtime-api.mdx (87%)
diff --git a/blogpost/assets/realtime-flow.svg b/public/images/realtime-flow.svg
similarity index 100%
rename from blogpost/assets/realtime-flow.svg
rename to public/images/realtime-flow.svg
diff --git a/src/app/icon.tsx b/src/app/icon.tsx
index f884e8a..d857430 100644
--- a/src/app/icon.tsx
+++ b/src/app/icon.tsx
@@ -11,6 +11,7 @@ export const size = {
export default async function Icon() {
return new ImageResponse(