diff --git a/public/images/realtime-architecture.svg b/public/images/realtime-architecture.svg
new file mode 100644
index 0000000..e8d6bf5
--- /dev/null
+++ b/public/images/realtime-architecture.svg
@@ -0,0 +1,114 @@
[Architecture diagram: three boxes: "Your Server" (holds API key, mints client secrets), "Browser Client" (RTCPeerConnection, microphone track, DataChannel "oai-events"), "OpenAI Realtime" (gpt-realtime model, session state, VAD / turn detection, conversation context, tool execution). Legend: audio (WebRTC), events (JSON), HTTP. Flows: client_secret, SDP offer/answer, mic audio / model audio, session.update / response.create / response.done, POST /sessions. Sideband (optional): server connects to the same session via WebSocket for secure tool handling. Steps: 1. server mints ephemeral token, 2. WebRTC handshake (SDP exchange), 3. bidirectional audio + events.]

diff --git a/src/app/icon.tsx b/src/app/icon.tsx
index f884e8a..d857430 100644
--- a/src/app/icon.tsx
+++ b/src/app/icon.tsx
@@ -11,6 +11,7 @@ export const size = {
 export default async function Icon() {
   return new ImageResponse(
+      Rubric icon

**Context**: This research came out of building [Lilac](https://lilac.chat) — a voice-to-voice translation and language learning app. The thesis: translation isn't a string function, it's a conversation. Back-channels ("mm", "ah", "wait wait"), code-switching between languages, self-corrections mid-sentence — these carry meaning that traditional translate APIs discard. The Realtime API made it possible to build something that feels like talking, not transcribing.

## The Mental Model

The Realtime API is **not** a REST endpoint. It's a stateful, bidirectional event protocol over WebRTC (browser) or WebSocket (server-to-server). Think of it as opening a phone call to the model where:

1. You stream audio in
2. It streams audio back
3. Control happens through JSON events on a data channel
4. The server manages conversation state for you

![Realtime API Architecture](./assets/realtime-architecture.svg)

The server maintains conversation history, handles turn detection (when the user stops talking), and manages interruptions automatically. Your job is to:

1. Mint a short-lived credential
2. Establish the WebRTC connection
3. Configure the session
4. Handle events

## Why WebRTC (and When to Use WebSocket)

**WebRTC for browsers/mobile apps**. It handles network jitter, packet loss, echo cancellation, and automatic gain control. TCP-based WebSockets will accumulate latency under real network conditions.

**WebSocket for server-to-server**. When you're building a phone integration (via SIP) or need a backend to handle tool calls securely, WebSocket is appropriate since you control the network path.

**Sideband connections** let you do both: the browser connects via WebRTC for audio, your server connects via WebSocket to the same session for monitoring and tool handling.

## Authentication: Ephemeral Tokens

Never ship your API key to the browser.
Mint short-lived client secrets server-side:

```typescript
// server/api/realtime/session.ts
import { z } from "zod/v4";

const SessionConfigSchema = z.object({
  model: z.string().default("gpt-realtime-2025-08-28"),
  voice: z.enum(["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "cedar", "marin"]).default("marin"),
  instructions: z.string().optional(),
  ttlSeconds: z.number().int().min(30).max(1800).default(600),
});

const ClientSecretSchema = z.object({
  client_secret: z.object({
    value: z.string(),
    expires_at: z.number(),
  }),
  session: z.object({
    id: z.string(),
    model: z.string(),
  }),
});

type SessionConfig = z.infer<typeof SessionConfigSchema>;
type ClientSecretResponse = z.infer<typeof ClientSecretSchema>;

async function createRealtimeSession(config: SessionConfig): Promise<ClientSecretResponse> {
  const validatedConfig = SessionConfigSchema.parse(config);

  const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: validatedConfig.model,
      voice: validatedConfig.voice,
      instructions: validatedConfig.instructions,
      input_audio_transcription: { model: "gpt-4o-transcribe" },
      turn_detection: {
        type: "server_vad",
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500,
      },
    }),
  });

  if (!response.ok) {
    throw new Error(`Session creation failed: ${await response.text()}`);
  }

  return ClientSecretSchema.parse(await response.json());
}
```

The `cedar` and `marin` voices are exclusive to the Realtime API and have the most natural speech quality.

## WebRTC Connection (Browser)

The handshake is three steps: create peer, exchange SDP, open data channel.

```typescript
// client/realtime.ts
interface RealtimeConnection {
  peerConnection: RTCPeerConnection;
  dataChannel: RTCDataChannel;
  audioElement: HTMLAudioElement;
  disconnect: () => void;
}

async function connectToRealtime(clientSecret: string): Promise<RealtimeConnection> {
  const peerConnection = new RTCPeerConnection();

  // Capture microphone
  const localStream = await navigator.mediaDevices.getUserMedia({ audio: true });
  for (const track of localStream.getTracks()) {
    peerConnection.addTrack(track, localStream);
  }

  // Set up playback for model audio
  const audioElement = document.createElement("audio");
  audioElement.autoplay = true;

  peerConnection.ontrack = (event) => {
    audioElement.srcObject = event.streams[0];
  };

  // Create data channel for events (must be named "oai-events")
  const dataChannel = peerConnection.createDataChannel("oai-events");

  // Create and send SDP offer
  const offer = await peerConnection.createOffer();
  await peerConnection.setLocalDescription(offer);

  const sdpResponse = await fetch("https://api.openai.com/v1/realtime?model=gpt-realtime-2025-08-28", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${clientSecret}`,
      "Content-Type": "application/sdp",
    },
    body: offer.sdp,
  });

  if (!sdpResponse.ok) {
    throw new Error(`SDP exchange failed: ${await sdpResponse.text()}`);
  }

  const answerSdp = await sdpResponse.text();
  await peerConnection.setRemoteDescription({ type: "answer", sdp: answerSdp });

  function disconnect() {
    localStream.getTracks().forEach(track => track.stop());
    peerConnection.close();
  }

  return { peerConnection, dataChannel, audioElement, disconnect };
}
```

Wait for `dataChannel.onopen` before sending events.
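
One way to enforce that ordering is to wrap the open event in a promise and await it before the first `session.update`. A minimal sketch under that assumption; the `waitForDataChannelOpen` helper and its timeout are illustrative, not part of any SDK:

```typescript
// client/realtime-ready.ts (hypothetical helper, shown for illustration)
function waitForDataChannelOpen(dataChannel: RTCDataChannel, timeoutMs = 5000): Promise<void> {
  // The channel may already be open by the time this is called
  if (dataChannel.readyState === "open") return Promise.resolve();

  return new Promise((resolve, reject) => {
    const timer = setTimeout(
      () => reject(new Error("Data channel did not open in time")),
      timeoutMs,
    );
    dataChannel.onopen = () => {
      clearTimeout(timer);
      resolve();
    };
    dataChannel.onerror = () => {
      clearTimeout(timer);
      reject(new Error("Data channel failed to open"));
    };
  });
}

// Usage: connect, wait for the channel, then configure the session
// const { dataChannel } = await connectToRealtime(clientSecret);
// await waitForDataChannelOpen(dataChannel);
// dataChannel.send(JSON.stringify({ type: "session.update", session: { voice: "marin" } }));
```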

## Typed Event System

The Realtime API has 9 client events and 28+ server events. Type them properly:

```typescript
// shared/realtime-events.ts
import { z } from "zod/v4";

// ─────────────────────────────────────────────────────────────
// Client Events (what you send)
// ─────────────────────────────────────────────────────────────

const SessionUpdateEventSchema = z.object({
  type: z.literal("session.update"),
  session: z.object({
    instructions: z.string().optional(),
    voice: z.string().optional(),
    input_audio_transcription: z.object({ model: z.string() }).optional(),
    turn_detection: z.object({
      type: z.enum(["server_vad", "none"]),
      threshold: z.number().optional(),
      prefix_padding_ms: z.number().optional(),
      silence_duration_ms: z.number().optional(),
      create_response: z.boolean().optional(),
    }).optional(),
    tools: z.array(z.object({
      type: z.literal("function"),
      name: z.string(),
      description: z.string(),
      parameters: z.record(z.string(), z.unknown()),
    })).optional(),
    truncation: z.object({
      type: z.literal("retention_ratio"),
      retention_ratio: z.number(),
    }).optional(),
  }),
});

const ResponseCreateEventSchema = z.object({
  type: z.literal("response.create"),
  response: z.object({
    modalities: z.array(z.enum(["text", "audio"])).optional(),
    instructions: z.string().optional(),
    conversation: z.enum(["auto", "none"]).optional(),
  }).optional(),
});

const ResponseCancelEventSchema = z.object({
  type: z.literal("response.cancel"),
});

const ConversationItemTruncateSchema = z.object({
  type: z.literal("conversation.item.truncate"),
  item_id: z.string(),
  content_index: z.number(),
  audio_end_ms: z.number(),
});

// Modeled only for function_call_output results, which is how it's used below
const ConversationItemCreateSchema = z.object({
  type: z.literal("conversation.item.create"),
  item: z.object({
    type: z.literal("function_call_output"),
    call_id: z.string(),
    output: z.string(),
  }),
});

const OutputAudioBufferClearSchema = z.object({
  type: z.literal("output_audio_buffer.clear"),
});

const ClientEventSchema = z.discriminatedUnion("type", [
  SessionUpdateEventSchema,
  ResponseCreateEventSchema,
  ResponseCancelEventSchema,
  ConversationItemTruncateSchema,
  ConversationItemCreateSchema,
  OutputAudioBufferClearSchema,
]);

type ClientEvent = z.infer<typeof ClientEventSchema>;

// ─────────────────────────────────────────────────────────────
// Server Events (what you receive)
// ─────────────────────────────────────────────────────────────

const ServerEventBaseSchema = z.object({
  event_id: z.string(),
});

const SessionCreatedEventSchema = ServerEventBaseSchema.extend({
  type: z.literal("session.created"),
  session: z.object({
    id: z.string(),
    model: z.string(),
    expires_at: z.number(),
  }).passthrough(),
});

const ResponseCreatedEventSchema = ServerEventBaseSchema.extend({
  type: z.literal("response.created"),
  response: z.object({
    id: z.string(),
    status: z.enum(["in_progress", "completed", "cancelled", "failed", "incomplete"]),
  }).passthrough(),
});

const ResponseDoneEventSchema = ServerEventBaseSchema.extend({
  type: z.literal("response.done"),
  response: z.object({
    id: z.string(),
    status: z.string(),
    output: z.array(z.object({
      type: z.string(),
      id: z.string(),
    }).passthrough()).optional(),
  }).passthrough(),
});

const InputAudioBufferSpeechStartedSchema = ServerEventBaseSchema.extend({
  type: z.literal("input_audio_buffer.speech_started"),
  audio_start_ms: z.number(),
});

const InputAudioBufferSpeechStoppedSchema = ServerEventBaseSchema.extend({
  type: z.literal("input_audio_buffer.speech_stopped"),
  audio_end_ms: z.number(),
});

const ConversationItemInputAudioTranscriptionCompletedSchema = ServerEventBaseSchema.extend({
  type: z.literal("conversation.item.input_audio_transcription.completed"),
  item_id: z.string(),
  transcript: z.string(),
});

const ResponseAudioTranscriptDeltaSchema = ServerEventBaseSchema.extend({
  type: z.literal("response.audio_transcript.delta"),
  response_id: z.string(),
  delta: z.string(),
});

const ResponseFunctionCallArgumentsDoneSchema = ServerEventBaseSchema.extend({
  type: z.literal("response.function_call_arguments.done"),
  call_id: z.string(),
  name: z.string(),
  arguments: z.string(),
});

const ErrorEventSchema = ServerEventBaseSchema.extend({
  type: z.literal("error"),
  error: z.object({
    type: z.string(),
    code: z.string().optional(),
    message: z.string(),
  }),
});

const ServerEventSchema = z.discriminatedUnion("type", [
  SessionCreatedEventSchema,
  ResponseCreatedEventSchema,
  ResponseDoneEventSchema,
  InputAudioBufferSpeechStartedSchema,
  InputAudioBufferSpeechStoppedSchema,
  ConversationItemInputAudioTranscriptionCompletedSchema,
  ResponseAudioTranscriptDeltaSchema,
  ResponseFunctionCallArgumentsDoneSchema,
  ErrorEventSchema,
]); // Event types not modeled here are skipped by the channel (see createRealtimeChannel)

type ServerEvent = z.infer<typeof ServerEventSchema>;

export {
  ClientEventSchema,
  ServerEventSchema,
  type ClientEvent,
  type ServerEvent,
};
```

## Event Handler Pattern

Wrap the data channel in a typed interface:

```typescript
// client/realtime-channel.ts
import { ServerEventSchema, type ClientEvent, type ServerEvent } from "./realtime-events";

type ServerEventHandler = (event: ServerEvent) => void;

interface RealtimeChannel {
  send: (event: ClientEvent) => void;
  onEvent: (handler: ServerEventHandler) => void;
  onError: (handler: (error: Error) => void) => void;
}

function createRealtimeChannel(dataChannel: RTCDataChannel): RealtimeChannel {
  const eventHandlers: ServerEventHandler[] = [];
  const errorHandlers: ((error: Error) => void)[] = [];

  dataChannel.onmessage = (messageEvent) => {
    try {
      const rawEvent = JSON.parse(messageEvent.data);
      const parsedEvent = ServerEventSchema.safeParse(rawEvent);

      // Skip event types we haven't modeled rather than treating them as errors
      if (!parsedEvent.success) return;

      for (const handler of eventHandlers) {
        handler(parsedEvent.data);
      }
    } catch (error) {
      for (const handler of errorHandlers) {
        handler(error instanceof Error ? error : new Error(String(error)));
      }
    }
  };

  return {
    send(event: ClientEvent) {
      dataChannel.send(JSON.stringify(event));
    },
    onEvent(handler: ServerEventHandler) {
      eventHandlers.push(handler);
    },
    onError(handler: (error: Error) => void) {
      errorHandlers.push(handler);
    },
  };
}
```

## Session Configuration

Configure the session after connection:

```typescript
function configureSession(channel: RealtimeChannel, instructions: string) {
  channel.send({
    type: "session.update",
    session: {
      instructions,
      voice: "marin",
      input_audio_transcription: { model: "gpt-4o-transcribe" },
      turn_detection: {
        type: "server_vad",
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500, // 500ms works for most cases
        create_response: true, // Auto-trigger response when user stops
      },
    },
  });
}
```

`silence_duration_ms` controls how long the system waits after speech stops before triggering a response. 500ms is a good default. For interview or educational contexts, 800–1000ms gives users more time to think.

## Function Calling

Tools are defined in the session config and invoked via events:

```typescript
const tools = [
  {
    type: "function" as const,
    name: "get_weather",
    description: "Get current weather for a location",
    parameters: {
      type: "object",
      properties: {
        location: { type: "string", description: "City and state, e.g. San Francisco, CA" },
      },
      required: ["location"],
    },
  },
];

// Configure session with tools
channel.send({
  type: "session.update",
  session: { tools },
});

// Handle function calls (async so we can await the function execution)
channel.onEvent(async (event) => {
  if (event.type === "response.function_call_arguments.done") {
    const args = JSON.parse(event.arguments);

    // Execute the function (executeFunction is your app's own dispatcher)
    const result = await executeFunction(event.name, args);

    // Send result back
    channel.send({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: event.call_id,
        output: JSON.stringify(result),
      },
    });

    // Trigger model to continue
    channel.send({ type: "response.create" });
  }
});
```

The GA model supports **async function calling** — the model can continue speaking while waiting for function results instead of blocking.

## Handling Interruptions

When users interrupt, you need to:

1. Cancel the current response
2. Truncate the conversation to what was actually heard
3. Clear the audio buffer

```typescript
function handleInterruption(
  channel: RealtimeChannel,
  currentItemId: string,
  audioPlayedMs: number
) {
  // Stop generation
  channel.send({ type: "response.cancel" });

  // Sync server context with what user actually heard
  channel.send({
    type: "conversation.item.truncate",
    item_id: currentItemId,
    content_index: 0,
    audio_end_ms: audioPlayedMs,
  });

  // Clear any buffered audio (WebRTC only)
  channel.send({ type: "output_audio_buffer.clear" });
}
```

This is critical for natural conversation. The model generates audio faster than realtime playback, so without truncation, the conversation context includes text the user never heard.

## Context and Token Management

Key limits:

| Constraint | Value |
|------------|-------|
| Context window | 32,768 tokens |
| Max response tokens | 4,096 tokens |
| Max instructions + tools | 16,384 tokens |
| Session duration | 60 minutes |
| Audio token rate | ~800 tokens/minute |

When context fills up, the API automatically truncates (drops) the oldest messages. Configure this behavior:

```typescript
channel.send({
  type: "session.update",
  session: {
    truncation: {
      type: "retention_ratio",
      retention_ratio: 0.8, // Drop 20% when truncating (better for cache hits)
    },
  },
});
```

Setting `retention_ratio: 0.8` means that when truncation triggers, the API drops more than the minimum needed. This preserves prompt caching (which requires identical prefixes) better than truncating one message at a time.

## Cost Model

Audio tokens are priced differently than text:

| Type | Price per 1M tokens | ~Per minute |
|------|--------------------:|------------:|
| Audio input | $40 (cached: $2.50) | ~$0.03 |
| Audio output | $80 | ~$0.06 |
| Text input | $2.50 | — |
| Text output | $10 | — |

For a 10-minute conversation with ~70% talk time, expect roughly $2–3. The automatic context caching helps significantly for longer sessions.

## Production Checklist

1. **Never expose your API key** — mint ephemeral client secrets
2. **Handle reconnection** — sessions can drop; implement exponential backoff
3. **Track audio playback position** — needed for accurate truncation on interrupts
4. **Log events** — the `event_id` field helps correlate issues
5. **Set idle timeouts** — use `idle_timeout_ms` in VAD config to handle silent users
6. **Test with network throttling** — WebRTC handles jitter well, but test your UI
7. **Implement mute/unmute** — VAD can trigger on background noise
8. **Show connection state** — users need feedback when connecting

## When Not to Use Realtime

The Realtime API is optimized for low-latency conversation. Consider alternatives when:

- **You need deterministic output** — temperature is fixed at 0.8, no way to reduce variance
- **Latency doesn't matter** — Chat Completions API with audio is cheaper for async use cases
- **You need long context** — 32k tokens is the ceiling; for long documents, use text models
- **You're doing batch processing** — Realtime is priced for interactive use

## Further Reading

- [OpenAI Realtime API Docs](https://platform.openai.com/docs/guides/realtime)
- [OpenAI Realtime API Reference](https://platform.openai.com/docs/api-reference/realtime)
- [Realtime Console (Reference Implementation)](https://github.com/openai/openai-realtime-console)
- [Pipecat (Open Source Voice AI Framework)](https://github.com/pipecat-ai/pipecat)

diff --git a/src/ui/video/video.tsx b/src/ui/video/video.tsx
index d26dabd..1664b04 100644
--- a/src/ui/video/video.tsx
+++ b/src/ui/video/video.tsx
@@ -211,24 +211,20 @@ export function Video({ hlsUrl, mp4Url, className = '', posterUrl, transcription
       {/* Clickable overlay for play/pause when in playing-with-sound mode */}
       {status === 'playing-with-sound' && (
-
e.key === 'Enter' && togglePlayPause()} - tabIndex={0} - role="button" aria-label="Play or pause video" /> )} {/* Clickable overlay for the entire video when not in playing-with-sound mode */} {showPlayButton && ( -
e.key === 'Enter' && playWithSound()} - tabIndex={0} - role="button" aria-label="Play with sound" /> )}