Audio generation

The audio modality is represented as AudioPart.

types.ts
interface AudioPart {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  data: string;
  format: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * The ID of the audio part, if applicable.
   */
  id?: string;
}

type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";

To ensure the audio can be played correctly, the application code must consider the provided audio format, sample_rate, and channels.
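
For example, linear16 audio is headerless raw PCM, so saving it as a playable .wav file means prepending a RIFF header derived from exactly these fields. A minimal sketch follows; the toWav helper is illustrative, not part of this library.

// Sketch: wrap raw linear16 (16-bit PCM) data in a standard 44-byte
// RIFF/WAV header so ordinary players can open it. Defaults assumed.
function toWav(pcm: Buffer, sampleRate = 24_000, channels = 1): Buffer {
  const blockAlign = channels * 2; // 16-bit samples -> 2 bytes each
  const header = Buffer.alloc(44);
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + pcm.length, 4); // total size minus 8 bytes
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // fmt chunk size
  header.writeUInt16LE(1, 20); // audio format: PCM
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(16, 34); // bits per sample
  header.write("data", 36);
  header.writeUInt32LE(pcm.length, 40);
  return Buffer.concat([header, pcm]);
}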

Depending on the provider, you may have to pass additional parameters to their API, such as voice or format, using the audio field.

types.ts
interface AudioOptions {
  /**
   * The desired audio format.
   */
  format?: AudioFormat;
  /**
   * The provider-specific voice ID to use for audio generation.
   */
  voice?: string;
  /**
   * The language code for the audio generation.
   */
  language?: string;
}

To generate audio, include "audio" in the modalities input.

generate-audio.ts
// Requires ffplay (https://ffmpeg.org/) on PATH.
import { spawn, type ChildProcess } from "node:child_process";
import { getModel } from "./get-model.ts";

const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");

const response = await model.generate({
  modalities: ["text", "audio"],
  audio: {
    format: "mp3",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});

console.dir(response, { depth: null });

const audioPart = response.content.find((part) => part.type === "audio");
if (!audioPart) {
  throw new Error("Audio part not found in response");
}

// Decode the base64 payload and pipe it to ffplay for playback.
await play(Buffer.from(audioPart.data, "base64"));

async function play(audio: Buffer) {
  const player = spawn(
    "ffplay",
    ["-autoexit", "-nodisp", "-loglevel", "error", "-"],
    {
      stdio: ["pipe", "ignore", "inherit"],
    },
  );
  await waitForSpawn(player);
  const stdin = player.stdin;
  if (!stdin) {
    throw new Error("ffplay stdin unavailable");
  }
  stdin.end(audio);
  await new Promise<void>((resolve, reject) => {
    player.once("close", (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`ffplay exited with code ${code}`));
      }
    });
    player.once("error", reject);
  });
}

function waitForSpawn(child: ChildProcess) {
  return new Promise<void>((resolve, reject) => {
    child.once("spawn", resolve);
    child.once("error", reject);
  });
}
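
If playback is not needed, the decoded bytes can instead be written to disk. A minimal sketch; the file name is arbitrary, and the extension should match the requested format ("mp3" above):

// Alternative to playback: persist the decoded audio to a file.
import { writeFile } from "node:fs/promises";

await writeFile("answer.mp3", Buffer.from(audioPart.data, "base64"));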

Audio generation can also be streamed using the stream() method. In streaming responses, an AudioPart is represented as an AudioPartDelta:

types.ts
interface AudioPartDelta {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  data?: string;
  format?: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * The ID of the audio part, if applicable.
   */
  id?: string;
}

type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";

Individual data chunks can be played back as they are received, or combined to produce the final audio output.
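
For example, the chunks could be accumulated into one buffer like this. A sketch, assuming the same partial.delta?.part shape used in the example below; note a stream can only be consumed once, so this replaces rather than supplements playback:

// Sketch: collect streamed audio deltas and concatenate them.
const chunks: Buffer[] = [];
for await (const partial of stream) {
  const part = partial.delta?.part;
  if (part?.type === "audio" && part.data) {
    chunks.push(Buffer.from(part.data, "base64"));
  }
}
// Raw linear16 PCM if that format was requested.
const fullAudio = Buffer.concat(chunks);

A complete example that instead plays the chunks as they arrive: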

stream-audio.ts
// Requires ffplay (https://ffmpeg.org/) on PATH.
import { spawn, type ChildProcessByStdio } from "node:child_process";
import type Stream from "node:stream";
import { getModel } from "./get-model.ts";

const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");

const stream = model.stream({
  modalities: ["text", "audio"],
  audio: {
    format: "linear16",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});

type Player = ChildProcessByStdio<Stream.Writable, null, null>;

let player: Player | undefined;
let sampleRate: number | undefined;
let channels: number | undefined;

for await (const partial of stream) {
  console.dir(redactAudioData(partial), { depth: null });
  const part = partial.delta?.part;
  if (part?.type !== "audio" || !part.data) continue;
  if (part.format && part.format !== "linear16") {
    throw new Error(`Unsupported audio format: ${part.format}`);
  }
  // Lock in the sample rate and channel count from the first audio chunk.
  sampleRate ??= part.sample_rate ?? 24_000;
  channels ??= part.channels ?? 1;
  if (!player) {
    player = await startFfplay(sampleRate, channels);
    console.log(
      `Streaming audio with ffplay (${sampleRate} Hz, ${channels} channel${channels === 1 ? "" : "s"}).`,
    );
  }
  const currentPlayer = player;
  currentPlayer.stdin.write(Buffer.from(part.data, "base64"), (err) => {
    if (err) {
      console.error("Error writing to ffplay stdin:", err);
    }
  });
}

if (player) {
  await finishFfplay(player);
}

async function startFfplay(sampleRate: number, channels: number) {
  const child = spawn(
    "ffplay",
    [
      "-loglevel",
      "error",
      "-autoexit",
      "-nodisp",
      "-f",
      "s16le",
      "-ar",
      String(sampleRate),
      "-i",
      "pipe:0",
      "-af",
      `aformat=channel_layouts=${channels === 1 ? "mono" : "stereo"}`,
    ],
    { stdio: ["pipe", "ignore", "inherit"] },
  );
  await waitForSpawn(child);
  return child;
}

async function finishFfplay(child: Player) {
  if (child.stdin.writable) {
    child.stdin.end();
  }
  await new Promise<void>((resolve, reject) => {
    child.once("close", (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`ffplay exited with code ${code}`));
      }
    });
    child.once("error", reject);
  });
}

function waitForSpawn(child: Player) {
  return new Promise<void>((resolve, reject) => {
    child.once("spawn", resolve);
    child.once("error", reject);
  });
}

function redactAudioData(partial: unknown) {
  if (!partial || typeof partial !== "object") {
    return partial;
  }
  return JSON.parse(
    JSON.stringify(partial, (_key, value) => {
      // Replace base64 audio payloads with a byte count to keep logs readable.
      if (
        value &&
        typeof value === "object" &&
        "data" in value &&
        typeof value.data === "string"
      ) {
        const byteLength = Buffer.from(value.data, "base64").length;
        return { ...value, data: `[${byteLength} bytes]` };
      }
      return value;
    }),
  );
}