Audio generation
The audio modality is represented as `AudioPart` objects.
/**
 * An audio content part carried in a message.
 */
interface AudioPart {
  type: "audio";
  /** The base64-encoded audio data. */
  data: string;
  /** The encoding format of the audio. */
  format: AudioFormat;
  /** The sample rate of the audio. E.g. 44100, 48000. */
  sample_rate?: number;
  /** The number of channels of the audio. E.g. 1, 2. */
  channels?: number;
  /** The transcript of the audio. */
  transcript?: string;
  /** ID of the audio part, if applicable. */
  id?: string;
}
// NOTE(review): extraction artifact — this line fuses two separate snippets: the
// TypeScript `AudioFormat` union and the Rust `AudioPart` struct mirroring the interface above.
type AudioFormat = | "wav" | "mp3" | "linear16" | "flac" | "mulaw" | "alaw" | "aac" | "opus";pub struct AudioPart { /// The base64-encoded audio data. pub data: String, /// The format of the audio. pub format: AudioFormat, /// The sample rate of the audio. E.g. 44100, 48000. #[serde(skip_serializing_if = "Option::is_none")] pub sample_rate: Option<u32>, /// The number of channels of the audio. E.g. 1, 2. #[serde(skip_serializing_if = "Option::is_none")] pub channels: Option<u32>, /// The transcript of the audio. #[serde(skip_serializing_if = "Option::is_none")] pub transcript: Option<String>, /// The ID of the audio part, if applicable #[serde(skip_serializing_if = "Option::is_none")] pub id: Option<String>,}
// NOTE(review): extraction artifact — this line fuses the Rust `AudioFormat` enum
// with the Go `AudioPart` struct; they are separate snippets in the original docs.
pub enum AudioFormat { Wav, Mp3, Linear16, Flac, Mulaw, Alaw, Aac, Opus,}type AudioPart struct { // The base64-encoded audio data. Data string `json:"data"` Format AudioFormat `json:"format"` // The sample rate of the audio. E.g. 44100, 48000. SampleRate *int `json:"sample_rate,omitempty"` // The number of channels of the audio. E.g. 1, 2. Channels *int `json:"channels,omitempty"` // The transcript of the audio. Transcript *string `json:"transcript,omitempty"` // The ID of the part, if applicable. ID *string `json:"id,omitempty"`}
type AudioFormat string

To ensure the audio can be played correctly, the application code must consider the provided audio format, sample_rate, and channels.
Depending on the provider, you may have to pass additional parameters to their API, such as voice or format, using the audio field.
// NOTE(review): extraction artifact — this line fuses the TS, Rust, and Go `AudioOptions`
// snippets plus the next section heading ("Generate audio").
// NOTE(review): the Go struct uses field `LanguageCode` with JSON tag `language_code`, while the
// TS interface and Rust struct both serialize this option as `language` — confirm against the SDK;
// one of the two wire names is likely a transcription error.
interface AudioOptions { /** * The desired audio format. */ format?: AudioFormat; /** * The provider-specific voice ID to use for audio generation. */ voice?: string; /** * The language code for the audio generation. */ language?: string;}pub struct AudioOptions { /// The format of the audio. #[serde(skip_serializing_if = "Option::is_none")] pub format: Option<AudioFormat>, /// The provider-specific voice ID to use for audio generation. #[serde(skip_serializing_if = "Option::is_none")] pub voice: Option<String>, /// The language code for the audio generation. #[serde(skip_serializing_if = "Option::is_none")] pub language: Option<String>,}type AudioOptions struct { // The desired audio format. Format *AudioFormat `json:"format,omitempty"` // The provider-specific voice ID to use for audio generation. Voice *string `json:"voice,omitempty"` // The language code for the audio generation LanguageCode *string `json:"language_code,omitempty"`}Generate audio
Section titled “Generate audio”

To generate audio, specify audio in the input modalities.
// Requires ffplay (https://ffmpeg.org/) on PATH.import { spawn, type ChildProcess } from "node:child_process";import { getModel } from "./get-model.ts";
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const response = await model.generate({ modalities: ["text", "audio"], audio: { format: "mp3", voice: "alloy", }, messages: [ { role: "user", content: [ { type: "text", text: "Is a golden retriever a good family dog?", }, ], }, ],});
console.dir(response, { depth: null });
const audioPart = response.content.find((part) => part.type === "audio");
if (!audioPart) { throw new Error("Audio part not found in response");}
await play(Buffer.from(audioPart.data, "base64"));
/**
 * Pipes the decoded audio bytes to ffplay's stdin and resolves once playback finishes.
 * Rejects if ffplay fails to spawn or exits with a non-zero code.
 */
async function play(audio: Buffer) {
  const ffplayArgs = ["-autoexit", "-nodisp", "-loglevel", "error", "-"];
  const player = spawn("ffplay", ffplayArgs, {
    stdio: ["pipe", "ignore", "inherit"],
  });

  await waitForSpawn(player);

  const input = player.stdin;
  if (!input) {
    throw new Error("ffplay stdin unavailable");
  }
  // Write the whole buffer and close stdin so ffplay knows the stream is done.
  input.end(audio);

  await new Promise<void>((resolve, reject) => {
    player.once("error", reject);
    player.once("close", (code) => {
      if (code !== 0) {
        reject(new Error(`ffplay exited with code ${code}`));
        return;
      }
      resolve();
    });
  });
}
// Resolves when the child process has spawned; rejects on spawn failure.
// NOTE(review): extraction artifact — everything after `// Requires ffplay` on this line is the
// header of the next (Rust) code sample, not part of this TypeScript function.
function waitForSpawn(child: ChildProcess) { return new Promise<void>((resolve, reject) => { child.once("spawn", resolve); child.once("error", reject); });}// Requires ffplay (https://ffmpeg.org/) on PATH.use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};use dotenvy::dotenv;use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part};use std::{ io::Write, process::{Command, Stdio},};
mod common;
#[tokio::main]async fn main() { dotenv().ok();
let model = common::get_model("openai-chat-completion", "gpt-4o-audio-preview");
let response = model .generate(LanguageModelInput { modalities: Some(vec![Modality::Text, Modality::Audio]), messages: vec![Message::user(vec![Part::text( "Is a golden retriever a good family dog?", )])], audio: Some(AudioOptions { format: Some(AudioFormat::Mp3), voice: Some("alloy".into()), ..Default::default() }), ..Default::default() }) .await .expect("model.generate failed");
println!("{response:#?}");
if let Some(audio_part) = response.content.iter().find_map(|p| match p { Part::Audio(a) => Some(a), _ => None, }) { let audio_bytes = BASE64_STANDARD .decode(&audio_part.data) .expect("invalid base64 audio data");
play(&audio_bytes).expect("ffplay playback failed"); } else { println!("Audio part not found in response"); }}
/// Streams the decoded audio bytes to `ffplay` via stdin and waits for playback to finish.
fn play(audio: &[u8]) -> std::io::Result<()> { let mut child = Command::new("ffplay") .args(["-autoexit", "-nodisp", "-loglevel", "error", "-"]) .stdin(Stdio::piped()) .stdout(Stdio::null()) .stderr(Stdio::inherit()) .spawn()?;
// Scope the mutable borrow of stdin so the pipe is dropped (closed) before wait().
{ let stdin = child.stdin.as_mut().expect("ffplay stdin unavailable"); stdin.write_all(audio)?; }
// NOTE(review): the trailing `package main` below belongs to the next (Go) sample — extraction artifact.
let status = child.wait()?; if status.success() { Ok(()) } else { Err(std::io::Error::other("ffplay exited with error")) }}package main
// Requires ffplay (https://ffmpeg.org/) on PATH.import ( "context" "encoding/base64" "log" "os/exec"
llmsdk "github.com/hoangvvo/llm-sdk/sdk-go" "github.com/hoangvvo/llm-sdk/sdk-go/examples" "github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr" "github.com/sanity-io/litter")
func main() { model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{ Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio}, Audio: &llmsdk.AudioOptions{ Format: ptr.To(llmsdk.AudioFormatMP3), Voice: ptr.To("alloy"), }, Messages: []llmsdk.Message{ llmsdk.NewUserMessage( llmsdk.NewTextPart("Is a golden retriever a good family dog?"), ), }, }) if err != nil { log.Fatalf("Generation failed: %v", err) }
litter.Dump(response)
for _, part := range response.Content { if part.AudioPart == nil { continue }
audioData, err := base64.StdEncoding.DecodeString(part.AudioPart.Data) if err != nil { log.Fatalf("Failed to decode audio data: %v", err) }
if err := play(audioData); err != nil { log.Fatalf("ffplay failed: %v", err) }
return }
log.Fatal("Audio part not found in response")}
// play streams the decoded audio bytes to ffplay over stdin and waits for playback to finish.
func play(audio []byte) error { cmd := exec.Command("ffplay", "-autoexit", "-nodisp", "-loglevel", "error", "-") stdin, err := cmd.StdinPipe() if err != nil { return err }
if err := cmd.Start(); err != nil { return err }
// Closing stdin signals end-of-stream so ffplay (with -autoexit) terminates after playback.
if _, err := stdin.Write(audio); err != nil { return err } if err := stdin.Close(); err != nil { return err }
// NOTE(review): the trailing "Stream audio" is the next section heading — extraction artifact.
return cmd.Wait()}Stream audio
Section titled “Stream audio”

Audio generation can also be streamed using the stream() method. In streamed responses, AudioPart will be represented as AudioPartDelta:
/**
 * An incremental audio content part received while streaming.
 * All fields are optional; chunks are merged to reconstruct the final AudioPart.
 */
interface AudioPartDelta {
  type: "audio";
  /** The base64-encoded audio data. */
  data?: string;
  /** The encoding format of the audio. */
  format?: AudioFormat;
  /** The sample rate of the audio. E.g. 44100, 48000. */
  sample_rate?: number;
  /** The number of channels of the audio. E.g. 1, 2. */
  channels?: number;
  /** The transcript of the audio. */
  transcript?: string;
  /** The ID of the audio part, if applicable. */
  id?: string;
}
// NOTE(review): extraction artifact — this line fuses the TypeScript `AudioFormat` union
// with the Rust `AudioPartDelta` struct; they are separate snippets in the original docs.
type AudioFormat = | "wav" | "mp3" | "linear16" | "flac" | "mulaw" | "alaw" | "aac" | "opus";pub struct AudioPartDelta { /// The base64-encoded audio data. #[serde(skip_serializing_if = "Option::is_none")] pub data: Option<String>, #[serde(skip_serializing_if = "Option::is_none")] pub format: Option<AudioFormat>, /// The sample rate of the audio. E.g. 44100, 48000. #[serde(skip_serializing_if = "Option::is_none")] pub sample_rate: Option<u32>, /// The number of channels of the audio. E.g. 1, 2. #[serde(skip_serializing_if = "Option::is_none")] pub channels: Option<u32>, /// The transcript of the audio. #[serde(skip_serializing_if = "Option::is_none")] pub transcript: Option<String>, /// The ID of the audio part, if applicable #[serde(skip_serializing_if = "Option::is_none")] pub id: Option<String>,}
// NOTE(review): extraction artifact — this line fuses the Rust `AudioFormat` enum
// with the Go `AudioPartDelta` struct; they are separate snippets in the original docs.
pub enum AudioFormat { Wav, Mp3, Linear16, Flac, Mulaw, Alaw, Aac, Opus,}type AudioPartDelta struct { // Data is the base64-encoded audio data. Data *string `json:"data,omitempty"` Format *AudioFormat `json:"format,omitempty"` SampleRate *int `json:"sample_rate,omitempty"` Channels *int `json:"channels,omitempty"` Transcript *string `json:"transcript,omitempty"` ID *string `json:"id,omitempty"`}
type AudioFormat string

Individual data chunks can be played back as they are received. They can be combined to create the final audio output.
// Requires ffplay (https://ffmpeg.org/) on PATH.import { spawn, type ChildProcessByStdio } from "node:child_process";import type Stream from "node:stream";import { getModel } from "./get-model.ts";
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const stream = model.stream({ modalities: ["text", "audio"], audio: { format: "linear16", voice: "alloy", }, messages: [ { role: "user", content: [ { type: "text", text: "Is a golden retriever a good family dog?", }, ], }, ],});
type Player = ChildProcessByStdio<Stream.Writable, null, null>;
let player: Player | undefined;let sampleRate: number | undefined;let channels: number | undefined;
for await (const partial of stream) { console.dir(redactAudioData(partial), { depth: null });
const part = partial.delta?.part; if (part?.type !== "audio" || !part.data) continue;
if (part.format && part.format !== "linear16") { throw new Error(`Unsupported audio format: ${part.format}`); }
sampleRate ??= part.sample_rate ?? 24_000; channels ??= part.channels ?? 1;
if (!player) { player = await startFfplay(sampleRate, channels); console.log( `Streaming audio with ffplay (${sampleRate} Hz, ${channels} channel${channels === 1 ? "" : "s"}).`, ); }
const currentPlayer = player; currentPlayer.stdin.write(Buffer.from(part.data, "base64"), (err) => { if (err) { console.error("Error writing to ffplay stdin:", err); } });}
if (player) { await finishFfplay(player);}
/**
 * Launches ffplay configured to read raw 16-bit little-endian PCM from stdin.
 * Resolves with the child process once it has spawned.
 */
async function startFfplay(sampleRate: number, channels: number) {
  const layout = channels === 1 ? "mono" : "stereo";
  const args = [
    "-loglevel", "error",
    "-autoexit",
    "-nodisp",
    "-f", "s16le",
    "-ar", String(sampleRate),
    "-i", "pipe:0",
    "-af", `aformat=channel_layouts=${layout}`,
  ];

  const child = spawn("ffplay", args, { stdio: ["pipe", "ignore", "inherit"] });
  await waitForSpawn(child);
  return child;
}
/**
 * Closes ffplay's stdin (if still writable) and waits for the process to exit cleanly.
 */
async function finishFfplay(child: Player) {
  if (child.stdin.writable) {
    child.stdin.end();
  }

  await new Promise<void>((resolve, reject) => {
    child.once("error", reject);
    child.once("close", (code) => {
      if (code !== 0) {
        reject(new Error(`ffplay exited with code ${code}`));
        return;
      }
      resolve();
    });
  });
}
/** Resolves once the child process has spawned; rejects if spawning fails. */
function waitForSpawn(child: Player) {
  return new Promise<void>((resolve, reject) => {
    child.once("error", reject);
    child.once("spawn", resolve);
  });
}
// Returns a deep copy of the partial response in which any object's base64 "data"
// string is replaced by a "[N bytes]" placeholder, keeping console logs readable.
function redactAudioData(partial: unknown) { if (!partial || typeof partial !== "object") { return partial; }
// NOTE(review): everything after `// Requires ffplay` on the line below is the header of the
// next (Rust) streaming sample — extraction artifact, not part of this function.
return JSON.parse( JSON.stringify(partial, (_key, value) => { if ( value && typeof value === "object" && "data" in value && typeof value.data === "string" ) { const byteLength = Buffer.from(value.data, "base64").length; return { ...value, data: `[${byteLength} bytes]` }; } return value; }), );}// Requires ffplay (https://ffmpeg.org/) on PATH.use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};use dotenvy::dotenv;use futures::StreamExt;use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part, PartDelta};use serde_json::Value;use std::{ io::Write, process::{Child, ChildStdin, Command, Stdio},};
mod common;
#[tokio::main]async fn main() { dotenv().ok();
let model = common::get_model("openai-chat-completion", "gpt-4o-audio-preview");
let mut stream = model .stream(LanguageModelInput { modalities: Some(vec![Modality::Text, Modality::Audio]), messages: vec![Message::user(vec![Part::text( "Is a golden retriever a good family dog?", )])], audio: Some(AudioOptions { format: Some(AudioFormat::Linear16), voice: Some("alloy".into()), ..Default::default() }), ..Default::default() }) .await .expect("failed to start stream");
let mut sample_rate: Option<u32> = None; let mut channels: Option<u32> = None; let mut ffplay: Option<(Child, ChildStdin)> = None;
while let Some(item) = stream.next().await { let chunk = item.expect("stream error"); log_partial(&chunk);
if let Some(delta) = chunk.delta { if let PartDelta::Audio(audio) = delta.part { if let Some(format) = audio.format { if format != AudioFormat::Linear16 { panic!("unsupported audio format: {format:?}"); } } if let Some(b64) = audio.data { let bytes = BASE64_STANDARD .decode(b64.as_bytes()) .expect("invalid base64 audio");
if sample_rate.is_none() { sample_rate = Some(audio.sample_rate.unwrap_or(24_000)); } if channels.is_none() { channels = Some(audio.channels.unwrap_or(1)); }
if ffplay.is_none() { let rate = sample_rate.unwrap(); let ch = channels.unwrap(); ffplay = Some(start_ffplay(rate, ch)); println!( "Streaming audio with ffplay ({} Hz, {} channel{}).", rate, ch, if ch == 1 { "" } else { "s" }, ); }
if let Some((_, ref mut stdin)) = ffplay { stdin .write_all(&bytes) .expect("failed to write audio to ffplay"); } } } } }
if let Some((child, stdin)) = ffplay { finish_ffplay(child, stdin); }}
/// Spawns `ffplay` configured to read raw 16-bit little-endian PCM from stdin,
/// returning the child process together with its stdin pipe.
fn start_ffplay(sample_rate: u32, channels: u32) -> (Child, ChildStdin) {
    let layout = if channels <= 1 { "mono" } else { "stereo" };
    let rate = sample_rate.to_string();
    let channel_filter = format!("aformat=channel_layouts={layout}");

    let mut child = Command::new("ffplay")
        .args([
            "-loglevel", "error",
            "-autoexit",
            "-nodisp",
            "-f", "s16le",
            "-ar", &rate,
            "-i", "pipe:0",
            "-af", &channel_filter,
        ])
        .stdin(Stdio::piped())
        .stdout(Stdio::null())
        .stderr(Stdio::inherit())
        .spawn()
        .expect("failed to start ffplay");

    let stdin = child.stdin.take().expect("ffplay stdin unavailable");
    (child, stdin)
}
/// Flushes and closes ffplay's stdin (signalling end-of-stream), then waits for
/// the process to exit and panics if it reported failure.
fn finish_ffplay(mut child: Child, mut stdin: ChildStdin) {
    stdin.flush().expect("failed to flush ffplay stdin");
    // Dropping the pipe closes it, letting -autoexit terminate ffplay.
    drop(stdin);

    let status = child.wait().expect("failed to wait for ffplay");
    assert!(status.success(), "ffplay exited with error");
}
/// Prints a partial model response with audio payloads redacted for readability;
/// falls back to plain Debug output if JSON serialization fails.
fn log_partial(partial: &llm_sdk::PartialModelResponse) {
    if let Ok(mut value) = serde_json::to_value(partial) {
        redact_data(&mut value);
        println!("{value:#?}");
    } else {
        println!("{partial:#?}");
    }
}
// Recursively walks a JSON value and replaces any string field named "data" with a
// "[N bytes]" placeholder (or "[invalid data]" when it is not valid base64).
// NOTE(review): the trailing `package main` belongs to the next (Go) sample — extraction artifact.
fn redact_data(value: &mut Value) { match value { Value::Object(map) => { if let Some(Value::String(data)) = map.get_mut("data") { if let Ok(bytes) = BASE64_STANDARD.decode(data.as_bytes()) { *data = format!("[{} bytes]", bytes.len()); } else { *data = "[invalid data]".to_string(); } } for entry in map.values_mut() { redact_data(entry); } } Value::Array(array) => { for item in array.iter_mut() { redact_data(item); } } _ => {} }}package main
// Requires ffplay (https://ffmpeg.org/) on PATH.import ( "context" "encoding/base64" "encoding/json" "fmt" "io" "log" "os/exec" "strconv"
llmsdk "github.com/hoangvvo/llm-sdk/sdk-go" "github.com/hoangvvo/llm-sdk/sdk-go/examples" "github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr" "github.com/sanity-io/litter")
func main() { model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
stream, err := model.Stream(context.Background(), &llmsdk.LanguageModelInput{ Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio}, Audio: &llmsdk.AudioOptions{ Format: ptr.To(llmsdk.AudioFormatLinear16), Voice: ptr.To("alloy"), }, Messages: []llmsdk.Message{ llmsdk.NewUserMessage( llmsdk.NewTextPart("Is a golden retriever a good family dog?"), ), }, }) if err != nil { log.Fatalf("Stream failed: %v", err) }
var ( ffplayCmd *exec.Cmd ffplayStdin io.WriteCloser sampleRate int channels int )
accumulator := llmsdk.NewStreamAccumulator()
for stream.Next() { current := stream.Current() litter.Dump(redactPartial(current))
if current.Delta != nil && current.Delta.Part.AudioPartDelta != nil { delta := current.Delta.Part.AudioPartDelta if delta.Format != nil && *delta.Format != llmsdk.AudioFormatLinear16 { log.Fatalf("unsupported audio format: %s", *delta.Format) } if delta.Data != nil { pcm, err := base64.StdEncoding.DecodeString(*delta.Data) if err != nil { log.Fatalf("Failed to decode audio: %v", err) }
if sampleRate == 0 { if delta.SampleRate != nil { sampleRate = int(*delta.SampleRate) } else { sampleRate = 24_000 } } if channels == 0 { if delta.Channels != nil { channels = int(*delta.Channels) } else { channels = 1 } }
if ffplayStdin == nil { ffplayCmd, ffplayStdin = startFfplay(sampleRate, channels) log.Printf( "Streaming audio with ffplay (%d Hz, %d channel%s).", sampleRate, channels, pluralSuffix(channels), ) }
if _, err := ffplayStdin.Write(pcm); err != nil { log.Fatalf("Failed to write audio: %v", err) } } }
if err := accumulator.AddPartial(*current); err != nil { log.Printf("Failed to add partial: %v", err) } }
if err := stream.Err(); err != nil { log.Fatalf("Stream error: %v", err) }
if ffplayStdin != nil { finishFfplay(ffplayCmd, ffplayStdin) }
finalResponse, err := accumulator.ComputeResponse() if err != nil { log.Fatalf("Failed to compute response: %v", err) }
litter.Dump(finalResponse)}
func startFfplay(sampleRate, channels int) (*exec.Cmd, io.WriteCloser) { args := []string{ "-loglevel", "error", "-autoexit", "-nodisp", "-f", "s16le", "-ar", strconv.Itoa(sampleRate), "-i", "pipe:0", "-af", fmt.Sprintf("aformat=channel_layouts=%s", channelLayout(channels)), }
cmd := exec.Command("ffplay", args...) stdin, err := cmd.StdinPipe() if err != nil { log.Fatalf("Failed to open ffplay stdin: %v", err) }
if err := cmd.Start(); err != nil { log.Fatalf("Failed to start ffplay: %v", err) }
return cmd, stdin}
// finishFfplay closes ffplay's stdin (signalling end-of-stream) and waits for playback to finish.
func finishFfplay(cmd *exec.Cmd, stdin io.WriteCloser) { if err := stdin.Close(); err != nil { log.Fatalf("Failed to close ffplay stdin: %v", err) }
if err := cmd.Wait(); err != nil { log.Fatalf("ffplay exited with error: %v", err) }}
// pluralSuffix returns the English plural suffix for a count: "" for exactly one, "s" otherwise.
func pluralSuffix(n int) string {
	switch n {
	case 1:
		return ""
	default:
		return "s"
	}
}
// channelLayout maps a channel count to an ffmpeg channel layout name
// ("mono" for one channel or fewer, "stereo" otherwise).
func channelLayout(channels int) string {
	if channels > 1 {
		return "stereo"
	}
	return "mono"
}
func redactPartial(partial *llmsdk.PartialModelResponse) any { if partial == nil { return nil }
bytes, err := json.Marshal(partial) if err != nil { return partial }
var data any if err := json.Unmarshal(bytes, &data); err != nil { return partial }
return redactAudioFields(data)}
func redactAudioFields(value any) any { switch v := value.(type) { case map[string]any: if raw, ok := v["data"].(string); ok { decoded, err := base64.StdEncoding.DecodeString(raw) if err == nil { v["data"] = fmt.Sprintf("[%d bytes]", len(decoded)) } else { v["data"] = "[invalid data]" } } for key, val := range v { v[key] = redactAudioFields(val) } return v case []any: for i, item := range v { v[i] = redactAudioFields(item) } return v default: return value }}