Audio generation
The audio modality is represented as AudioParts.
interface AudioPart {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  audio_data: string;
  format: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * ID of the audio part, if applicable
   */
  id?: string;
}
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";
pub struct AudioPart {
    /// The base64-encoded audio data.
    pub audio_data: String,
    /// The format of the audio.
    pub format: AudioFormat,
    /// The sample rate of the audio. E.g. 44100, 48000.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// The number of channels of the audio. E.g. 1, 2.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub channels: Option<u32>,
    /// The transcript of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// The ID of the audio part, if applicable
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
}
pub enum AudioFormat {
    Wav,
    Mp3,
    Linear16,
    Flac,
    Mulaw,
    Alaw,
    Aac,
    Opus,
}
type AudioPart struct {
	// The base64-encoded audio data.
	AudioData string      `json:"audio_data"`
	Format    AudioFormat `json:"format"`
	// The sample rate of the audio. E.g. 44100, 48000.
	SampleRate *int `json:"sample_rate,omitempty"`
	// The number of channels of the audio. E.g. 1, 2.
	Channels *int `json:"channels,omitempty"`
	// The transcript of the audio.
	Transcript *string `json:"transcript,omitempty"`
	// The ID of the part, if applicable.
	ID *string `json:"id,omitempty"`
}
type AudioFormat string
To ensure the audio can be played correctly, the application code must consider the provided audio format, sample_rate, and channels.
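For example, when playing raw PCM, the player must be configured from those fields. A minimal TypeScript sketch, assuming the linear16 format (16-bit signed PCM) and the speaker package used in the streaming example later on this page; the fallback values are illustrative assumptions:

import Speaker from "speaker";

// Sketch: configure a PCM player from the AudioPart metadata (interface above).
// Assumes format is "linear16"; compressed formats (mp3, aac, ...) must be
// decoded before playback.
function playLinear16(part: AudioPart) {
  const speaker = new Speaker({
    sampleRate: part.sample_rate ?? 24000, // fallback value is an assumption
    channels: part.channels ?? 1,
    bitDepth: 16, // implied by "linear16"
  });
  speaker.write(Buffer.from(part.audio_data, "base64"));
}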
Depending on the provider, you may have to pass additional parameters to their API, such as voice or format, using the audio field.
interface AudioOptions {
  /**
   * The desired audio format.
   */
  format?: AudioFormat;
  /**
   * The provider-specific voice ID to use for audio generation.
   */
  voice?: string;
  /**
   * The language code for the audio generation.
   */
  language?: string;
}
pub struct AudioOptions {
    /// The format of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<AudioFormat>,
    /// The provider-specific voice ID to use for audio generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    /// The language code for the audio generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
}
type AudioOptions struct {
	// The desired audio format.
	Format *AudioFormat `json:"format,omitempty"`
	// The provider-specific voice ID to use for audio generation.
	Voice *string `json:"voice,omitempty"`
	// The language code for the audio generation.
	LanguageCode *string `json:"language_code,omitempty"`
}
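As a brief TypeScript sketch, the options are just an object passed in the input's audio field (the voice ID "alloy" is OpenAI-specific, and accepted values vary by provider; the complete call appears in the next section):

// Sketch: request MP3 output with a provider-specific voice.
const audio: AudioOptions = {
  format: "mp3",
  voice: "alloy", // check your provider's voice list
};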
Generate audio
To generate audio, specify audio in the input modalities.
import audioContext from "audio-context";
import decodeAudio from "audio-decode";
import play from "audio-play";
import { getModel } from "./get-model.ts";
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const response = await model.generate({
  modalities: ["text", "audio"],
  audio: {
    format: "mp3",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});
console.dir(response, { depth: null });
const audioPart = response.content.find((part) => part.type === "audio");
if (audioPart) {
  const audioBuffer = await decodeAudio(
    Buffer.from(audioPart.audio_data, "base64"),
  );
  const playback = play(
    audioBuffer,
    { context: audioContext } as unknown as play.Options,
    () => {
      console.log("Playback finished");
    },
  );
  playback.play();
}
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};
use dotenvy::dotenv;
use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part};
use std::io::Cursor;
mod common;
#[tokio::main]
async fn main() {
    dotenv().ok();
let model = common::get_model("openai", "gpt-4o-audio-preview");
    let response = model
        .generate(LanguageModelInput {
            modalities: Some(vec![Modality::Text, Modality::Audio]),
            messages: vec![Message::user(vec![Part::text(
                "Is a golden retriever a good family dog?",
            )])],
            audio: Some(AudioOptions {
                format: Some(AudioFormat::Mp3),
                voice: Some("alloy".into()),
                ..Default::default()
            }),
            ..Default::default()
        })
        .await
        .expect("model.generate failed");
println!("{response:#?}");
    if let Some(audio_part) = response.content.iter().find_map(|p| match p {
        Part::Audio(a) => Some(a),
        _ => None,
    }) {
        let audio_bytes = BASE64_STANDARD
            .decode(&audio_part.audio_data)
            .expect("invalid base64 audio data");

        let cursor = Cursor::new(audio_bytes);
        let (_stream, stream_handle) = rodio::OutputStream::try_default().unwrap();
        let sink = rodio::Sink::try_new(&stream_handle).unwrap();

        let source = rodio::Decoder::new(cursor).unwrap();
        sink.append(source);

        sink.sleep_until_end();
    }
}
package main
import (
	"bytes"
	"context"
	"encoding/base64"
	"log"
	"time"

	"github.com/ebitengine/oto/v3"
	"github.com/hajimehoshi/go-mp3"
	llmsdk "github.com/hoangvvo/llm-sdk/sdk-go"
	"github.com/hoangvvo/llm-sdk/sdk-go/examples"
	"github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr"
	"github.com/sanity-io/litter"
)
func main() {
	model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
	response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{
		Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio},
		Audio: &llmsdk.AudioOptions{
			Format: ptr.To(llmsdk.AudioFormatMP3),
			Voice:  ptr.To("alloy"),
		},
		Messages: []llmsdk.Message{
			llmsdk.NewUserMessage(
				llmsdk.NewTextPart("Is a golden retriever a good family dog?"),
			),
		},
	})
	if err != nil {
		log.Fatalf("Generation failed: %v", err)
	}
litter.Dump(response)
	// Find and play audio part
	for _, part := range response.Content {
		if part.AudioPart != nil {
			audioData, err := base64.StdEncoding.DecodeString(part.AudioPart.AudioData)
			if err != nil {
				log.Printf("Failed to decode audio data: %v", err)
				continue
			}

			// Decode MP3 audio data
			decoder, err := mp3.NewDecoder(bytes.NewReader(audioData))
			if err != nil {
				log.Printf("Failed to create MP3 decoder: %v", err)
				continue
			}

			// Initialize audio context for MP3 playback
			op := &oto.NewContextOptions{
				SampleRate:   decoder.SampleRate(),
				ChannelCount: 2, // go-mp3 always decodes to 2-channel 16-bit PCM
				Format:       oto.FormatSignedInt16LE,
			}

			otoContext, ready, err := oto.NewContext(op)
			if err != nil {
				log.Printf("Failed to create audio context: %v", err)
				continue
			}
			<-ready

			audioPlayer := otoContext.NewPlayer(decoder)
			audioPlayer.Play()

			// Wait for playback to complete without spinning the CPU
			for audioPlayer.IsPlaying() {
				time.Sleep(10 * time.Millisecond)
			}

			audioPlayer.Close()
		}
	}
}
Stream audio
Audio generation can also be streamed using the stream() method. In streamed responses, AudioPart is represented as AudioPartDelta:
interface AudioPartDelta {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  audio_data?: string;
  format?: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * The ID of the audio part, if applicable
   */
  id?: string;
}
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";
pub struct AudioPartDelta {
    /// The base64-encoded audio data.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_data: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<AudioFormat>,
    /// The sample rate of the audio. E.g. 44100, 48000.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// The number of channels of the audio. E.g. 1, 2.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub channels: Option<u32>,
    /// The transcript of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// The ID of the audio part, if applicable
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
}
pub enum AudioFormat {
    Wav,
    Mp3,
    Linear16,
    Flac,
    Mulaw,
    Alaw,
    Aac,
    Opus,
}
type AudioPartDelta struct {
	AudioData  *string      `json:"audio_data,omitempty"`
	Format     *AudioFormat `json:"format,omitempty"`
	SampleRate *int         `json:"sample_rate,omitempty"`
	Channels   *int         `json:"channels,omitempty"`
	Transcript *string      `json:"transcript,omitempty"`
	ID         *string      `json:"id,omitempty"`
}
type AudioFormat string
Individual audio_data chunks can be played back as they are received, or combined to create the final audio output.
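For instance, a minimal TypeScript sketch of combining the chunks, reusing the next() loop style from the full streaming example below (raw PCM such as linear16 concatenates cleanly; container formats like wav or mp3 may need format-aware joining):

// Sketch: accumulate each decoded chunk to rebuild the complete audio.
// `response` is the stream returned by model.stream(...), as below.
const chunks: Buffer[] = [];
let current = await response.next();
while (!current.done) {
  const part = current.value.delta?.part;
  if (part?.type === "audio" && part.audio_data) {
    chunks.push(Buffer.from(part.audio_data, "base64"));
  }
  current = await response.next();
}
const finalAudio = Buffer.concat(chunks);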
import Speaker from "speaker";
import { getModel } from "./get-model.ts";
let speaker: Speaker | undefined;
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const response = model.stream({
  modalities: ["text", "audio"],
  audio: {
    format: "linear16",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});
let current = await response.next();
while (!current.done) {
  console.dir(current.value, { depth: null });
  const part = current.value.delta?.part;
  if (part?.type === "audio") {
    if (part.audio_data) {
      speaker =
        speaker ??
        new Speaker({
          sampleRate: part.sample_rate ?? 24000,
          bitDepth: 16,
          channels: part.channels ?? 1,
        });
      speaker.write(Buffer.from(part.audio_data, "base64"));
    }
  }
  current = await response.next();
}
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};
use dotenvy::dotenv;
use futures::StreamExt;
use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part, PartDelta};
use rodio::{buffer::SamplesBuffer, OutputStream, Sink};
mod common;
fn bytes_to_i16_le_samples(bytes: &[u8]) -> Vec<i16> {
    let mut out = Vec::with_capacity(bytes.len() / 2);
    for chunk in bytes.chunks_exact(2) {
        out.push(i16::from_le_bytes([chunk[0], chunk[1]]));
    }
    out
}
#[tokio::main]
async fn main() {
    dotenv().ok();
let model = common::get_model("openai", "gpt-4o-audio-preview");
    let mut stream = model
        .stream(LanguageModelInput {
            modalities: Some(vec![Modality::Text, Modality::Audio]),
            messages: vec![Message::user(vec![Part::text(
                "Is a golden retriever a good family dog?",
            )])],
            audio: Some(AudioOptions {
                format: Some(AudioFormat::Linear16),
                voice: Some("alloy".into()),
                ..Default::default()
            }),
            ..Default::default()
        })
        .await
        .expect("failed to start stream");
    // Lazy init: keep OutputStream alive and reuse its Sink
    let mut out_stream: Option<(OutputStream, rodio::OutputStreamHandle, Sink)> = None;
    while let Some(item) = stream.next().await {
        let chunk = item.expect("stream error");
        println!("{chunk:#?}");
        if let Some(delta) = chunk.delta {
            if let PartDelta::Audio(a) = delta.part {
                // audio_data is Option<String>; only play when data arrived
                if let Some(b64_data) = a.audio_data {
                    let sample_rate: u32 = a.sample_rate.unwrap_or(24_000);
                    let channels: u32 = a.channels.unwrap_or(1);
                    // Ensure audio output is initialized
                    if out_stream.is_none() {
                        let (s, h) =
                            OutputStream::try_default().expect("no default audio output device");
                        let k = Sink::try_new(&h).expect("failed creating sink");
                        out_stream = Some((s, h, k));
                    }

                    // Get a mutable handle to the sink
                    let sink = &mut out_stream.as_mut().unwrap().2;

                    // Decode base64 -> bytes -> i16 samples
                    let bytes = BASE64_STANDARD
                        .decode(b64_data.as_bytes())
                        .expect("invalid base64 audio");
                    let samples = bytes_to_i16_le_samples(&bytes);

                    // Append chunk to the sink (seamless streaming)
                    let source =
                        SamplesBuffer::new(u16::try_from(channels).unwrap(), sample_rate, samples);
                    sink.append(source);
                }
            }
        }
    }

    // Let the queued audio finish
    if let Some((_, _, sink)) = out_stream {
        sink.sleep_until_end();
        println!("Playback finished");
    }
}
package main
import (
	"context"
	"encoding/base64"
	"fmt"
	"io"
	"log"
	"sync"

	"github.com/ebitengine/oto/v3"
	llmsdk "github.com/hoangvvo/llm-sdk/sdk-go"
	"github.com/hoangvvo/llm-sdk/sdk-go/examples"
	"github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr"
	"github.com/sanity-io/litter"
)
func main() {
	model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
	response, err := model.Stream(context.Background(), &llmsdk.LanguageModelInput{
		Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio},
		Audio: &llmsdk.AudioOptions{
			Format: ptr.To(llmsdk.AudioFormatLinear16),
			Voice:  ptr.To("alloy"),
		},
		Messages: []llmsdk.Message{
			llmsdk.NewUserMessage(
				llmsdk.NewTextPart("Is a golden retriever a good family dog?"),
			),
		},
	})
	if err != nil {
		log.Fatalf("Stream failed: %v", err)
	}
	// Create a pipe for streaming audio
	audioReader, audioWriter := io.Pipe()
	var otoContext *oto.Context
	var audioPlayer *oto.Player
	var wg sync.WaitGroup
	var audioInitialized bool
accumulator := llmsdk.NewStreamAccumulator()
	for response.Next() {
		current := response.Current()
		litter.Dump(current)
		if current.Delta != nil && current.Delta.Part.AudioPartDelta != nil {
			audioDelta := current.Delta.Part.AudioPartDelta
			if audioDelta.AudioData != nil {
				audioData, err := base64.StdEncoding.DecodeString(*audioDelta.AudioData)
				if err != nil {
					log.Printf("Failed to decode audio chunk: %v", err)
					continue
				}

				// Initialize audio context and player on first chunk
				if !audioInitialized {
					sampleRate := 24000
					if audioDelta.SampleRate != nil {
						sampleRate = int(*audioDelta.SampleRate)
					}

					channels := 1
					if audioDelta.Channels != nil {
						channels = int(*audioDelta.Channels)
					}

					op := &oto.NewContextOptions{
						SampleRate:   sampleRate,
						ChannelCount: channels,
						Format:       oto.FormatSignedInt16LE,
					}

					var ready chan struct{}
					otoContext, ready, err = oto.NewContext(op)
					if err != nil {
						log.Printf("Failed to create audio context: %v", err)
						continue
					}
					<-ready

					audioPlayer = otoContext.NewPlayer(audioReader)
					fmt.Printf("Initialized audio playback (sample rate: %d, channels: %d)\n", sampleRate, channels)

					// Start playback in a goroutine
					wg.Add(1)
					go func() {
						defer wg.Done()
						audioPlayer.Play()
						fmt.Println("Audio playback started")
					}()

					audioInitialized = true
				}

				// Stream audio chunk immediately
				_, err = audioWriter.Write(audioData)
				if err != nil {
					log.Printf("Failed to write audio chunk: %v", err)
				}
			}
		}
		if err := accumulator.AddPartial(*current); err != nil {
			log.Printf("Failed to add partial: %v", err)
		}
	}
	if err := response.Err(); err != nil {
		log.Fatalf("Stream error: %v", err)
	}
	// Close the writer to signal end of audio stream
	if audioInitialized {
		audioWriter.Close()
		fmt.Println("Waiting for audio playback to finish...")
		wg.Wait()
		audioPlayer.Close()
		fmt.Println("Audio playback finished")
	}
	finalResponse, err := accumulator.ComputeResponse()
	if err != nil {
		log.Fatalf("Failed to compute response: %v", err)
	}
	litter.Dump(finalResponse)
}