Audio generation
The audio modality is represented as AudioParts.
interface AudioPart {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  audio_data: string;
  format: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * ID of the audio part, if applicable
   */
  id?: string;
}
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";
pub struct AudioPart {
    /// The base64-encoded audio data.
    pub audio_data: String,
    /// The format of the audio.
    pub format: AudioFormat,
    /// The sample rate of the audio. E.g. 44100, 48000.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// The number of channels of the audio. E.g. 1, 2.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub channels: Option<u32>,
    /// The transcript of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// The ID of the audio part, if applicable
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
}
pub enum AudioFormat {
    Wav,
    Mp3,
    Linear16,
    Flac,
    Mulaw,
    Alaw,
    Aac,
    Opus,
}
type AudioPart struct {
	// The base64-encoded audio data.
	AudioData string      `json:"audio_data"`
	Format    AudioFormat `json:"format"`
	// The sample rate of the audio. E.g. 44100, 48000.
	SampleRate *int `json:"sample_rate,omitempty"`
	// The number of channels of the audio. E.g. 1, 2.
	Channels *int `json:"channels,omitempty"`
	// The transcript of the audio.
	Transcript *string `json:"transcript,omitempty"`
	// The ID of the part, if applicable.
	ID *string `json:"id,omitempty"`
}
type AudioFormat string
To ensure the audio can be played correctly, the application code must consider the provided audio format, sample_rate, and channels.
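For example, when playing raw PCM, the player must be configured from those fields. A minimal TypeScript sketch, assuming the linear16 format (16-bit signed PCM) and the speaker package used in the streaming example later on this page; the fallback values are illustrative assumptions:

import Speaker from "speaker";

// Sketch: configure a PCM player from the AudioPart metadata (interface above).
// Assumes format is "linear16"; compressed formats (mp3, aac, ...) must be
// decoded before playback.
function playLinear16(part: AudioPart) {
  const speaker = new Speaker({
    sampleRate: part.sample_rate ?? 24000, // fallback value is an assumption
    channels: part.channels ?? 1,
    bitDepth: 16, // implied by "linear16"
  });
  speaker.write(Buffer.from(part.audio_data, "base64"));
}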
Depending on the provider, you may have to pass additional parameters to their API, such as voice or format, using the audio field.
interface AudioOptions {
  /**
   * The desired audio format.
   */
  format?: AudioFormat;
  /**
   * The provider-specific voice ID to use for audio generation.
   */
  voice?: string;
  /**
   * The language code for the audio generation.
   */
  language?: string;
}
pub struct AudioOptions {
    /// The format of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<AudioFormat>,
    /// The provider-specific voice ID to use for audio generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    /// The language code for the audio generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
}
type AudioOptions struct {
	// The desired audio format.
	Format *AudioFormat `json:"format,omitempty"`
	// The provider-specific voice ID to use for audio generation.
	Voice *string `json:"voice,omitempty"`
	// The language code for the audio generation.
	LanguageCode *string `json:"language_code,omitempty"`
}
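As a brief TypeScript sketch, the options are just an object passed in the input's audio field (the voice ID "alloy" is OpenAI-specific, and accepted values vary by provider; the complete call appears in the next section):

// Sketch: request MP3 output with a provider-specific voice.
const audio: AudioOptions = {
  format: "mp3",
  voice: "alloy", // check your provider's voice list
};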
Generate audio
To generate audio, specify audio in the input modalities.
import audioContext from "audio-context";
import decodeAudio from "audio-decode";
import play from "audio-play";
import { getModel } from "./get-model.ts";
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const response = await model.generate({
  modalities: ["text", "audio"],
  audio: {
    format: "mp3",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});
console.dir(response, { depth: null });
const audioPart = response.content.find((part) => part.type === "audio");
if (audioPart) {
  const audioBuffer = await decodeAudio(
    Buffer.from(audioPart.audio_data, "base64"),
  );
  const playback = play(
    audioBuffer,
    { context: audioContext } as unknown as play.Options,
    () => {
      console.log("Playback finished");
    },
  );
  playback.play();
}
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};
use dotenvy::dotenv;
use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part};
use std::io::Cursor;
mod common;
#[tokio::main]
async fn main() {
    dotenv().ok();
let model = common::get_model("openai", "gpt-4o-audio-preview");
    let response = model
        .generate(LanguageModelInput {
            modalities: Some(vec![Modality::Text, Modality::Audio]),
            messages: vec![Message::user(vec![Part::text(
                "Is a golden retriever a good family dog?",
            )])],
            audio: Some(AudioOptions {
                format: Some(AudioFormat::Mp3),
                voice: Some("alloy".into()),
                ..Default::default()
            }),
            ..Default::default()
        })
        .await
        .expect("model.generate failed");
println!("{response:#?}");
    if let Some(audio_part) = response.content.iter().find_map(|p| match p {
        Part::Audio(a) => Some(a),
        _ => None,
    }) {
        let audio_bytes = BASE64_STANDARD
            .decode(&audio_part.audio_data)
            .expect("invalid base64 audio data");

        let cursor = Cursor::new(audio_bytes);
        let (_stream, stream_handle) = rodio::OutputStream::try_default().unwrap();
        let sink = rodio::Sink::try_new(&stream_handle).unwrap();

        let source = rodio::Decoder::new(cursor).unwrap();
        sink.append(source);

        sink.sleep_until_end();
    }
}
package main
import (
	"bytes"
	"context"
	"encoding/base64"
	"log"
	"time"

	"github.com/ebitengine/oto/v3"
	"github.com/hajimehoshi/go-mp3"
	llmsdk "github.com/hoangvvo/llm-sdk/sdk-go"
	"github.com/hoangvvo/llm-sdk/sdk-go/examples"
	"github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr"
	"github.com/sanity-io/litter"
)
func main() {
	model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
	response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{
		Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio},
		Audio: &llmsdk.AudioOptions{
			Format: ptr.To(llmsdk.AudioFormatMP3),
			Voice:  ptr.To("alloy"),
		},
		Messages: []llmsdk.Message{
			llmsdk.NewUserMessage(
				llmsdk.NewTextPart("Is a golden retriever a good family dog?"),
			),
		},
	})
	if err != nil {
		log.Fatalf("Generation failed: %v", err)
	}
litter.Dump(response)
	// Find and play audio part
	for _, part := range response.Content {
		if part.AudioPart != nil {
			audioData, err := base64.StdEncoding.DecodeString(part.AudioPart.AudioData)
			if err != nil {
				log.Printf("Failed to decode audio data: %v", err)
				continue
			}

			// Decode MP3 audio data
			decoder, err := mp3.NewDecoder(bytes.NewReader(audioData))
			if err != nil {
				log.Printf("Failed to create MP3 decoder: %v", err)
				continue
			}

			// Initialize audio context for MP3 playback
			op := &oto.NewContextOptions{
				SampleRate:   decoder.SampleRate(),
				ChannelCount: 2, // go-mp3 always decodes to 2-channel 16-bit PCM
				Format:       oto.FormatSignedInt16LE,
			}

			otoContext, ready, err := oto.NewContext(op)
			if err != nil {
				log.Printf("Failed to create audio context: %v", err)
				continue
			}
			<-ready

			audioPlayer := otoContext.NewPlayer(decoder)
			audioPlayer.Play()

			// Wait for playback to complete without spinning the CPU
			for audioPlayer.IsPlaying() {
				time.Sleep(10 * time.Millisecond)
			}

			audioPlayer.Close()
		}
	}
}
Stream audio
Audio generation can also be streamed using the stream() method. In streamed responses, AudioPart is represented as AudioPartDelta:
interface AudioPartDelta {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  audio_data?: string;
  format?: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * The ID of the audio part, if applicable
   */
  id?: string;
}
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";
pub struct AudioPartDelta {
    /// The base64-encoded audio data.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_data: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<AudioFormat>,
    /// The sample rate of the audio. E.g. 44100, 48000.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// The number of channels of the audio. E.g. 1, 2.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub channels: Option<u32>,
    /// The transcript of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// The ID of the audio part, if applicable
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
}
pub enum AudioFormat {
    Wav,
    Mp3,
    Linear16,
    Flac,
    Mulaw,
    Alaw,
    Aac,
    Opus,
}
type AudioPartDelta struct {
	AudioData  *string      `json:"audio_data,omitempty"`
	Format     *AudioFormat `json:"format,omitempty"`
	SampleRate *int         `json:"sample_rate,omitempty"`
	Channels   *int         `json:"channels,omitempty"`
	Transcript *string      `json:"transcript,omitempty"`
	ID         *string      `json:"id,omitempty"`
}
type AudioFormat string
Individual audio_data chunks can be played back as they are received, or combined to create the final audio output.
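For instance, a minimal TypeScript sketch of combining the chunks, reusing the next() loop style from the full streaming example below (raw PCM such as linear16 concatenates cleanly; container formats like wav or mp3 may need format-aware joining):

// Sketch: accumulate each decoded chunk to rebuild the complete audio.
// `response` is the stream returned by model.stream(...), as below.
const chunks: Buffer[] = [];
let current = await response.next();
while (!current.done) {
  const part = current.value.delta?.part;
  if (part?.type === "audio" && part.audio_data) {
    chunks.push(Buffer.from(part.audio_data, "base64"));
  }
  current = await response.next();
}
const finalAudio = Buffer.concat(chunks);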
import Speaker from "speaker";
import { getModel } from "./get-model.ts";
let speaker: Speaker | undefined;
const model = getModel("openai-chat-completion", "gpt-4o-audio-preview");
const response = model.stream({
  modalities: ["text", "audio"],
  audio: {
    format: "linear16",
    voice: "alloy",
  },
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Is a golden retriever a good family dog?",
        },
      ],
    },
  ],
});
let current = await response.next();
while (!current.done) {
  console.dir(current.value, { depth: null });
  const part = current.value.delta?.part;
  if (part?.type === "audio") {
    if (part.audio_data) {
      speaker =
        speaker ??
        new Speaker({
          sampleRate: part.sample_rate ?? 24000,
          bitDepth: 16,
          channels: part.channels ?? 1,
        });
      speaker.write(Buffer.from(part.audio_data, "base64"));
    }
  }
  current = await response.next();
}
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};
use dotenvy::dotenv;
use futures::StreamExt;
use llm_sdk::{AudioFormat, AudioOptions, LanguageModelInput, Message, Modality, Part, PartDelta};
use rodio::{buffer::SamplesBuffer, OutputStream, Sink};
mod common;
fn bytes_to_i16_le_samples(bytes: &[u8]) -> Vec<i16> {
    let mut out = Vec::with_capacity(bytes.len() / 2);
    for chunk in bytes.chunks_exact(2) {
        out.push(i16::from_le_bytes([chunk[0], chunk[1]]));
    }
    out
}
#[tokio::main]
async fn main() {
    dotenv().ok();
let model = common::get_model("openai", "gpt-4o-audio-preview");
    let mut stream = model
        .stream(LanguageModelInput {
            modalities: Some(vec![Modality::Text, Modality::Audio]),
            messages: vec![Message::user(vec![Part::text(
                "Is a golden retriever a good family dog?",
            )])],
            audio: Some(AudioOptions {
                format: Some(AudioFormat::Linear16),
                voice: Some("alloy".into()),
                ..Default::default()
            }),
            ..Default::default()
        })
        .await
        .expect("failed to start stream");
    // Lazy init: keep OutputStream alive and reuse its Sink
    let mut out_stream: Option<(OutputStream, rodio::OutputStreamHandle, Sink)> = None;
    while let Some(item) = stream.next().await {
        let chunk = item.expect("stream error");
        println!("{chunk:#?}");
        if let Some(delta) = chunk.delta {
            if let PartDelta::Audio(a) = delta.part {
                // audio_data is Option<String>; only play when data arrived
                if let Some(b64_data) = a.audio_data {
                    let sample_rate: u32 = a.sample_rate.unwrap_or(24_000);
                    let channels: u32 = a.channels.unwrap_or(1);
                    // Ensure audio output is initialized
                    if out_stream.is_none() {
                        let (s, h) =
                            OutputStream::try_default().expect("no default audio output device");
                        let k = Sink::try_new(&h).expect("failed creating sink");
                        out_stream = Some((s, h, k));
                    }

                    // Get a mutable handle to the sink
                    let sink = &mut out_stream.as_mut().unwrap().2;

                    // Decode base64 -> bytes -> i16 samples
                    let bytes = BASE64_STANDARD
                        .decode(b64_data.as_bytes())
                        .expect("invalid base64 audio");
                    let samples = bytes_to_i16_le_samples(&bytes);

                    // Append chunk to the sink (seamless streaming)
                    let source =
                        SamplesBuffer::new(u16::try_from(channels).unwrap(), sample_rate, samples);
                    sink.append(source);
                }
            }
        }
    }

    // Let the queued audio finish
    if let Some((_, _, sink)) = out_stream {
        sink.sleep_until_end();
        println!("Playback finished");
    }
}
package main
import (
	"context"
	"encoding/base64"
	"fmt"
	"io"
	"log"
	"sync"

	"github.com/ebitengine/oto/v3"
	llmsdk "github.com/hoangvvo/llm-sdk/sdk-go"
	"github.com/hoangvvo/llm-sdk/sdk-go/examples"
	"github.com/hoangvvo/llm-sdk/sdk-go/utils/ptr"
	"github.com/sanity-io/litter"
)
func main() {
	model := examples.GetModel("openai-chat-completion", "gpt-4o-audio-preview")
	response, err := model.Stream(context.Background(), &llmsdk.LanguageModelInput{
		Modalities: []llmsdk.Modality{llmsdk.ModalityText, llmsdk.ModalityAudio},
		Audio: &llmsdk.AudioOptions{
			Format: ptr.To(llmsdk.AudioFormatLinear16),
			Voice:  ptr.To("alloy"),
		},
		Messages: []llmsdk.Message{
			llmsdk.NewUserMessage(
				llmsdk.NewTextPart("Is a golden retriever a good family dog?"),
			),
		},
	})
	if err != nil {
		log.Fatalf("Stream failed: %v", err)
	}
	// Create a pipe for streaming audio
	audioReader, audioWriter := io.Pipe()
	var otoContext *oto.Context
	var audioPlayer *oto.Player
	var wg sync.WaitGroup
	var audioInitialized bool
accumulator := llmsdk.NewStreamAccumulator()
	for response.Next() {
		current := response.Current()
		litter.Dump(current)
		if current.Delta != nil && current.Delta.Part.AudioPartDelta != nil {
			audioDelta := current.Delta.Part.AudioPartDelta
			if audioDelta.AudioData != nil {
				audioData, err := base64.StdEncoding.DecodeString(*audioDelta.AudioData)
				if err != nil {
					log.Printf("Failed to decode audio chunk: %v", err)
					continue
				}

				// Initialize audio context and player on first chunk
				if !audioInitialized {
					sampleRate := 24000
					if audioDelta.SampleRate != nil {
						sampleRate = int(*audioDelta.SampleRate)
					}

					channels := 1
					if audioDelta.Channels != nil {
						channels = int(*audioDelta.Channels)
					}

					op := &oto.NewContextOptions{
						SampleRate:   sampleRate,
						ChannelCount: channels,
						Format:       oto.FormatSignedInt16LE,
					}

					var ready chan struct{}
					otoContext, ready, err = oto.NewContext(op)
					if err != nil {
						log.Printf("Failed to create audio context: %v", err)
						continue
					}
					<-ready

					audioPlayer = otoContext.NewPlayer(audioReader)
					fmt.Printf("Initialized audio playback (sample rate: %d, channels: %d)\n", sampleRate, channels)

					// Start playback in a goroutine
					wg.Add(1)
					go func() {
						defer wg.Done()
						audioPlayer.Play()
						fmt.Println("Audio playback started")
					}()

					audioInitialized = true
				}

				// Stream audio chunk immediately
				_, err = audioWriter.Write(audioData)
				if err != nil {
					log.Printf("Failed to write audio chunk: %v", err)
				}
			}
		}
		if err := accumulator.AddPartial(*current); err != nil {
			log.Printf("Failed to add partial: %v", err)
		}
	}
	if err := response.Err(); err != nil {
		log.Fatalf("Stream error: %v", err)
	}
	// Close the writer to signal end of audio stream
	if audioInitialized {
		audioWriter.Close()
		fmt.Println("Waiting for audio playback to finish...")
		wg.Wait()
		audioPlayer.Close()
		fmt.Println("Audio playback finished")
	}
	finalResponse, err := accumulator.ComputeResponse()
	if err != nil {
		log.Fatalf("Failed to compute response: %v", err)
	}
	litter.Dump(finalResponse)
}