Audio understanding

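To send audio to the model, include an AudioPart in the message content. It carries the base64-encoded audio data and its format, plus optional metadata such as the sample rate, channel count, and transcript.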

/**
 * A message content part that carries audio, identified by `type: "audio"`.
 * Only `type`, `data` and `format` are required; the rest is optional metadata.
 */
interface AudioPart {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  data: string;
  /**
   * The format of the audio.
   */
  format: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * ID of the audio part, if applicable
   */
  id?: string;
}

/**
 * The audio formats that can be set on `AudioPart.format`.
 */
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";

/// An audio part of a message: base64-encoded audio data, its format, and
/// optional metadata. Optional fields that are `None` are omitted when
/// serializing.
pub struct AudioPart {
    /// The base64-encoded audio data.
    pub data: String,
    /// The format of the audio.
    pub format: AudioFormat,
    /// The sample rate of the audio. E.g. 44100, 48000.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample_rate: Option<u32>,
    /// The number of channels of the audio. E.g. 1, 2.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub channels: Option<u32>,
    /// The transcript of the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// The ID of the audio part, if applicable
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
}

/// The audio formats that can be set on [`AudioPart::format`].
// NOTE(review): presumably serialized as the lowercase names used by the
// TypeScript `AudioFormat` union ("wav", "linear16", ...); the serde
// attributes are not shown here — confirm.
pub enum AudioFormat {
    Wav,
    Mp3,
    Linear16,
    Flac,
    Mulaw,
    Alaw,
    Aac,
    Opus,
}

// AudioPart is an audio part of a message: base64-encoded audio data, its
// format, and optional metadata. Nil optional fields are omitted from the
// JSON output.
type AudioPart struct {
  // The base64-encoded audio data.
  Data   string      `json:"data"`
  // The format of the audio.
  Format AudioFormat `json:"format"`
  // The sample rate of the audio. E.g. 44100, 48000.
  SampleRate *int `json:"sample_rate,omitempty"`
  // The number of channels of the audio. E.g. 1, 2.
  Channels *int `json:"channels,omitempty"`
  // The transcript of the audio.
  Transcript *string `json:"transcript,omitempty"`
  // The ID of the part, if applicable.
  ID *string `json:"id,omitempty"`
}

// AudioFormat is the string-typed audio format carried in AudioPart.Format.
type AudioFormat string

This enables use cases such as:

- Transcribing audio to text
- Summarizing spoken content
- Analyzing sentiment in speech

summarize-audio

import { getModel } from "./get-model.ts";

// Download the sample speech recording that the model will be asked about.
const audioUrl = "https://archive.org/download/MLKDream/MLKDream.ogg";
const audioRes = await fetch(audioUrl);

// fetch() resolves even for HTTP error statuses, so fail fast here instead of
// base64-encoding an error page and sending it to the model as audio.
if (!audioRes.ok) {
  throw new Error(
    `Failed to fetch audio: ${audioRes.status} ${audioRes.statusText}`,
  );
}

const audio = await audioRes.arrayBuffer();

const model = getModel("google", "gemini-2.0-flash");

// Send a single user message that pairs a text question with the audio part.
const response = await model.generate({
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "What is this speech about?",
        },
        {
          type: "audio",
          // AudioPart.data must be base64-encoded.
          data: Buffer.from(audio).toString("base64"),
          format: "opus",
        },
      ],
    },
  ],
});

// Print the full response, including nested fields.
console.dir(response, { depth: null });

use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};
use dotenvy::dotenv;
use llm_sdk::{AudioFormat, LanguageModelInput};

mod common;

/// Downloads a sample speech recording, base64-encodes it, and asks the model
/// what the speech is about, printing the response.
///
/// # Panics
///
/// Panics if the download fails or returns an HTTP error status, if the body
/// cannot be read, or if the model call fails.
#[tokio::main]
async fn main() {
    dotenv().ok();

    let audio_url = "https://archive.org/download/MLKDream/MLKDream.ogg";

    // `reqwest::get` only errors on transport failures; `error_for_status`
    // turns a 4xx/5xx response into an error so an error page is never
    // base64-encoded and sent to the model as audio.
    let audio_res = reqwest::get(audio_url)
        .await
        .expect("failed to fetch audio")
        .error_for_status()
        .expect("audio request returned an error status");
    let audio_bytes = audio_res.bytes().await.expect("failed to read bytes");

    // The audio part expects base64-encoded data.
    let audio_b64 = BASE64_STANDARD.encode(&audio_bytes);

    let model = common::get_model("google", "gemini-2.0-flash");

    // Send a single user message that pairs a text question with the audio.
    let response = model
        .generate(LanguageModelInput {
            messages: vec![llm_sdk::Message::user(vec![
                llm_sdk::Part::text("What is this speech about?"),
                llm_sdk::Part::audio(audio_b64, AudioFormat::Opus),
            ])],
            ..Default::default()
        })
        .await
        .expect("model.generate failed");

    println!("{response:#?}");
}

package main

import (
  "context"
  "encoding/base64"
  "io"
  "log"
  "net/http"

  llmsdk "github.com/hoangvvo/llm-sdk/sdk-go"
  "github.com/hoangvvo/llm-sdk/sdk-go/examples"
  "github.com/sanity-io/litter"
)

// main downloads a sample speech recording, base64-encodes it, asks the model
// what the speech is about, and dumps the response.
func main() {
  audioURL := "https://archive.org/download/MLKDream/MLKDream.ogg"

  resp, err := http.Get(audioURL)
  if err != nil {
    log.Fatalf("Failed to fetch audio: %v", err)
  }
  defer resp.Body.Close()

  // http.Get reports only transport-level failures; a non-2xx response still
  // returns a nil error, so reject it here rather than sending an error page
  // to the model as audio.
  if resp.StatusCode != http.StatusOK {
    log.Fatalf("Failed to fetch audio: unexpected status %s", resp.Status)
  }

  audioBytes, err := io.ReadAll(resp.Body)
  if err != nil {
    log.Fatalf("Failed to read audio: %v", err)
  }

  // The audio part expects base64-encoded data.
  audioData := base64.StdEncoding.EncodeToString(audioBytes)

  model := examples.GetModel("google", "gemini-2.0-flash")

  // Send a single user message that pairs a text question with the audio.
  response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{
    Messages: []llmsdk.Message{
      llmsdk.NewUserMessage(
        llmsdk.NewTextPart("What is this speech about?"),
        llmsdk.NewAudioPart(audioData, llmsdk.AudioFormatOpus),
      ),
    },
  })

  if err != nil {
    log.Fatalf("Generation failed: %v", err)
  }

  litter.Dump(response)
}