Audio understanding
Audio can be sent to the model using an AudioPart.
/**
 * A content part that carries audio data.
 */
interface AudioPart {
  type: "audio";
  /**
   * The base64-encoded audio data.
   */
  audio_data: string;
  /**
   * The format of the audio.
   */
  format: AudioFormat;
  /**
   * The sample rate of the audio. E.g. 44100, 48000.
   */
  sample_rate?: number;
  /**
   * The number of channels of the audio. E.g. 1, 2.
   */
  channels?: number;
  /**
   * The transcript of the audio.
   */
  transcript?: string;
  /**
   * ID of the audio part, if applicable
   */
  id?: string;
}
/**
 * The values allowed for the `format` field of an AudioPart.
 */
type AudioFormat =
  | "wav"
  | "mp3"
  | "linear16"
  | "flac"
  | "mulaw"
  | "alaw"
  | "aac"
  | "opus";
pub struct AudioPart { /// The base64-encoded audio data. pub audio_data: String, /// The format of the audio. pub format: AudioFormat, /// The sample rate of the audio. E.g. 44100, 48000. #[serde(skip_serializing_if = "Option::is_none")] pub sample_rate: Option<u32>, /// The number of channels of the audio. E.g. 1, 2. #[serde(skip_serializing_if = "Option::is_none")] pub channels: Option<u32>, /// The transcript of the audio. #[serde(skip_serializing_if = "Option::is_none")] pub transcript: Option<String>, /// The ID of the audio part, if applicable #[serde(skip_serializing_if = "Option::is_none")] pub id: Option<String>,}
pub enum AudioFormat { Wav, Mp3, Linear16, Flac, Mulaw, Alaw, Aac, Opus,}
type AudioPart struct { // The base64-encoded audio data. AudioData string `json:"audio_data"` Format AudioFormat `json:"format"` // The sample rate of the audio. E.g. 44100, 48000. SampleRate *int `json:"sample_rate,omitempty"` // The number of channels of the audio. E.g. 1, 2. Channels *int `json:"channels,omitempty"` // The transcript of the audio. Transcript *string `json:"transcript,omitempty"` // The ID of the part, if applicable. ID *string `json:"id,omitempty"`}
// AudioFormat is the string type used for the Format field of an AudioPart.
type AudioFormat string
This enables use cases such as:
- Transcribing audio to text
- Summarizing spoken content
- Analyzing sentiment in speech
import { getModel } from "./get-model.ts";
// Download the sample speech recording that the model will be asked about.
const audioUrl = "https://archive.org/download/MLKDream/MLKDream.ogg";
const audioRes = await fetch(audioUrl);

// fetch() resolves successfully even for HTTP error statuses, so check the
// status explicitly; otherwise an error page would be encoded and sent to the
// model as if it were audio.
if (!audioRes.ok) {
  throw new Error(
    `Failed to fetch audio: ${audioRes.status} ${audioRes.statusText}`,
  );
}

const audio = await audioRes.arrayBuffer();

const model = getModel("google", "gemini-2.0-flash");

// Send a single user message that pairs a text question with the audio part.
const response = await model.generate({
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "What is this speech about?",
        },
        {
          type: "audio",
          // AudioPart expects the raw bytes as a base64 string.
          audio_data: Buffer.from(audio).toString("base64"),
          format: "opus",
        },
      ],
    },
  ],
});

// Print the full response object without truncating nested fields.
console.dir(response, { depth: null });
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};use dotenvy::dotenv;use llm_sdk::{AudioFormat, LanguageModelInput};
mod common;
/// Downloads a sample speech recording, sends it to the model together with a
/// text question, and prints the model's response.
///
/// # Panics
///
/// Panics if the download fails, the server answers with an HTTP error
/// status, the body cannot be read, or the model call fails.
#[tokio::main]
async fn main() {
    // The result is deliberately discarded with `.ok()`.
    dotenv().ok();

    let audio_url = "https://archive.org/download/MLKDream/MLKDream.ogg";

    let audio_res = reqwest::get(audio_url)
        .await
        .expect("failed to fetch audio")
        // A 4xx/5xx reply is still `Ok` at the transport level; turn it into
        // an error so an error page is never encoded and sent as audio.
        .error_for_status()
        .expect("audio request returned an error status");
    let audio_bytes = audio_res.bytes().await.expect("failed to read bytes");

    // The audio part takes the raw bytes as a base64 string.
    let audio_b64 = BASE64_STANDARD.encode(&audio_bytes);

    let model = common::get_model("google", "gemini-2.0-flash");

    // Send a single user message that pairs a text question with the audio part.
    let response = model
        .generate(LanguageModelInput {
            messages: vec![llm_sdk::Message::user(vec![
                llm_sdk::Part::text("What is this speech about?"),
                llm_sdk::Part::audio(audio_b64, AudioFormat::Opus),
            ])],
            ..Default::default()
        })
        .await
        .expect("model.generate failed");

    println!("{response:#?}");
}
package main
import ( "context" "encoding/base64" "io" "log" "net/http"
llmsdk "github.com/hoangvvo/llm-sdk/sdk-go" "github.com/hoangvvo/llm-sdk/sdk-go/examples" "github.com/sanity-io/litter")
func main() { audioURL := "https://archive.org/download/MLKDream/MLKDream.ogg"
resp, err := http.Get(audioURL) if err != nil { log.Fatalf("Failed to fetch audio: %v", err) } defer resp.Body.Close()
audioBytes, err := io.ReadAll(resp.Body) if err != nil { log.Fatalf("Failed to read audio: %v", err) }
audioData := base64.StdEncoding.EncodeToString(audioBytes)
model := examples.GetModel("google", "gemini-2.0-flash")
response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{ Messages: []llmsdk.Message{ llmsdk.NewUserMessage( llmsdk.NewTextPart("What is this speech about?"), llmsdk.NewAudioPart(audioData, llmsdk.AudioFormatOpus), ), }, })
if err != nil { log.Fatalf("Generation failed: %v", err) }
litter.Dump(response)}