Image understanding
Image modality is represented as `ImagePart`s.
/** An image part of a message, carrying base64-encoded image data. */
interface ImagePart {
  type: "image";
  /** The MIME type of the image. E.g. "image/jpeg", "image/png". */
  mime_type: string;
  /** The base64-encoded image data. */
  image_data: string;
  /** The width of the image in pixels. */
  width?: number;
  /** The height of the image in pixels. */
  height?: number;
  /** ID of the image part, if applicable. */
  id?: string;
}
pub struct ImagePart { /// The MIME type of the image. E.g. "image/jpeg", "image/png". pub mime_type: String, /// The base64-encoded image data. pub image_data: String, /// The width of the image in pixels. #[serde(skip_serializing_if = "Option::is_none")] pub width: Option<u32>, /// The height of the image in pixels. #[serde(skip_serializing_if = "Option::is_none")] pub height: Option<u32>, /// The ID of the image part, if applicable #[serde(skip_serializing_if = "Option::is_none")] pub id: Option<String>,}
// ImagePart is an image part of a message, carrying base64-encoded image data.
type ImagePart struct {
	// The MIME type of the image. E.g. "image/jpeg", "image/png".
	MimeType string `json:"mime_type"`
	// The base64-encoded image data.
	ImageData string `json:"image_data"`
	// The width of the image in pixels.
	Width *int `json:"width,omitempty"`
	// The height of the image in pixels.
	Height *int `json:"height,omitempty"`
	// ID of the image part, if applicable.
	ID *string `json:"id,omitempty"`
}
Images can be sent to the model as `ImagePart` objects.
import { getModel } from "./get-model.ts";

// Download a sample image and capture its raw bytes.
const imageUrl = "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0";
const imageRes = await fetch(imageUrl);
const image = await imageRes.arrayBuffer();

const model = getModel("openai", "gpt-4o");

// Send the image alongside a text prompt as parts of a single user message.
// The bytes are base64-encoded and tagged with the server-reported MIME type,
// falling back to "image/jpeg" when the Content-Type header is absent.
const response = await model.generate({
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Describe this image",
        },
        {
          type: "image",
          image_data: Buffer.from(image).toString("base64"),
          mime_type: imageRes.headers.get("content-type") ?? "image/jpeg",
        },
      ],
    },
  ],
});

console.dir(response, { depth: null });
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};use dotenvy::dotenv;use llm_sdk::{LanguageModelInput, Message, Part};
mod common;
#[tokio::main]
async fn main() {
    // Load environment variables (e.g. API keys) from a .env file if present.
    dotenv().ok();

    // Download a sample image and remember its reported content type,
    // falling back to "image/jpeg" when the header is absent or non-ASCII.
    let image_url = "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0";
    let image_res = reqwest::get(image_url)
        .await
        .expect("failed to fetch image");
    let mime_type = image_res
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|ct| ct.to_str().ok())
        .unwrap_or("image/jpeg")
        .to_string();
    let image_bytes = image_res.bytes().await.expect("failed to read bytes");
    let image_b64 = BASE64_STANDARD.encode(&image_bytes);

    let model = common::get_model("openai", "gpt-4o");

    // Send the base64-encoded image alongside a text prompt as parts of a
    // single user message.
    let response = model
        .generate(LanguageModelInput {
            messages: vec![Message::user(vec![
                Part::text("Describe this image"),
                Part::image(image_b64, mime_type),
            ])],
            ..Default::default()
        })
        .await
        .expect("model.generate failed");

    println!("{response:#?}");
}
package main
import ( "context" "encoding/base64" "io" "log" "net/http"
llmsdk "github.com/hoangvvo/llm-sdk/sdk-go" "github.com/hoangvvo/llm-sdk/sdk-go/examples" "github.com/sanity-io/litter")
func main() { imageURL := "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0"
resp, err := http.Get(imageURL) if err != nil { log.Fatalf("Failed to fetch image: %v", err) } defer resp.Body.Close()
imageBytes, err := io.ReadAll(resp.Body) if err != nil { log.Fatalf("Failed to read image: %v", err) }
mimeType := resp.Header.Get("Content-Type") if mimeType == "" { mimeType = "image/jpeg" }
imageData := base64.StdEncoding.EncodeToString(imageBytes)
model := examples.GetModel("openai", "gpt-4o")
response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{ Messages: []llmsdk.Message{ llmsdk.NewUserMessage( llmsdk.NewTextPart("Describe this image"), llmsdk.NewImagePart(mimeType, imageData), ), }, })
if err != nil { log.Fatalf("Generation failed: %v", err) }
litter.Dump(response)}