Image understanding
Image modality is represented as `ImagePart`s.
/** An image part of a message, carrying base64-encoded image data. */
interface ImagePart {
  type: "image";
  /** The MIME type of the image. E.g. "image/jpeg", "image/png". */
  mime_type: string;
  /** The base64-encoded image data. */
  image_data: string;
  /** The width of the image in pixels. */
  width?: number;
  /** The height of the image in pixels. */
  height?: number;
  /** ID of the image part, if applicable. */
  id?: string;
}
pub struct ImagePart { /// The MIME type of the image. E.g. "image/jpeg", "image/png". pub mime_type: String, /// The base64-encoded image data. pub image_data: String, /// The width of the image in pixels. #[serde(skip_serializing_if = "Option::is_none")] pub width: Option<u32>, /// The height of the image in pixels. #[serde(skip_serializing_if = "Option::is_none")] pub height: Option<u32>, /// The ID of the image part, if applicable #[serde(skip_serializing_if = "Option::is_none")] pub id: Option<String>,}
// ImagePart is an image part of a message, carrying base64-encoded image data.
type ImagePart struct {
	// The MIME type of the image. E.g. "image/jpeg", "image/png".
	MimeType string `json:"mime_type"`
	// The base64-encoded image data.
	ImageData string `json:"image_data"`
	// The width of the image in pixels.
	Width *int `json:"width,omitempty"`
	// The height of the image in pixels.
	Height *int `json:"height,omitempty"`
	// ID of the image part, if applicable.
	ID *string `json:"id,omitempty"`
}
Images can be sent to the model as `ImagePart` objects.
import { getModel } from "./get-model.ts";

// Download a sample image and capture its raw bytes.
const imageUrl = "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0";
const imageRes = await fetch(imageUrl);
const image = await imageRes.arrayBuffer();

const model = getModel("openai", "gpt-4o");

// Send the image alongside a text prompt as parts of a single user message.
// The bytes are base64-encoded and tagged with the server-reported MIME type,
// falling back to "image/jpeg" when the Content-Type header is absent.
const response = await model.generate({
  messages: [
    {
      role: "user",
      content: [
        {
          type: "text",
          text: "Describe this image",
        },
        {
          type: "image",
          image_data: Buffer.from(image).toString("base64"),
          mime_type: imageRes.headers.get("content-type") ?? "image/jpeg",
        },
      ],
    },
  ],
});

console.dir(response, { depth: null });
use base64::{engine::general_purpose::STANDARD as BASE64_STANDARD, Engine};use dotenvy::dotenv;use llm_sdk::{LanguageModelInput, Message, Part};
mod common;
#[tokio::main]
async fn main() {
    // Load environment variables (e.g. API keys) from a .env file if present.
    dotenv().ok();

    // Download a sample image and remember its reported content type,
    // falling back to "image/jpeg" when the header is absent or non-ASCII.
    let image_url = "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0";
    let image_res = reqwest::get(image_url)
        .await
        .expect("failed to fetch image");
    let mime_type = image_res
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|ct| ct.to_str().ok())
        .unwrap_or("image/jpeg")
        .to_string();
    let image_bytes = image_res.bytes().await.expect("failed to read bytes");
    let image_b64 = BASE64_STANDARD.encode(&image_bytes);

    let model = common::get_model("openai", "gpt-4o");

    // Send the base64-encoded image alongside a text prompt as parts of a
    // single user message.
    let response = model
        .generate(LanguageModelInput {
            messages: vec![Message::user(vec![
                Part::text("Describe this image"),
                Part::image(image_b64, mime_type),
            ])],
            ..Default::default()
        })
        .await
        .expect("model.generate failed");

    println!("{response:#?}");
}
package main
import ( "context" "encoding/base64" "io" "log" "net/http"
llmsdk "github.com/hoangvvo/llm-sdk/sdk-go" "github.com/hoangvvo/llm-sdk/sdk-go/examples" "github.com/sanity-io/litter")
func main() { imageURL := "https://images.unsplash.com/photo-1464809142576-df63ca4ed7f0"
resp, err := http.Get(imageURL) if err != nil { log.Fatalf("Failed to fetch image: %v", err) } defer resp.Body.Close()
imageBytes, err := io.ReadAll(resp.Body) if err != nil { log.Fatalf("Failed to read image: %v", err) }
mimeType := resp.Header.Get("Content-Type") if mimeType == "" { mimeType = "image/jpeg" }
imageData := base64.StdEncoding.EncodeToString(imageBytes)
model := examples.GetModel("openai", "gpt-4o")
response, err := model.Generate(context.Background(), &llmsdk.LanguageModelInput{ Messages: []llmsdk.Message{ llmsdk.NewUserMessage( llmsdk.NewTextPart("Describe this image"), llmsdk.NewImagePart(mimeType, imageData), ), }, })
if err != nil { log.Fatalf("Generation failed: %v", err) }
litter.Dump(response)}