Add vision support in llama-server (#901)

* server: add support for vision model
webui: add support for vision model

* server : remove hack for extra parallel slot #10187

* llama : fix KV shift for qwen2vl #13870

* add no-context-shift parameter

---------

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Committed: 2025-11-05 08:43:46 +00:00 (via GitHub)
Parent: 92607d44c4
Commit: 7978f04996
26 changed files with 2456 additions and 729 deletions


@@ -3,6 +3,7 @@ import {
APIMessage,
CanvasData,
Conversation,
LlamaCppServerProps,
Message,
PendingMessage,
ViewingChat,
@@ -12,6 +13,7 @@ import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
getServerProps
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation, useNavigate } from 'react-router';
@@ -54,6 +56,10 @@ interface AppContextValue {
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
// props
serverProps: LlamaCppServerProps | null;
}
// this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -82,6 +88,9 @@ export const AppContextProvider = ({
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [serverProps, setServerProps] = useState<LlamaCppServerProps | null>(
null
);
const [viewingChat, setViewingChat] = useState<ViewingChat | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
@@ -93,6 +102,20 @@ export const AppContextProvider = ({
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// get server props
useEffect(() => {
getServerProps(BASE_URL, config.apiKey)
.then((props) => {
console.debug('Server props:', props);
setServerProps(props);
})
.catch((err) => {
console.error(err);
toast.error('Failed to fetch server props');
});
// eslint-disable-next-line
}, []);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
@@ -469,6 +492,7 @@ export const AppContextProvider = ({
saveConfig,
showSettings,
setShowSettings,
serverProps,
}}
>
{children}
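
The `serverProps` exposed through the app context is what lets the rest of the webui discover whether the loaded model has vision (or audio) support. Below is a minimal sketch of how a component might gate an image-upload control on it; `useAppContext` is assumed here to be the hook that reads `AppContextValue`, and the button itself is purely illustrative:

```tsx
// Sketch only: gating an attach-image control on the server's reported modalities.
// serverProps stays null until GET /props resolves, so treat that as "not yet known".
import { useAppContext } from '../utils/app.context'; // assumed export location

export function AttachImageButton() {
  const { serverProps } = useAppContext();
  const visionSupported = serverProps?.modalities?.vision ?? false;

  return (
    <button
      disabled={!visionSupported}
      title={
        visionSupported
          ? 'Attach an image'
          : 'The loaded model does not support image input'
      }
    >
      Attach image
    </button>
  );
}
```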


@@ -1,6 +1,6 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { useEffect } from 'react';
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,11 +15,10 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);
export const useVSCodeContext = (
textarea: ChatTextareaApi,
extraContext: ChatExtraContextApi
) => {
// Accept setText message from a parent window and set inputMsg and extraContext
useEffect(() => {
const handleMessage = (event: MessageEvent) => {
@@ -27,18 +26,25 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const data: SetTextEvData = event.data;
textarea.setValue(data?.text);
if (data?.context && data.context.length > 0) {
setExtraContext({
type: 'context',
content: data.context,
});
extraContext.clearItems();
extraContext.addItems([
{
type: 'context',
name: 'Extra context',
content: data.context,
},
]);
}
textarea.focus();
setTimeout(() => {
textarea.refOnSubmit.current?.();
}, 10); // wait for setExtraContext to finish
}
};
window.addEventListener('message', handleMessage);
return () => window.removeEventListener('message', handleMessage);
}, [textarea]);
}, [textarea, extraContext]);
// Add a keydown listener that sends the "escapePressed" message to the parent window
useEffect(() => {
@@ -52,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
return () => window.removeEventListener('keydown', handleKeyDown);
}, []);
return {
extraContext,
// call once the user message is sent, to clear the extra context
clearExtraContext: () => setExtraContext(null),
};
return {};
};
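
With this change the hook no longer owns its own extra-context state; the caller passes in the shared `ChatExtraContextApi`, and incoming `setText` messages are appended through `clearItems()`/`addItems()`. A rough sketch of the call site, assuming `useChatTextarea` and `useChatExtraContext` are the hooks behind the two imported APIs (the argument shapes and the real `ChatScreen` wiring may differ):

```tsx
// Sketch: wiring the reworked hook. Hook names follow the imports in the diff above;
// constructor arguments and the surrounding component are illustrative.
import { useChatTextarea } from '../components/useChatTextarea.ts';
import { useChatExtraContext } from '../components/useChatExtraContext.tsx';
import { useVSCodeContext } from './llama-vscode'; // assumed module path

export function ChatScreen() {
  const textarea = useChatTextarea(''); // initial-value argument is an assumption
  const extraContext = useChatExtraContext();

  // setText messages posted by the embedding editor (e.g. llama-vscode) now land
  // in extraContext via clearItems()/addItems() instead of a local useState.
  useVSCodeContext(textarea, extraContext);

  return null; // rendering omitted in this sketch
}
```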


@@ -1,6 +1,6 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
import { APIMessage, Message, LlamaCppServerProps, APIMessageContentPart } from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
@@ -57,21 +57,55 @@ export const copyStr = (textToCopy: string) => {
*/
export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
return messages.map((msg) => {
let newContent = '';
if (msg.role !== 'user' || !msg.extra) {
return {
role: msg.role,
content: msg.content,
} as APIMessage;
}
// extra content first, then the user text message at the end
// this allows re-using the same cache prefix for long context
const contentArr: APIMessageContentPart[] = [];
for (const extra of msg.extra ?? []) {
if (extra.type === 'context') {
if (extra.content!='') {
newContent += `${extra.content}\n\n`;
}
contentArr.push({
type: 'text',
text: extra.content,
});
} else if (extra.type === 'textFile') {
contentArr.push({
type: 'text',
text: `File: ${extra.name}\nContent:\n\n${extra.content}`,
});
} else if (extra.type === 'imageFile') {
contentArr.push({
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else if (extra.type === 'audioFile') {
contentArr.push({
type: 'input_audio',
input_audio: {
data: extra.base64Data,
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
},
});
} else {
throw new Error('Unknown extra type');
}
}
newContent += msg.content;
// add user message to the end
contentArr.push({
type: 'text',
text: msg.content,
});
return {
role: msg.role,
content: newContent,
content: contentArr,
};
}) as APIMessage[];
}
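
To make the new wire format concrete: for a user message with an attached image, `normalizeMsgsForAPI` now emits an OpenAI-style content array instead of a flattened string. The values below are illustrative only:

```ts
// Illustrative only — not part of the diff. Extras come first and the typed user
// message last, so a long shared prefix can keep reusing the same cache.
const input = {
  role: 'user',
  content: 'What is in this picture?',
  extra: [
    {
      type: 'imageFile',
      name: 'cat.png',
      base64Url: 'data:image/png;base64,iVBORw0KG...',
    },
  ],
};

// normalizeMsgsForAPI([input]) produces roughly:
const output = {
  role: 'user',
  content: [
    { type: 'image_url', image_url: { url: 'data:image/png;base64,iVBORw0KG...' } },
    { type: 'text', text: 'What is in this picture?' },
  ],
};
```
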
@@ -137,3 +171,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => {
});
window.history.replaceState({}, '', url.toString());
};
export const getServerProps = async (
baseUrl: string,
apiKey?: string
): Promise<LlamaCppServerProps> => {
try {
const response = await fetch(`${baseUrl}/props`, {
headers: {
'Content-Type': 'application/json',
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
},
});
if (!response.ok) {
throw new Error('Failed to fetch server props');
}
const data = await response.json();
return data as LlamaCppServerProps;
} catch (error) {
console.error('Error fetching server props:', error);
throw error;
}
};
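
`getServerProps` is also usable outside the provider, for example as a one-off capability probe. A small sketch, assuming the helper is exported from the webui's misc module and the server listens on the default llama-server port:

```ts
// Sketch: one-off capability probe against /props. The import path and port are
// assumptions; pass the configured API key as the second argument if one is set.
import { getServerProps } from './utils/misc';

const props = await getServerProps('http://localhost:8080');
console.log(`model: ${props.model_path}, n_ctx: ${props.n_ctx}`);
if (!props.modalities?.vision) {
  console.warn('This server reports no vision support; image input will not work.');
}
```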


@@ -48,7 +48,11 @@ export interface Message {
children: Message['id'][];
}
type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraAudioFile
| MessageExtraContext;
export interface MessageExtraTextFile {
type: 'textFile';
@@ -56,12 +60,43 @@ export interface MessageExtraTextFile {
content: string;
}
export interface MessageExtraImageFile {
type: 'imageFile';
name: string;
base64Url: string;
}
export interface MessageExtraAudioFile {
type: 'audioFile';
name: string;
base64Data: string;
mimeType: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
content: string;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export type APIMessageContentPart =
| {
type: 'text';
text: string;
}
| {
type: 'image_url';
image_url: { url: string };
}
| {
type: 'input_audio';
input_audio: { data: string; format: 'wav' | 'mp3' };
};
export type APIMessage = {
role: Message['role'];
content: string | APIMessageContentPart[];
};
export interface Conversation {
id: string; // format: `conv-{timestamp}`
@@ -96,4 +131,15 @@ export interface SettingsPreset {
name: string;
createdAt: number; // timestamp from Date.now()
config: Record<string, string | number | boolean>; // partial CONFIG_DEFAULT
}
// a non-complete list of props, only contains the ones we need
export interface LlamaCppServerProps {
model_path: string;
n_ctx: number;
modalities?: {
vision: boolean;
audio: boolean;
};
// TODO: support params
}
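
The `MessageExtraImageFile` shape stores the image as a base64 data URL, which is what the `image_url` content part in `normalizeMsgsForAPI` forwards to the server. A sketch of producing one from a user-selected file; the actual upload handling lives in `useChatExtraContext`, which is outside this excerpt:

```ts
// Sketch: turning a selected image File into the MessageExtraImageFile shape above.
// FileReader.readAsDataURL yields a "data:image/...;base64,..." string directly.
import { MessageExtraImageFile } from './types';

function readImageAsExtra(file: File): Promise<MessageExtraImageFile> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () =>
      resolve({
        type: 'imageFile',
        name: file.name,
        base64Url: reader.result as string,
      });
    reader.onerror = () => reject(reader.error);
    reader.readAsDataURL(file);
  });
}
```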