mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
Update webui to handle reasoning content and include usage stats in server only when requested (#791)
* handle reasoning content in webui; server : include usage statistics only when the user requests them (#16052); server : only attempt to enable thinking if using jinja (#15967) * configure reasoning_format in webui and change default to auto --------- Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -253,7 +253,7 @@ struct gpt_params {
|
|||||||
bool use_jinja = false; // NOLINT
|
bool use_jinja = false; // NOLINT
|
||||||
std::string system_prompt = "";
|
std::string system_prompt = "";
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
int reasoning_budget = -1;
|
int reasoning_budget = -1;
|
||||||
bool prefill_assistant = true;
|
bool prefill_assistant = true;
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -173,6 +173,7 @@ struct server_task_result {
|
|||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
bool stream;
|
bool stream;
|
||||||
|
bool include_usage;
|
||||||
std::string prompt;
|
std::string prompt;
|
||||||
//slot_params generation_params;
|
//slot_params generation_params;
|
||||||
|
|
||||||
@@ -500,22 +501,22 @@ struct server_task_result {
|
|||||||
{"model", oaicompat_model},
|
{"model", oaicompat_model},
|
||||||
{"object", "chat.completion.chunk"},
|
{"object", "chat.completion.chunk"},
|
||||||
});
|
});
|
||||||
|
if (include_usage) {
|
||||||
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
|
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
|
||||||
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
|
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
|
||||||
deltas.push_back({
|
deltas.push_back({
|
||||||
{"choices", json::array()},
|
{"choices", json::array()},
|
||||||
{"created", t},
|
{"created", t},
|
||||||
{"id", oaicompat_cmpl_id},
|
{"id", oaicompat_cmpl_id},
|
||||||
{"model", oaicompat_model},
|
{"model", oaicompat_model},
|
||||||
{"object", "chat.completion.chunk"},
|
{"object", "chat.completion.chunk"},
|
||||||
{"usage", json {
|
{"usage", json {
|
||||||
{"completion_tokens", n_decoded},
|
{"completion_tokens", n_decoded},
|
||||||
{"prompt_tokens", n_prompt_tokens},
|
{"prompt_tokens", n_prompt_tokens},
|
||||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||||
}},
|
}},
|
||||||
});
|
});
|
||||||
|
}
|
||||||
if (timings.prompt_n >= 0) {
|
if (timings.prompt_n >= 0) {
|
||||||
deltas.back().push_back({ "timings", timings.to_json() });
|
deltas.back().push_back({ "timings", timings.to_json() });
|
||||||
}
|
}
|
||||||
@@ -547,6 +548,7 @@ struct server_task_multi {
|
|||||||
|
|
||||||
struct slot_params {
|
struct slot_params {
|
||||||
bool stream = true;
|
bool stream = true;
|
||||||
|
bool include_usage = false;
|
||||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||||
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
@@ -1359,7 +1361,7 @@ struct server_context {
|
|||||||
// thinking is enabled if:
|
// thinking is enabled if:
|
||||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||||
// 2. The chat template supports it
|
// 2. The chat template supports it
|
||||||
const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||||
//LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
|
//LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
|
||||||
|
|
||||||
oai_parser_opt = {
|
oai_parser_opt = {
|
||||||
@@ -1514,6 +1516,8 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
slot.params.timings_per_token = json_value(data, "timings_per_token", false);
|
slot.params.timings_per_token = json_value(data, "timings_per_token", false);
|
||||||
slot.params.stream = json_value(data, "stream", false);
|
slot.params.stream = json_value(data, "stream", false);
|
||||||
|
auto stream_opt = json_value(data, "stream_options", json::object());
|
||||||
|
slot.params.include_usage = json_value(stream_opt, "include_usage", false);
|
||||||
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||||
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
|
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
|
||||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||||
@@ -2206,6 +2210,7 @@ struct server_context {
|
|||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = true; // to do: set value
|
res.stop = true; // to do: set value
|
||||||
res.stream = slot.params.stream;
|
res.stream = slot.params.stream;
|
||||||
|
res.include_usage = slot.params.include_usage;
|
||||||
res.content = slot.generated_text;
|
res.content = slot.generated_text;
|
||||||
res.oaicompat = slot.params.oaicompat;
|
res.oaicompat = slot.params.oaicompat;
|
||||||
res.oaicompat_model = slot.params.oaicompat_model;
|
res.oaicompat_model = slot.params.oaicompat_model;
|
||||||
|
|||||||
74
examples/server/webui/dist/index.html
vendored
74
examples/server/webui/dist/index.html
vendored
File diff suppressed because one or more lines are too long
@@ -16,6 +16,7 @@ export const CONFIG_DEFAULT = {
|
|||||||
showTokensPerSecond: false,
|
showTokensPerSecond: false,
|
||||||
showThoughtInProgress: false,
|
showThoughtInProgress: false,
|
||||||
excludeThoughtOnReq: true,
|
excludeThoughtOnReq: true,
|
||||||
|
reasoning_format: 'auto',
|
||||||
// make sure these default values are in sync with `common.h`
|
// make sure these default values are in sync with `common.h`
|
||||||
samplers: 'dkypmxnt',
|
samplers: 'dkypmxnt',
|
||||||
temperature: 0.8,
|
temperature: 0.8,
|
||||||
@@ -42,6 +43,7 @@ export const CONFIG_DEFAULT = {
|
|||||||
pyIntepreterEnabled: false,
|
pyIntepreterEnabled: false,
|
||||||
};
|
};
|
||||||
export const CONFIG_INFO: Record<string, string> = {
|
export const CONFIG_INFO: Record<string, string> = {
|
||||||
|
reasoning_format : 'Specify how to parse reasoning content. none: reasoning content in content block. auto: reasoning content in reasoning_content. ',
|
||||||
apiKey: 'Set the API Key if you are using --api-key option for the server.',
|
apiKey: 'Set the API Key if you are using --api-key option for the server.',
|
||||||
systemMessage: 'The starting message that defines how model should behave.',
|
systemMessage: 'The starting message that defines how model should behave.',
|
||||||
samplers:
|
samplers:
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import toast from 'react-hot-toast'
|
|||||||
type SettKey = keyof typeof CONFIG_DEFAULT;
|
type SettKey = keyof typeof CONFIG_DEFAULT;
|
||||||
|
|
||||||
const BASIC_KEYS: SettKey[] = [
|
const BASIC_KEYS: SettKey[] = [
|
||||||
|
'reasoning_format',
|
||||||
'temperature',
|
'temperature',
|
||||||
'top_k',
|
'top_k',
|
||||||
'top_p',
|
'top_p',
|
||||||
|
|||||||
@@ -215,7 +215,7 @@ export const AppContextProvider = ({
|
|||||||
messages,
|
messages,
|
||||||
stream: true,
|
stream: true,
|
||||||
cache_prompt: true,
|
cache_prompt: true,
|
||||||
reasoning_format: 'none',
|
reasoning_format: config.reasoning_format===''?'auto':config.reasoning_format,
|
||||||
samplers: config.samplers,
|
samplers: config.samplers,
|
||||||
temperature: config.temperature,
|
temperature: config.temperature,
|
||||||
dynatemp_range: config.dynatemp_range,
|
dynatemp_range: config.dynatemp_range,
|
||||||
@@ -226,7 +226,7 @@ export const AppContextProvider = ({
|
|||||||
typical_p: config.typical_p,
|
typical_p: config.typical_p,
|
||||||
xtc_probability: config.xtc_probability,
|
xtc_probability: config.xtc_probability,
|
||||||
xtc_threshold: config.xtc_threshold,
|
xtc_threshold: config.xtc_threshold,
|
||||||
top_n_sigma: config.top_n_sigma,
|
top_n_sigma: config.top_n_sigma,
|
||||||
repeat_last_n: config.repeat_last_n,
|
repeat_last_n: config.repeat_last_n,
|
||||||
repeat_penalty: config.repeat_penalty,
|
repeat_penalty: config.repeat_penalty,
|
||||||
presence_penalty: config.presence_penalty,
|
presence_penalty: config.presence_penalty,
|
||||||
@@ -257,14 +257,35 @@ export const AppContextProvider = ({
|
|||||||
throw new Error(body?.error?.message || 'Unknown error');
|
throw new Error(body?.error?.message || 'Unknown error');
|
||||||
}
|
}
|
||||||
const chunks = getSSEStreamAsync(fetchResponse);
|
const chunks = getSSEStreamAsync(fetchResponse);
|
||||||
|
let thinkingTagOpen = false;
|
||||||
for await (const chunk of chunks) {
|
for await (const chunk of chunks) {
|
||||||
// const stop = chunk.stop;
|
// const stop = chunk.stop;
|
||||||
if (chunk.error) {
|
if (chunk.error) {
|
||||||
throw new Error(chunk.error?.message || 'Unknown error');
|
throw new Error(chunk.error?.message || 'Unknown error');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const reasoningContent = chunk.choices?.[0]?.delta?.reasoning_content;
|
||||||
|
if (reasoningContent) {
|
||||||
|
if (pendingMsg.content === null || pendingMsg.content === '') {
|
||||||
|
thinkingTagOpen = true;
|
||||||
|
pendingMsg = {
|
||||||
|
...pendingMsg,
|
||||||
|
content: '<think>' + reasoningContent,
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
pendingMsg = {
|
||||||
|
...pendingMsg,
|
||||||
|
content: pendingMsg.content + reasoningContent,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
const addedContent = chunk.choices?.[0]?.delta?.content;
|
const addedContent = chunk.choices?.[0]?.delta?.content;
|
||||||
const lastContent = pendingMsg.content || '';
|
let lastContent = pendingMsg.content || '';
|
||||||
if (addedContent) {
|
if (addedContent) {
|
||||||
|
if (thinkingTagOpen) {
|
||||||
|
lastContent = lastContent + '</think>';
|
||||||
|
thinkingTagOpen = false;
|
||||||
|
}
|
||||||
pendingMsg = {
|
pendingMsg = {
|
||||||
...pendingMsg,
|
...pendingMsg,
|
||||||
content: lastContent + addedContent,
|
content: lastContent + addedContent,
|
||||||
|
|||||||
Reference in New Issue
Block a user