diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7d29149d..6b138ce1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -141,8 +141,6 @@ struct server_task_result {
 };
 
-std::unordered_map<int, server_task_result> server_task_result_dict = {};
-
 // Helper functions for content cleaning
 static std::string remove_simple_function_calls(const std::string& content) {
     std::string cleaned = content;
@@ -435,9 +433,8 @@ struct server_slot {
         timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
         timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
 
-        timings.predicted_n = n_decoded;
-        timings.predicted_ms = (ggml_time_us() - t_start_generation) / 1e3;
+        timings.predicted_ms = t_token_generation;
         timings.predicted_per_token_ms = t_token_generation / n_decoded;
         timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
@@ -1808,9 +1805,9 @@ struct server_context {
         // populate timings if this is final response or timings_per_token is enabled
         if (slot.params.timings_per_token) {
             //res.data["timings"] = slot.get_formated_timings();
+            slot.t_token_generation = (ggml_time_us() - slot.t_start_generation) / 1e3;
             res.timings = slot.get_timings();
         }
-        server_task_result_dict[slot.id_task] = res;
 
         queue_results.send(std::move(res));
     }
@@ -1836,7 +1833,12 @@ struct server_context {
             {"stopped_limit", slot.stopped_limit},
            {"stopping_word", slot.stopping_word},
             {"tokens_cached", slot.n_past},
-            {"timings", slot.get_formated_timings()}
+            {"timings", slot.get_formated_timings()},
+            {"usage", json {
+                {"completion_tokens", slot.n_decoded},
+                {"prompt_tokens", slot.n_prompt_tokens},
+                {"total_tokens", slot.n_decoded + slot.n_prompt_tokens}
+            }}
         };
 
         if (slot.sparams.n_probs > 0) {
@@ -1857,6 +1859,8 @@ struct server_context {
             res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
         }
 
+        res.timings = slot.get_timings();
+
         if (slot.oaicompat) {
             res.data["oaicompat_token_ctr"] = slot.n_decoded;
             res.data["model"] = slot.oaicompat_model;
@@ -3047,6 +3051,11 @@ static json format_final_response_oaicompat(const json& request, json result, co
         {"id", completion_id}
     };
 
+    json timings = json_value(result, "timings", json::object());
+    if (!timings.empty()) {
+        res["timings"] = timings;
+    }
+
     if (server_verbose) {
         res["__verbose"] = result;
     }
@@ -3129,6 +3138,10 @@ static std::vector<json> format_partial_response_oaicompat(server_task_result ta
     // Always add final chunk (like original llama.cpp)
     if (!finish_reason.empty()) {
+        // usage
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+
         json finish_chunk = {
             {"choices", json::array({json{{"finish_reason", finish_reason},
                                           {"index", 0},
@@ -3136,16 +3149,21 @@
             {"created", t},
             {"id", completion_id},
             {"model", modelname},
-            {"object", "chat.completion.chunk"}
+            {"object", "chat.completion.chunk"},
+            {"usage", json {
+                {"completion_tokens", num_tokens_predicted},
+                {"prompt_tokens", num_prompt_tokens},
+                {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+            }}
         };
 
         streaming_chunks.push_back(finish_chunk);
     }
-    if (server_task_result_dict.count(task_result.id) > 0)
-    {
+    if (task_result.timings.prompt_n != -1) {
         for (auto& chunk : streaming_chunks)
-            chunk.push_back({ "timings", server_task_result_dict[task_result.id].timings.to_json() });
+            chunk.push_back({ "timings", task_result.timings.to_json() });
     }
+    // Return streaming chunks (could be just final chunk if no diffs)
     if (!streaming_chunks.empty()) {
         return streaming_chunks;
     }
@@ -3218,11 +3236,11 @@ static std::vector<json> format_partial_response_oaicompat(server_task_result ta
         {"model", modelname},
         {"object", "chat.completion.chunk"}
     };
-    if (server_task_result_dict.count(task_result.id) > 0)
-    {
-        ret.push_back({ "timings", server_task_result_dict[task_result.id].timings.to_json() });
+
+    if (task_result.timings.prompt_n != -1) {
+        ret.push_back({ "timings", task_result.timings.to_json() });
     }
-
+    //
     if (!finish_reason.empty()) {
         int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
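Note: with these changes the OpenAI-compatible finish chunk carries a "usage" object, and every streamed chunk gains a "timings" object once timings have been recorded (prompt_n != -1). A rough sketch of the resulting finish chunk, with all values illustrative — the exact "timings" keys depend on what result_timings::to_json() emits (the fields populated in get_timings() above), and the "choices" entry may carry additional fields not visible in these hunks:

    {
        "choices": [{"finish_reason": "stop", "index": 0}],
        "created": 1736000000,
        "id": "chatcmpl-abc123",
        "model": "local-model",
        "object": "chat.completion.chunk",
        "usage": {"completion_tokens": 128, "prompt_tokens": 42, "total_tokens": 170},
        "timings": {"prompt_n": 42, "prompt_ms": 310.5, "predicted_ms": 2150.7}
    }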