diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7d29149d..6b138ce1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -141,8 +141,6 @@ struct server_task_result {
 };
 
-std::unordered_map<int, server_task_result> server_task_result_dict = {};
-
 // Helper functions for content cleaning
 static std::string remove_simple_function_calls(const std::string& content) {
     std::string cleaned = content;
@@ -435,9 +433,8 @@ struct server_slot {
         timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
         timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
 
-        timings.predicted_n = n_decoded;
-        timings.predicted_ms = (ggml_time_us() - t_start_generation) / 1e3;
+        timings.predicted_ms = t_token_generation;
         timings.predicted_per_token_ms = t_token_generation / n_decoded;
         timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
@@ -1808,9 +1805,9 @@ struct server_context {
         // populate timings if this is final response or timings_per_token is enabled
         if (slot.params.timings_per_token) {
             //res.data["timings"] = slot.get_formated_timings();
+            slot.t_token_generation = (ggml_time_us() - slot.t_start_generation) / 1e3;
             res.timings = slot.get_timings();
         }
-        server_task_result_dict[slot.id_task] = res;
 
         queue_results.send(std::move(res));
     }
@@ -1836,7 +1833,12 @@ struct server_context {
             {"stopped_limit", slot.stopped_limit},
            {"stopping_word", slot.stopping_word},
             {"tokens_cached", slot.n_past},
-            {"timings", slot.get_formated_timings()}
+            {"timings", slot.get_formated_timings()},
+            {"usage", json {
+                {"completion_tokens", slot.n_decoded},
+                {"prompt_tokens", slot.n_prompt_tokens},
+                {"total_tokens", slot.n_decoded + slot.n_prompt_tokens}
+            }}
         };
 
         if (slot.sparams.n_probs > 0) {
@@ -1857,6 +1859,8 @@ struct server_context {
             res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
         }
 
+        res.timings = slot.get_timings();
+
         if (slot.oaicompat) {
             res.data["oaicompat_token_ctr"] = slot.n_decoded;
             res.data["model"] = slot.oaicompat_model;
@@ -3047,6 +3051,11 @@ static json format_final_response_oaicompat(const json& request, json result, co
         {"id", completion_id}
     };
 
+    json timings = json_value(result, "timings", json::object());
+    if (!timings.empty()) {
+        res["timings"] = timings;
+    }
+
     if (server_verbose) {
         res["__verbose"] = result;
     }
@@ -3129,6 +3138,10 @@ static std::vector<json> format_partial_response_oaicompat(server_task_result ta
     // Always add final chunk (like original llama.cpp)
     if (!finish_reason.empty()) {
+        // usage
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+
         json finish_chunk = {
             {"choices", json::array({json{{"finish_reason", finish_reason},
                                           {"index", 0},
@@ -3136,16 +3149,21 @@
             {"created", t},
             {"id", completion_id},
             {"model", modelname},
-            {"object", "chat.completion.chunk"}
+            {"object", "chat.completion.chunk"},
+            {"usage", json {
+                {"completion_tokens", num_tokens_predicted},
+                {"prompt_tokens", num_prompt_tokens},
+                {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+            }}
         };
 
         streaming_chunks.push_back(finish_chunk);
     }
-    if (server_task_result_dict.count(task_result.id) > 0)
-    {
+    if (task_result.timings.prompt_n != -1) {
         for (auto& chunk : streaming_chunks)
-            chunk.push_back({ "timings", server_task_result_dict[task_result.id].timings.to_json() });
+            chunk.push_back({ "timings", task_result.timings.to_json() });
     }
+    // Return streaming chunks (could be just final chunk if no diffs)
     if (!streaming_chunks.empty()) {
         return streaming_chunks;
     }
@@ -3218,11 +3236,11 @@ static std::vector<json> format_partial_response_oaicompat(server_task_result ta
         {"model", modelname},
         {"object", "chat.completion.chunk"}
     };
-    if (server_task_result_dict.count(task_result.id) > 0)
-    {
-        ret.push_back({ "timings", server_task_result_dict[task_result.id].timings.to_json() });
+
+    if (task_result.timings.prompt_n != -1) {
+        ret.push_back({ "timings", task_result.timings.to_json() });
     }
-
+    //
     if (!finish_reason.empty()) {
         int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
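Note: with these changes the OpenAI-compatible finish chunk carries a "usage" object, and every streamed chunk gains a "timings" object once timings have been recorded (prompt_n != -1). A rough sketch of the resulting finish chunk, with all values illustrative — the exact "timings" keys depend on what result_timings::to_json() emits (the fields populated in get_timings() above), and the "choices" entry may carry additional fields not visible in these hunks:

    {
        "choices": [{"finish_reason": "stop", "index": 0}],
        "created": 1736000000,
        "id": "chatcmpl-abc123",
        "model": "local-model",
        "object": "chat.completion.chunk",
        "usage": {"completion_tokens": 128, "prompt_tokens": 42, "total_tokens": 170},
        "timings": {"prompt_n": 42, "prompt_ms": 310.5, "predicted_ms": 2150.7}
    }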