mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
API: Add timings to usage stats
It's useful for the client to know the generation speed (T/s) and the total generation time for each request. This works with both completions and chat completions.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
@@ -73,8 +73,9 @@ def _create_response(
|
||||
|
||||
choices.append(choice)
|
||||
|
||||
prompt_tokens = unwrap(generations[-1].get("prompt_tokens"), 0)
|
||||
completion_tokens = unwrap(generations[-1].get("generated_tokens"), 0)
|
||||
final_generation = generations[-1]
|
||||
prompt_tokens = unwrap(final_generation.get("prompt_tokens"), 0)
|
||||
completion_tokens = unwrap(final_generation.get("gen_tokens"), 0)
|
||||
|
||||
response = CompletionResponse(
|
||||
id=f"cmpl-{request_id}",
|
||||
@@ -82,8 +83,13 @@ def _create_response(
|
||||
model=model_name,
|
||||
usage=UsageStats(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_time=final_generation.get("prompt_time"),
|
||||
prompt_tokens_per_sec=final_generation.get("prompt_tokens_per_sec"),
|
||||
completion_tokens=completion_tokens,
|
||||
completion_time=final_generation.get("gen_time"),
|
||||
completion_tokens_per_sec=final_generation.get("gen_tokens_per_sec"),
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
total_time=final_generation.get("total_time"),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user