mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Model: Use true async jobs and add logprobs
The new async dynamic job allows for native async support without the need for threading. Also add logprobs and metrics back to responses. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -70,29 +70,38 @@ def log_response(response: str):
|
||||
|
||||
|
||||
def log_metrics(
|
||||
queue_time: float,
|
||||
prompt_tokens: int,
|
||||
prompt_time: float,
|
||||
generated_tokens: int,
|
||||
elapsed_time: float,
|
||||
generate_time: float,
|
||||
context_len: Optional[int],
|
||||
max_seq_len: int,
|
||||
):
|
||||
initial_response = (
|
||||
f"Metrics: {generated_tokens} tokens generated in "
|
||||
f"{round(elapsed_time, 2)} seconds"
|
||||
f"{round(queue_time + prompt_time + generate_time, 2)} seconds"
|
||||
)
|
||||
itemization = []
|
||||
extra_parts = []
|
||||
|
||||
# Add tokens per second
|
||||
tokens_per_second = (
|
||||
"Indeterminate"
|
||||
if elapsed_time == 0
|
||||
else round(generated_tokens / elapsed_time, 2)
|
||||
itemization.append(f"Queue: {round(queue_time, 2)} s")
|
||||
|
||||
prompt_ts = (
|
||||
"Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2)
|
||||
)
|
||||
itemization.append(f"{tokens_per_second} T/s")
|
||||
itemization.append(f"Process: {prompt_ts} T/s")
|
||||
|
||||
generate_ts = (
|
||||
"Indeterminate"
|
||||
if generate_time == 0
|
||||
else round(generated_tokens / generate_time, 2)
|
||||
)
|
||||
itemization.append(f"Generate: {generate_ts} T/s")
|
||||
|
||||
# Add context (original token count)
|
||||
if context_len:
|
||||
itemization.append(f"context {context_len} tokens")
|
||||
itemization.append(f"Context: {context_len} tokens")
|
||||
|
||||
if context_len > max_seq_len:
|
||||
extra_parts.append("<-- Not accurate (truncated)")
|
||||
|
||||
Reference in New Issue
Block a user