API: Add timings to usage stats

It's useful for the client to know the generation speed in tokens per
second (T/s) and the total generation time for each request.

Works with both completions and chat completions.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
kingbri
2025-06-17 22:54:51 -04:00
parent 5d94d4d022
commit 2913ce29fc
6 changed files with 110 additions and 63 deletions

View File

@@ -1,7 +1,7 @@
"""Common types for OAI."""
from pydantic import BaseModel, Field
from typing import Optional
from typing import Optional, Union
from common.sampling import BaseSamplerRequest, get_default_sampler_value
@@ -10,8 +10,13 @@ class UsageStats(BaseModel):
"""Represents usage stats."""
prompt_tokens: int
prompt_time: Optional[float] = None
prompt_tokens_per_sec: Optional[Union[float, str]] = None
completion_tokens: int
completion_time: Optional[float] = None
completion_tokens_per_sec: Optional[Union[float, str]] = None
total_tokens: int
total_time: Optional[float] = None
class CompletionResponseFormat(BaseModel):