Mirror of https://github.com/theroyallab/tabbyAPI.git (synced 2026-04-26 17:28:54 +00:00)
Logging: Clarify new vs cached tokens in prompt processing
This commit is contained in:
@@ -1125,6 +1125,7 @@ class ExllamaV2Container:
 log_metrics(
     result.get("time_enqueued"),
     result.get("prompt_tokens"),
+    result.get("cached_tokens"),
     result.get("time_prefill"),
     result.get("new_tokens"),
     result.get("time_generate"),
@@ -72,6 +72,7 @@ def log_response(response: str):
 def log_metrics(
     queue_time: float,
     prompt_tokens: int,
+    cached_tokens: int,
     prompt_time: float,
     generated_tokens: int,
     generate_time: float,
@@ -88,9 +89,13 @@ def log_metrics(
     itemization.append(f"Queue: {round(queue_time, 2)} s")
 
     prompt_ts = (
-        "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2)
+        "Indeterminate"
+        if prompt_time == 0
+        else round((prompt_tokens - cached_tokens) / prompt_time, 2)
     )
-    itemization.append(f"Process: {prompt_ts} T/s")
+    itemization.append(
+        f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s"
+    )
 
     generate_ts = (
         "Indeterminate"
Reference in New Issue
Block a user