Logging: Clarify new vs cached tokens in prompt processing

This commit is contained in:
DocShotgun
2024-05-26 18:21:17 -07:00
parent 3dcae8b023
commit ce5e2ec8de
2 changed files with 8 additions and 2 deletions

View File

@@ -1125,6 +1125,7 @@ class ExllamaV2Container:
log_metrics(
result.get("time_enqueued"),
result.get("prompt_tokens"),
result.get("cached_tokens"),
result.get("time_prefill"),
result.get("new_tokens"),
result.get("time_generate"),

View File

@@ -72,6 +72,7 @@ def log_response(response: str):
def log_metrics(
queue_time: float,
prompt_tokens: int,
cached_tokens: int,
prompt_time: float,
generated_tokens: int,
generate_time: float,
@@ -88,9 +89,13 @@ def log_metrics(
itemization.append(f"Queue: {round(queue_time, 2)} s")
prompt_ts = (
"Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2)
"Indeterminate"
if prompt_time == 0
else round((prompt_tokens - cached_tokens) / prompt_time, 2)
)
itemization.append(
f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s"
)
itemization.append(f"Process: {prompt_ts} T/s")
generate_ts = (
"Indeterminate"