From ce5e2ec8dea84b1b78585fa5e4f73c01b9be9ad0 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sun, 26 May 2024 18:21:17 -0700 Subject: [PATCH] Logging: Clarify new vs cached tokens in prompt processing --- backends/exllamav2/model.py | 1 + common/gen_logging.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 7ffc9ce..5d022a8 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -1125,6 +1125,7 @@ class ExllamaV2Container: log_metrics( result.get("time_enqueued"), result.get("prompt_tokens"), + result.get("cached_tokens"), result.get("time_prefill"), result.get("new_tokens"), result.get("time_generate"), diff --git a/common/gen_logging.py b/common/gen_logging.py index bfc6c2e..3259a71 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -72,6 +72,7 @@ def log_response(response: str): def log_metrics( queue_time: float, prompt_tokens: int, + cached_tokens: int, prompt_time: float, generated_tokens: int, generate_time: float, @@ -88,9 +89,13 @@ def log_metrics( itemization.append(f"Queue: {round(queue_time, 2)} s") prompt_ts = ( - "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2) + "Indeterminate" + if prompt_time == 0 + else round((prompt_tokens - cached_tokens) / prompt_time, 2) + ) + itemization.append( + f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" ) - itemization.append(f"Process: {prompt_ts} T/s") generate_ts = ( "Indeterminate"