From ce5e2ec8dea84b1b78585fa5e4f73c01b9be9ad0 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sun, 26 May 2024 18:21:17 -0700 Subject: [PATCH 1/2] Logging: Clarify new vs cached tokens in prompt processing --- backends/exllamav2/model.py | 1 + common/gen_logging.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 7ffc9ce..5d022a8 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -1125,6 +1125,7 @@ class ExllamaV2Container: log_metrics( result.get("time_enqueued"), result.get("prompt_tokens"), + result.get("cached_tokens"), result.get("time_prefill"), result.get("new_tokens"), result.get("time_generate"), diff --git a/common/gen_logging.py b/common/gen_logging.py index bfc6c2e..3259a71 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -72,6 +72,7 @@ def log_response(response: str): def log_metrics( queue_time: float, prompt_tokens: int, + cached_tokens: int, prompt_time: float, generated_tokens: int, generate_time: float, @@ -88,9 +89,13 @@ def log_metrics( itemization.append(f"Queue: {round(queue_time, 2)} s") prompt_ts = ( - "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2) + "Indeterminate" + if prompt_time == 0 + else round((prompt_tokens - cached_tokens) / prompt_time, 2) + ) + itemization.append( + f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" ) - itemization.append(f"Process: {prompt_ts} T/s") generate_ts = ( "Indeterminate" From 7084081b1f497577938a1607599862d8e570a4a9 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sun, 26 May 2024 18:27:30 -0700 Subject: [PATCH 2/2] Tree: Lint --- common/gen_logging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/gen_logging.py b/common/gen_logging.py index 3259a71..fbf10f6 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -94,7 +94,8 @@ def log_metrics( else round((prompt_tokens - cached_tokens) / prompt_time, 2) ) itemization.append( - f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" + f"Process: {cached_tokens} cached tokens and " + f"{prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" ) generate_ts = (