From ce5e2ec8dea84b1b78585fa5e4f73c01b9be9ad0 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sun, 26 May 2024 18:21:17 -0700
Subject: [PATCH] Logging: Clarify new vs cached tokens in prompt processing

---
 backends/exllamav2/model.py | 1 +
 common/gen_logging.py       | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 7ffc9ce..5d022a8 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1125,6 +1125,7 @@ class ExllamaV2Container:
                         log_metrics(
                             result.get("time_enqueued"),
                             result.get("prompt_tokens"),
+                            result.get("cached_tokens"),
                             result.get("time_prefill"),
                             result.get("new_tokens"),
                             result.get("time_generate"),
diff --git a/common/gen_logging.py b/common/gen_logging.py
index bfc6c2e..3259a71 100644
--- a/common/gen_logging.py
+++ b/common/gen_logging.py
@@ -72,6 +72,7 @@ def log_response(response: str):
 def log_metrics(
     queue_time: float,
     prompt_tokens: int,
+    cached_tokens: int,
     prompt_time: float,
     generated_tokens: int,
     generate_time: float,
@@ -88,9 +89,13 @@ def log_metrics(
     itemization.append(f"Queue: {round(queue_time, 2)} s")
 
     prompt_ts = (
-        "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2)
+        "Indeterminate"
+        if prompt_time == 0
+        else round((prompt_tokens - cached_tokens) / prompt_time, 2)
+    )
+    itemization.append(
+        f"Process: {cached_tokens} cached tokens and {prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s"
     )
-    itemization.append(f"Process: {prompt_ts} T/s")
 
     generate_ts = (
         "Indeterminate"