diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 64f571f..31e0643 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -1144,6 +1144,7 @@ class ExllamaV2Container: log_metrics( result.get("time_enqueued"), result.get("prompt_tokens"), + result.get("cached_tokens"), result.get("time_prefill"), result.get("new_tokens"), result.get("time_generate"), diff --git a/common/gen_logging.py b/common/gen_logging.py index bfc6c2e..fbf10f6 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -72,6 +72,7 @@ def log_response(response: str): def log_metrics( queue_time: float, prompt_tokens: int, + cached_tokens: int, prompt_time: float, generated_tokens: int, generate_time: float, @@ -88,9 +89,14 @@ def log_metrics( itemization.append(f"Queue: {round(queue_time, 2)} s") prompt_ts = ( - "Indeterminate" if prompt_time == 0 else round(prompt_tokens / prompt_time, 2) + "Indeterminate" + if prompt_time == 0 + else round((prompt_tokens - cached_tokens) / prompt_time, 2) + ) + itemization.append( + f"Process: {cached_tokens} cached tokens and " + f"{prompt_tokens - cached_tokens} new tokens at {prompt_ts} T/s" ) - itemization.append(f"Process: {prompt_ts} T/s") generate_ts = ( "Indeterminate"