From 21516bd7b5ca90b190c785c0c767e6045136e4ab Mon Sep 17 00:00:00 2001
From: kingbri <bdashore3@proton.me>
Date: Mon, 22 Jul 2024 12:23:49 -0400
Subject: [PATCH] Model: Skip empty token chunks

This makes the generation loop more efficient by skipping chunks
that don't provide any tokens. The offset isn't affected.

Signed-off-by: kingbri <bdashore3@proton.me>
---
 backends/exllamav2/model.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 200be6b..f42dd00 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1185,13 +1185,15 @@ class ExllamaV2Container:
                 result_id = result.get("identifier")
 
                 if stage == "streaming" and result_id == job_id:
+                    chunk_tokens = result.get("token_ids")
+                    if chunk_tokens is None:
+                        continue
+                    else:
+                        generated_tokens += chunk_tokens.size(dim=0)
+
                     chunk = unwrap(result.get("text"), "")
                     full_response += chunk
 
-                    chunk_tokens = result.get("token_ids")
-                    if chunk_tokens is not None:
-                        generated_tokens += chunk_tokens.size(dim=0)
-
                     generation = {
                         "text": chunk,
                         "prompt_tokens": context_len,