mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Model: Add tokens/second output
Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
9
model.py
9
model.py
@@ -1,5 +1,6 @@
|
||||
import gc, time, pathlib
|
||||
import torch
|
||||
from datetime import datetime
|
||||
from exllamav2 import(
|
||||
ExLlamaV2,
|
||||
ExLlamaV2Config,
|
||||
@@ -305,7 +306,8 @@ class ModelContainer:
|
||||
|
||||
generated_tokens = 0
|
||||
full_response = ""
|
||||
last_chunk_time = time.time()
|
||||
start_time = time.time()
|
||||
last_chunk_time = start_time
|
||||
|
||||
save_tokens = torch.empty((1, 0), dtype = torch.bool)
|
||||
chunk_buffer = ""
|
||||
@@ -350,4 +352,7 @@ class ModelContainer:
|
||||
chunk_buffer = ""
|
||||
last_chunk_time = now
|
||||
|
||||
if eos or generated_tokens == max_tokens: break
|
||||
if eos or generated_tokens == max_tokens: break
|
||||
|
||||
elapsed_time = last_chunk_time - start_time
|
||||
print(f"Response generated in {round(elapsed_time, 2)} seconds ({round(generated_tokens / elapsed_time, 2)} T/s)")
|
||||
|
||||
Reference in New Issue
Block a user