From 7262fb8f9df9979509e4efd9b5b481968cfb133a Mon Sep 17 00:00:00 2001
From: turboderp
Date: Sat, 23 Dec 2023 22:04:40 +0100
Subject: [PATCH] Batch latency test script

---
 tests/test_batch_latency.py | 50 +++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 tests/test_batch_latency.py

diff --git a/tests/test_batch_latency.py b/tests/test_batch_latency.py
new file mode 100644
index 0000000..4d2c0c2
--- /dev/null
+++ b/tests/test_batch_latency.py
@@ -0,0 +1,50 @@
+
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    # ExLlamaV2Cache,
+    # ExLlamaV2Tokenizer,
+)
+
+import torch, time
+
+# model_directory = "/mnt/str/models/_gptq/llama-7b-4bit-128g/"
+# model_directory = "/mnt/str/models/llama2-7b-exl2/4.0bpw"
+model_directory = "/mnt/str/models/_gptq/TheBloke_Phine-CodeLlama-34B-v2-GPTQ/"
+# model_directory = "/mnt/str/models/llama2-7b"
+
+config = ExLlamaV2Config()
+config.model_dir = model_directory
+config.prepare()
+
+model = ExLlamaV2(config)
+print("Loading model: " + model_directory)
+
+model.load(gpu_split = [20, 20, 24])
+
+samples = 100
+samples_1 = 200
+tests = list(range(1, 17)) + [20, 24, 28, 32]  #, 48, 64, 96, 128, 256]
+
+with torch.no_grad():
+
+    for i in tests:
+
+        input_ids = torch.randint(config.vocab_size - 1, (1, i))
+
+        a = time.time()
+        s = samples if i > 1 else samples_1
+        for j in range(s):
+            model.forward(input_ids)
+        b = time.time()
+
+        latency = (b - a) / s * 1000
+        latency_tok = latency / i
+        if i == 1: base_latency = latency
+        efficiency = base_latency / latency
+
+        print(f"{i:3} tokens   avg. latency: {latency:7.2f} ms   avg. latency/token: {latency_tok:7.2f} ms   batch eff.: {efficiency:.4f}")
+
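
The "batch eff." figure printed on each line is the 1-token baseline latency divided by the measured latency for that many input tokens, so a value near 1.0 means a larger forward pass costs roughly the same as a single token. A minimal sketch of the arithmetic, separate from the patch (the helper name is illustrative, not part of the script):

    # Illustrative helper (not in the patch): batch efficiency as the
    # ratio of the 1-token latency to the n-token latency.
    def batch_efficiency(base_latency_ms: float, latency_ms: float) -> float:
        return base_latency_ms / latency_ms

    # Worked example: if one token takes 20.00 ms per forward pass and an
    # 8-token pass takes 25.00 ms, latency/token is 25 / 8 = 3.125 ms and
    # batch efficiency is 20 / 25 = 0.8.
    assert abs(batch_efficiency(20.0, 25.0) - 0.8) < 1e-9

Note that the measured time includes any per-call host-side overhead of model.forward(), which is presumably why the 1-token case is averaged over more passes (samples_1 = 200): every other measurement is normalized against that baseline.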