From ead4c1e1808924c25e862efcb7e563a2cb76c2b4 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Fri, 20 Sep 2024 19:33:59 +0300 Subject: [PATCH] POC per row scale: add CUDA TODOs Two places remain in ggml-cuda.cu that assume type_size * n_per_row / block_size is the way to compute and handle row sizes. This does not affect simple usage, but will lead to issues when tensors are split between GPUs. --- ggml/src/ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 87d7e17e..b9ec6020 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1179,6 +1179,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( const int64_t nb2 = src->nb[2]; const int64_t nb3 = src->nb[3]; const enum ggml_type type = src->type; + // TODO: fix this usage of type size and block size const int64_t ts = ggml_type_size(type); const int64_t bs = ggml_blck_size(type); int64_t i1_diff = i1_high - i1_low; @@ -1441,6 +1442,7 @@ static void ggml_cuda_op_mul_mat( const int64_t i02_divisor = ne12 / ne02; + // TODO: fix this usage of type size and block size const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); const size_t q8_1_ts = sizeof(block_q8_1);