From ead4c1e1808924c25e862efcb7e563a2cb76c2b4 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Fri, 20 Sep 2024 19:33:59 +0300
Subject: [PATCH] POC per row scale: add CUDA TODOs

There are still two places in ggml-cuda.cu that assume
type_size * n_per_row / block_size is the correct way to compute
and handle row sizes. This does not affect simple usage, but it
will lead to issues when tensors are split between GPUs.
---
 ggml/src/ggml-cuda.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 87d7e17e..b9ec6020 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1179,6 +1179,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const int64_t nb2 = src->nb[2];
     const int64_t nb3 = src->nb[3];
     const enum ggml_type type = src->type;
+    // TODO: fix this usage of type size and block size
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
     int64_t i1_diff = i1_high - i1_low;
@@ -1441,6 +1442,7 @@ static void ggml_cuda_op_mul_mat(
 
     const int64_t i02_divisor = ne12 / ne02;
 
+    // TODO: fix this usage of type size and block size
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
     const size_t q8_1_ts = sizeof(block_q8_1);