POC per row scale: add CUDA TODOs

There are two places left in ggml-cuda.cu that assume a row size of
type_size * n_per_row / block_size. This assumption is harmless for
simple usage, but will lead to issues when tensors are split between
GPUs.
Iwan Kawrakow
2024-09-20 19:33:59 +03:00
parent eb2403f057
commit ead4c1e180


@@ -1179,6 +1179,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const int64_t nb2 = src->nb[2];
     const int64_t nb3 = src->nb[3];
     const enum ggml_type type = src->type;
+    // TODO: fix this usage of type size and block size
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
     int64_t i1_diff = i1_high - i1_low;
@@ -1441,6 +1442,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t i02_divisor = ne12 / ne02;
+    // TODO: fix this usage of type size and block size
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
     const size_t q8_1_ts = sizeof(block_q8_1);