Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-02-18 20:30:11 +00:00)
Remove more split mode row remnants
@@ -704,59 +704,25 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {

// cuda split buffer

static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
    int64_t row_rounding = 0;
    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
            continue;
        }

        const int cc = ggml_cuda_info().devices[id].cc;
        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
    }
    return row_rounding;
}

static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
    const int64_t nrows = ggml_nrows(tensor);
    const int64_t rounding = get_row_rounding(tensor_split);

    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
    *row_low -= *row_low % rounding;

    if (id == ggml_backend_cuda_get_device_count() - 1) {
        *row_high = nrows;
    } else {
        *row_high = nrows*tensor_split[id + 1];
        *row_high -= *row_high % rounding;
    }
}

static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}

struct ggml_backend_cuda_split_buffer_type_context {
    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
    //std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
};

struct ggml_backend_cuda_split_buffer_context {
    ~ggml_backend_cuda_split_buffer_context() {
        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
            for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
                for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
                    if (extra->events[id][is] != nullptr) {
                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
                    }
                }
                if (extra->data_device[id] != nullptr) {
                    CUDA_CHECK(cudaFree(extra->data_device[id]));
                }
            }
            delete extra;
        }
        //for (ggml_tensor_extra_gpu * extra : tensor_extras) {
        //    for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
        //        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
        //            if (extra->events[id][is] != nullptr) {
        //                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
        //            }
        //        }
        //        if (extra->data_device[id] != nullptr) {
        //            CUDA_CHECK(cudaFree(extra->data_device[id]));
        //        }
        //    }
        //    delete extra;
        //}
    }

    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
@@ -818,52 +784,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]
    }
    return;

    //if (tensor->view_src != nullptr) fprintf(stderr, "%s: tensor %s is a view into %s\n", __func__, tensor->name, tensor->view_src->name);
    //GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported

    //ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
    //ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    //const int64_t ne0 = tensor->ne[0];

    //ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
    //ctx->tensor_extras.push_back(extra);

    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
    //    int64_t row_low, row_high;
    //    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

    //    int64_t nrows_split = row_high - row_low;
    //    if (nrows_split == 0) {
    //        continue;
    //    }

    //    size_t size = ggml_nbytes_split(tensor, nrows_split);
    //    const size_t original_size = size;

    //    // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
    //    if (ne0 % MATRIX_ROW_PADDING != 0) {
    //        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    //    }

    //    // FIXME: do not crash if cudaMalloc fails
    //    // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
    //    ggml_cuda_set_device(id);
    //    char * buf;
    //    CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));

    //    // set padding to 0 to avoid possible NaN values
    //    if (size > original_size) {
    //        CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
    //    }

    //    extra->data_device[id] = buf;

    //    for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
    //        CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
    //    }
    //}
    //tensor->extra = extra;
}

GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -895,9 +815,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
            GGML_ABORT("Dim 0 copy of row-interleaved quants is not supported yet");
        }
        auto tt = ggml_internal_get_type_traits(tensor->type);
        //if (tt.row_meta_size > 0) {
        //    GGML_ABORT("Dim 0 copy is not implemented for tensors with row meta data\n");
        //}
        std::vector<char> host_buffer;
        GGML_ASSERT(ggml_is_contiguous(tensor));
        int nrows = ggml_nrows(tensor);
@@ -913,7 +830,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
            GGML_ASSERT((int)ggml_nrows(split) == nrows);
            GGML_ASSERT(split->ne[0] % bs == 0);
            auto source_offset = tt.row_meta_size + (ne / bs) * ts;
            auto chost0 = (const char *)data;
            auto split_row_size = ggml_row_size(split->type, split->ne[0]);
            if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size);
            for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) {
@@ -928,25 +844,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
            }
            CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
            ne += split->ne[0];
            //for (int ir = 0; ir < nrows; ++ir) {
            //    auto dst = host_buffer.data() + ir*split_row_size;
            //    if (tt.row_meta_size > 0) {
            //        memcpy(dst, chost0, tt.row_meta_size);
            //    }
            //    memcpy(dst + tt.row_meta_size, chost0 + source_offset, split_row_size - tt.row_meta_size);
            //    chost0 += row_size;
            //}
            //CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
            //for (int ir = 0; ir < nrows; ++ir) {
            //    auto dst = (char *)split->data + ir*split_row_size;
            //    if (tt.row_meta_size > 0) {
            //        CUDA_CHECK(cudaMemcpyAsync(dst, chost0, tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
            //    }
            //    CUDA_CHECK(cudaMemcpyAsync(dst + tt.row_meta_size, chost0 + source_offset,
            //                split_row_size - tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
            //    chost0 += row_size;
            //}
            //ne += split->ne[0];
        }
    }
    else if (extra->split_dim == 1) {
@@ -991,75 +888,15 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }

    //ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    //const int64_t ne0 = tensor->ne[0];
    //const size_t nb1 = tensor->nb[1];
    //ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
    //    int64_t row_low, row_high;
    //    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

    //    int64_t nrows_split = row_high - row_low;
    //    if (nrows_split == 0) {
    //        continue;
    //    }

    //    const size_t offset_split = row_low*nb1;
    //    size_t size = ggml_nbytes_split(tensor, nrows_split);
    //    const size_t original_size = size;

    //    // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
    //    if (ne0 % MATRIX_ROW_PADDING != 0) {
    //        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    //    }

    //    const char * buf_host = (const char *)data + offset_split;
    //    CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    //}

    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
    //    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    //}
}

GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, const ggml_tensor * tensor,
        [[maybe_unused]] void * data, size_t offset, size_t size) {
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;

    const int64_t ne0 = tensor->ne[0];
    const size_t nb1 = tensor->nb[1];
    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        int64_t row_low, row_high;
        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

        int64_t nrows_split = row_high - row_low;
        if (nrows_split == 0) {
            continue;
        }

        const size_t offset_split = row_low*nb1;
        size_t size = ggml_nbytes_split(tensor, nrows_split);
        const size_t original_size = size;

        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }

        char * buf_host = (char *)data + offset_split;
        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
    }

    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }
    GGML_ABORT("not implemented");
}

GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1128,30 +965,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size([[may
    }
    return total_size;

    //ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;

    //size_t total_size = 0;

    //const int64_t ne0 = tensor->ne[0];

    //for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
    //    int64_t row_low, row_high;
    //    get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);

    //    int64_t nrows_split = row_high - row_low;
    //    if (nrows_split == 0) {
    //        continue;
    //    }

    //    total_size += ggml_nbytes_split(tensor, nrows_split);

    //    // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
    //    if (ne0 % MATRIX_ROW_PADDING != 0) {
    //        total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    //    }
    //}

    //return total_size;
}

GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
@@ -1169,40 +982,12 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
    /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};

    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
    if (all_zero) {
        tensor_split_arr = ggml_cuda_info().default_tensor_split;
    } else {
        float split_sum = 0.0f;
        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
            tensor_split_arr[i] = split_sum;
            split_sum += tensor_split[i];
        }
        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
            tensor_split_arr[i] /= split_sum;
        }
    }

    auto it = buft_map.find(tensor_split_arr);
    if (it != buft_map.end()) {
        return &it->second;
    }

    struct ggml_backend_buffer_type buft {
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * /*tensor_split*/) {
    static ggml_backend_buffer_type buft {
        /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{}, //{tensor_split_arr},
    };

    auto result = buft_map.emplace(tensor_split_arr, buft);
    return &result.first->second;
    return &buft;
}

// host buffer type
@@ -1610,6 +1395,7 @@ static void ggml_cuda_op_mul_mat_cublas(
    GGML_UNUSED(src1_padded_row_size);
}

#if 0
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
    static bool peer_access_enabled = false;

@@ -1661,6 +1447,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {

    GGML_UNUSED(main_device);
}
#endif

static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
@@ -3119,11 +2906,6 @@ static inline bool ops_are_same_device(const ggml_cgraph * cgraph, int first, in
}

static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const ggml_cgraph * cgraph, int & i) {
    // why is this here instead of mul_mat?
    if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
        //printf("%s: split buffer for %s(%s)\n", __func__, ggml_op_name(dst->op), dst->name);
        ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
    }

#if IK_PRINT_TIMING
    int64_t tim1 = ggml_time_us();