This commit is contained in:
Kawrakow
2025-11-25 14:51:33 +00:00
parent 5ea430aaa4
commit 32c6df015b
8 changed files with 449 additions and 118 deletions

View File

@@ -1277,10 +1277,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
}
else if (arg_next == "row") {
fprintf(stderr, "\n\n=====================================================================================\n");
fprintf(stderr, " Split mode row is no longer supported\n");
fprintf(stderr, "=====================================================================================\n\n\n");
GGML_ABORT("fatal error");
//fprintf(stderr, "\n\n=====================================================================================\n");
//fprintf(stderr, " Split mode row is no longer supported\n");
//fprintf(stderr, "=====================================================================================\n\n\n");
//GGML_ABORT("fatal error");
params.split_mode = LLAMA_SPLIT_MODE_ROW;
}
else {

View File

@@ -3021,6 +3021,13 @@ extern "C" {
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
// Metadata describing how a single tensor is split across multiple devices.
// Attached to ggml_tensor::extra by the loader and consumed by the backend
// buffer callbacks (init_tensor / set_tensor) to allocate and upload the splits.
typedef struct {
    int n_device;                 // number of entries in `splits` (one slot per device)
    int split_dim;                // dimension along which the tensor is partitioned (0 or 1)
    struct ggml_tensor * tensor;  // NOTE(review): never assigned in the code visible here — presumably the original (unsplit) tensor; confirm before relying on it
    struct ggml_tensor ** splits; // per-device split tensors; an entry may be NULL when that device receives no share
} ggml_split_tensor_t;
#ifdef __cplusplus
}
#endif

View File

@@ -785,90 +785,145 @@ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buf
GGML_UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
const int64_t ne0 = tensor->ne[0];
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
ctx->tensor_extras.push_back(extra);
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
int64_t row_low, row_high;
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
int64_t nrows_split = row_high - row_low;
if (nrows_split == 0) {
continue;
}
size_t size = ggml_nbytes_split(tensor, nrows_split);
const size_t original_size = size;
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
// Allocate device memory for each per-device split of a split tensor.
//
// Tensors without split metadata (tensor->extra == NULL) are left untouched.
// For every non-NULL split, a buffer is allocated on the corresponding device,
// padded so that the last row is a multiple of MATRIX_ROW_PADDING elements
// (avoids out-of-bounds reads in the mat-mul kernels), and the padding is
// zeroed to avoid NaNs. The split tensor's data pointer is set to the new
// device buffer.
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]] ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    if (!tensor->extra) return;
    auto extra = (ggml_split_tensor_t *)tensor->extra;
    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
    for (int i = 0; i < extra->n_device; ++i) {
        if (!extra->splits[i]) continue; // device i was assigned no share of this tensor
        auto split = extra->splits[i];
        auto ne0 = split->ne[0];
        auto size = ggml_nbytes(split);
        auto padded_size = size;
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            // use int64 arithmetic throughout: ne0 is int64_t, so the block count must not narrow to int
            auto nblock = (ne0 + MATRIX_ROW_PADDING - 1)/MATRIX_ROW_PADDING;
            auto padded_row_size = ggml_row_size(split->type, nblock*MATRIX_ROW_PADDING);
            auto row_size = ggml_row_size(split->type, ne0);
            padded_size += padded_row_size - row_size;
        }
        // FIXME: do not crash if cudaMalloc fails
        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
        ggml_cuda_set_device(i);
        char * buf;
        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, padded_size, i));
        // set padding to 0 to avoid possible NaN values
        if (padded_size > size) {
            CUDA_CHECK(cudaMemset(buf + size, 0, padded_size - size));
        }
        split->data = buf;
    }
}
// Upload host data into the per-device splits of a split tensor.
//
// The splits are assumed to partition the host buffer into consecutive
// contiguous chunks in device order; each chunk is copied asynchronously to
// its device, then all copies are awaited before returning.
//
// Tensors without split metadata (tensor->extra == NULL) are ignored.
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (!tensor->extra) return;
    // split tensors must always be set in their entirety at once
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));
    auto extra = (ggml_split_tensor_t *)tensor->extra;
    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
    if (extra->split_dim != 0) {
        // NOTE(review): the guard rejects split_dim != 0, but the original message claimed
        // "dim 0" was unsupported — confirm which dimension is actually implemented.
        fprintf(stderr, "Split tensor copy not yet implemented for split dim != 0\n");
        return;
    }
    size_t cur_offset = 0;
    for (int i = 0; i < extra->n_device; ++i) {
        auto split = extra->splits[i];
        if (!split) continue;
        // do not shadow the `size` parameter — use a distinct name for the chunk size
        auto split_size = ggml_nbytes(split);
        const char * buf_host = (const char *)data + cur_offset;
        // NOTE(review): no cudaSetDevice() before the copy — relies on the async copy
        // reaching split->data on another device from the current context; verify.
        CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, split_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
        cur_offset += split_size;
    }
    // wait for all uploads to complete before the host buffer may be reused
    for (int i = 0; i < extra->n_device; ++i) {
        if (!extra->splits[i]) continue;
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
    }

    GGML_UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {

View File

@@ -57,6 +57,9 @@ struct llama_kv_cache {
std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;
std::vector<llama_split_tensor> split_k_l;
std::vector<llama_split_tensor> split_v_l;
std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;

View File

@@ -224,3 +224,8 @@ struct gguf_context;
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
// Owner of the per-device split tensors for one model tensor.
// `ggml.splits` points into `tensor_splits`, and `ggml` itself is handed to
// the backend via ggml_tensor::extra — so this object must remain at a stable
// address (do not store it in a container that may reallocate) for as long as
// the tensor is in use.
struct llama_split_tensor {
    std::vector<ggml_tensor *> tensor_splits; // per-device splits; NULL entries where a device has no share
    ggml_split_tensor_t ggml;                 // backend-facing metadata; splits points at tensor_splits.data()
};

View File

@@ -192,6 +192,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
if (model.splits.size() > 1) {
ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
ctx_size *= model.splits.size();
}
for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -205,10 +210,78 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_map[it.first] = ctx;
model.ctxs.push_back(ctx);
}
printf("=======================================================================\n");
auto n_device = model.device_count();
printf(" Model has %d devices:\n", n_device);
for (int device = 0; device < n_device; ++device) {
auto buft = model.default_buffer_type_offload(device);
if (buft) {
printf(" %d %s\n", device, ggml_backend_buft_name(buft));
} else {
printf(" Oops: null buft for debvice %d\n", device);
}
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
printf("model.splits:");
for (auto s : model.splits) printf(" %g", s);
printf("\n");
}
}
// Partition `nr` rows among devices according to the cumulative split
// fractions in `splits` (splits[i] is the cumulative fraction assigned to
// devices 0..i). The result is expressed in multiples of `granularity`:
// rows are grouped into nr/granularity chunks, each device's share is rounded
// to the nearest chunk count, and any rounding surplus/deficit is repaired by
// taking a chunk from (or giving one to) the device where the adjustment
// least perturbs its ideal share. Entries of the returned vector sum to `nr`.
static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
    GGML_ASSERT(nr % granularity == 0);
    GGML_ASSERT(!splits.empty());
    const int nchunk = nr / granularity;
    const int ndev   = (int)splits.size();
    std::vector<int> result(ndev);
    // first pass: round each device's ideal chunk count independently
    {
        float prev = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            result[i] = roundf((splits[i] - prev)*nchunk);
            prev = splits[i];
        }
    }
    int sum = 0;
    for (int i = 0; i < ndev; ++i) sum += result[i];
    // too many chunks handed out: repeatedly take one from the device whose
    // post-decrement count is closest to its ideal share (first minimum wins)
    while (sum > nchunk) {
        int   ibest    = -1;
        float best_err = std::numeric_limits<float>::max();
        float prev     = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            const float n_want = (splits[i] - prev)*nchunk;
            prev = splits[i];
            if (result[i] > 0) {
                const float err = std::abs(n_want - result[i] + 1);
                if (err < best_err) {
                    best_err = err;
                    ibest    = i;
                }
            }
        }
        GGML_ASSERT(ibest >= 0 && result[ibest] > 0);
        --result[ibest];
        --sum;
    }
    // too few chunks handed out: give one to the device whose post-increment
    // count is closest to its ideal share (first minimum wins)
    while (sum < nchunk) {
        int   ibest    = -1;
        float best_err = std::numeric_limits<float>::max();
        float prev     = 0.0f;
        for (int i = 0; i < ndev; ++i) {
            const float n_want = (splits[i] - prev)*nchunk;
            prev = splits[i];
            const float err = std::abs(n_want - result[i] - 1);
            if (err < best_err) {
                best_err = err;
                ibest    = i;
            }
        }
        GGML_ASSERT(ibest >= 0);
        ++result[ibest];
        ++sum;
    }
    // convert chunk counts back to row counts
    for (auto & r : result) r *= granularity;
    return result;
}
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
int flags, ggml_context ** actual_context) {
//auto requested_ctx = ctx;
if (ml.tensor_buft_overrides) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
@@ -220,7 +293,40 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
}
}
if (actual_context) *actual_context = ctx;
return ml.create_tensor(ctx, name, ne, flags);
auto tensor = ml.create_tensor(ctx, name, ne, flags);
//if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
// int i_layer = -1;
// if (auto pos = name.find("blk."); pos == 0) {
// GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
// }
// if (i_layer >= 0) {
// auto & layer = model.layers[i_layer];
// auto & hparams = model.hparams;
// if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
// auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
// printf("%s(%s):", __func__, name.c_str());
// for (auto s : split) printf(" %d", s);
// printf("\n");
// layer.split_wq.tensor_splits.resize(split.size());
// size_t offset = 0;
// for (int i = 0; i < (int)split.size(); ++i) {
// if (split[i] > 0) {
// layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
// auto split_name = name + '.' + std::to_string(i);
// ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
// offset += tensor->nb[1]*split[i];
// } else {
// layer.split_wq.tensor_splits[i] = nullptr;
// }
// }
// layer.split_wq.ggml.n_device = split.size();
// layer.split_wq.ggml.split_dim = 1;
// layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
// }
// }
//}
return tensor;
//return ml.create_tensor(ctx, name, ne, flags);
}
#define LOADING_PRELUDE \
@@ -2638,6 +2744,40 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
return fused_qkv;
}
// Populate `split_tensor` with per-device slices of `tensor` (sized per the
// row counts in `splits`) and attach the resulting ggml_split_tensor_t
// metadata to tensor->extra so the backend buffer can allocate/upload them.
//
// split_dim == 1: each slice is a 3d view into `tensor` covering splits[i]
//                 rows at a running byte offset (rows are contiguous chunks).
// split_dim == 0: each slice is a freshly created tensor of splits[i] columns;
//                 it does NOT alias `tensor`'s data (presumably filled later
//                 by the backend — TODO confirm the load path for this case).
// A zero entry in `splits` yields a NULL slice for that device.
static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
    GGML_ASSERT(split_dim == 0 || split_dim == 1);
    GGML_ASSERT(splits.size() > 1);
    std::string name{tensor->name};
    split_tensor.tensor_splits.resize(splits.size());
    if (split_dim == 1) {
        size_t offset = 0; // running byte offset of the next row chunk within `tensor`
        for (int i = 0; i < int(splits.size()); ++i) {
            if (splits[i] > 0) {
                split_tensor.tensor_splits[i] = ggml_view_3d(ctx, tensor, tensor->ne[0], splits[i], tensor->ne[2], tensor->nb[1], tensor->nb[2], offset);
                // name each slice "<tensor name>.<device index>"
                auto name_i = name + '.' + std::to_string(i);
                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
                offset += tensor->nb[1]*splits[i];
            } else {
                split_tensor.tensor_splits[i] = nullptr;
            }
        }
    } else {
        for (int i = 0; i < int(splits.size()); ++i) {
            if (splits[i] > 0) {
                split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]);
                auto name_i = name + '.' + std::to_string(i);
                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
            } else {
                split_tensor.tensor_splits[i] = nullptr;
            }
        }
    }
    split_tensor.ggml.n_device = splits.size();
    split_tensor.ggml.split_dim = split_dim;
    // NOTE(review): `ggml.splits` aliases tensor_splits.data() and tensor->extra
    // aliases &split_tensor.ggml — `split_tensor` must stay at a stable address
    // (no container reallocation) for the lifetime of `tensor`.
    split_tensor.ggml.splits = split_tensor.tensor_splits.data();
    tensor->extra = (void *)&split_tensor.ggml;
}
bool create_tensors_helper::create_tensors() {
const auto tn = LLM_TN(model.arch);
bool use_mmap_buffer = true;
@@ -2761,6 +2901,71 @@ bool create_tensors_helper::create_tensors() {
default:
throw std::runtime_error("unknown architecture");
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
const auto & hparams = model.hparams;
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
printf("GQA ratio: %d\n", gqa_ratio);
for (int il = 0; il < int(model.layers.size()); ++il) {
auto & layer = model.layers[il];
auto ctx_split = ctx_for_layer_split(il);
if (layer.wo && layer.wq && layer.wk && layer.wv) {
int attn_granularity = hparams.n_embd_head_k;
if (ggml_is_quantized(layer.wo->type)) {
auto tt = ggml_internal_get_type_traits(layer.wo->type);
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
}
GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split);
for (auto & s : split) s /= gqa_ratio;
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split);
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split);
}
if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split);
}
if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split);
}
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split);
}
}
if (model.output) {
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
auto split = create_split(model.output->ne[1], 16, model.splits);
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split);
}
}
return use_mmap_buffer;
}

View File

@@ -183,6 +183,21 @@ struct llama_layer {
struct ggml_tensor * bqk = nullptr;
struct ggml_tensor * bkv = nullptr;
llama_split_tensor split_wq;
llama_split_tensor split_wk;
llama_split_tensor split_wv;
llama_split_tensor split_wo;
llama_split_tensor split_wqkv;
llama_split_tensor split_wqk;
llama_split_tensor split_wkv;
llama_split_tensor split_bq;
llama_split_tensor split_bk;
llama_split_tensor split_bv;
llama_split_tensor split_bo;
llama_split_tensor split_bqkv;
llama_split_tensor split_bqk;
llama_split_tensor split_bkv;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;
@@ -205,12 +220,20 @@ struct llama_layer {
struct ggml_tensor * ffn_down_enc = nullptr;
struct ggml_tensor * ffn_up_enc = nullptr;
llama_split_tensor split_ffn_up;
llama_split_tensor split_ffn_gate;
llama_split_tensor split_ffn_down;
// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
llama_split_tensor split_ffn_up_exps;
llama_split_tensor split_ffn_gate_exps;
llama_split_tensor split_ffn_down_exps;
// ff MoE bias
struct ggml_tensor * ffn_gate_inp_b = nullptr;
struct ggml_tensor * ffn_gate_exps_b = nullptr;
@@ -226,6 +249,10 @@ struct llama_layer {
struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr;
llama_split_tensor split_ffn_up_shexp;
llama_split_tensor split_ffn_gate_shexp;
llama_split_tensor split_ffn_down_shexp;
// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -298,6 +325,8 @@ struct llama_model {
struct ggml_tensor * output_b;
struct ggml_tensor * output_norm_enc;
llama_split_tensor split_output;
std::vector<llama_layer> layers;
llama_split_mode split_mode;
@@ -358,6 +387,11 @@ struct llama_model {
}
void set_tensor_overrides(const llama_model_params& params);
int device_count() const;
ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;
std::vector<float> splits;
};
struct llama_lora_weight {

View File

@@ -483,6 +483,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split);
}
// Number of compute devices available to this model.
// Thin member wrapper around the file-local llama_get_device_count() so that
// code outside this translation unit can query the count through the model.
int llama_model::device_count() const {
    return llama_get_device_count(*this);
}
// Default buffer type used when offloading tensors to `device`.
// Thin member wrapper around the file-local llama_default_buffer_type_offload()
// so that code outside this translation unit can resolve it through the model.
ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
    return llama_default_buffer_type_offload(*this, device);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
@@ -626,42 +634,35 @@ static bool llama_kv_cache_init(
}
}
bool split_cache = false;
if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
cache.split_k_l.reserve(n_layer);
cache.split_v_l.reserve(n_layer);
split_cache = true;
}
// count used buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
if (offload) {
for (int64_t i = 0; i < n_layer; ++i) {
buft_layer_count[model.buft_layer[i].buft]++;
if (split_cache) {
buft_layer_count[model.buft_layer[i].buft_matrix]++;
} else {
buft_layer_count[model.buft_layer[i].buft]++;
}
}
} else {
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
}
//if (cparams.fused_moe_up_gate) {
// int nbad = 0;
// for (int i = 0; i < (int) n_layer; i++) {
// auto& layer = model.layers[i];
// if (layer.ffn_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps->type != layer.ffn_up_exps->type) {
// ++nbad;
// }
// }
// if (nbad > 0) {
// if (nbad == (int)n_layer) {
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different type => disabling fmoe\n");
// const_cast<llama_cparams&>(cparams).fused_moe_up_gate = false;
// }
// else {
// LLAMA_LOG_WARN("=============== ffn_up and ffn_gate are of different in %d out of %d layers, where fmoe will be disabled\n",
// nbad, (int)n_layer);
// }
// }
//}
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
int n_layers = it.second;
size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead();
if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead();
struct ggml_init_params params = {
/*.mem_size =*/ 5u*n_layers*ggml_tensor_overhead(),
/*.mem_size =*/ ctx_mem_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@@ -698,8 +699,8 @@ static bool llama_kv_cache_init(
}
}
cache.k_l.reserve(n_layer);
bool needs_v_cache = true;
cache.k_l.reserve(n_layer);
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
}
@@ -711,11 +712,10 @@ static bool llama_kv_cache_init(
const uint32_t n_head_kv = hparams.n_head_kv(i);
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k;
ggml_tensor * v;
if (cparams.mla_attn) {
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
// DeepSeek MLA
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
@@ -744,6 +744,36 @@ static bool llama_kv_cache_init(
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
if (split_cache) {
auto K = model.layers[i].wk;
auto V = model.layers[i].wv;
if (K && V && K->extra && V->extra) {
auto extra_K = (const ggml_split_tensor_t *)K->extra;
auto extra_V = (const ggml_split_tensor_t *)V->extra;
auto & split_k_l = cache.split_k_l.emplace_back();
auto & split_v_l = cache.split_v_l.emplace_back();
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
for (int is = 0; is < extra_K->n_device; ++is) {
auto split = extra_K->splits[is];
if (!split) continue;
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, split->ne[1]/n_embd_head_k * kv_size);
}
split_k_l.ggml.n_device = extra_K->n_device;
split_k_l.ggml.split_dim = 0;
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
for (int is = 0; is < extra_V->n_device; ++is) {
auto split = extra_V->splits[is];
if (!split) continue;
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
}
split_v_l.ggml.n_device = extra_V->n_device;
split_v_l.ggml.split_dim = 0;
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
} else {
printf("Oops: don't have yet K and V for layer %d\n", i);
}
}
}
}
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
@@ -1652,10 +1682,7 @@ static bool llm_load_tensors(
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
// int device_count = llama_get_device_count(model);
int device_count = model.devices.size();
if (int device_count = model.devices.size(); device_count > 1) {
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
@@ -1676,36 +1703,31 @@ static bool llm_load_tensors(
for (int i = 0; i < device_count; ++i) {
splits[i] /= split_sum;
}
model.splits = std::move(splits);
} else {
model.splits = { 1.0f };
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
int device_count = model.splits.size();
// assign the repeating layers to the devices according to the splits
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
#ifndef NDEBUG
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
const char* name = ggml_backend_buft_name(buft);
LLAMA_LOG_DEBUG("load_tensors: layers %3d assigned to backend %s\n", i,
name);
#endif
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
#ifndef NDEBUG
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
const char* name = ggml_backend_buft_name(buft);
LLAMA_LOG_DEBUG("load_tensors: output layers assigned to backend %s\n",
name);
#endif
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - model.splits.begin();
model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) {
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data());
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);