Kawrakow
2025-11-25 14:51:33 +00:00
parent 5ea430aaa4
commit 32c6df015b
8 changed files with 449 additions and 118 deletions


@@ -192,6 +192,11 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
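// Note added for clarity: with more than one split, every weight additionally
// gets a per-device view or copy (see prepare_split_tensors() further down),
// which is presumably why the metadata overhead is scaled by the device count.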
if (model.splits.size() > 1) {
ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
ctx_size *= model.splits.size();
}
for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -205,10 +210,78 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
ctx_map[it.first] = ctx;
model.ctxs.push_back(ctx);
}
printf("=======================================================================\n");
auto n_device = model.device_count();
printf(" Model has %d devices:\n", n_device);
for (int device = 0; device < n_device; ++device) {
auto buft = model.default_buffer_type_offload(device);
if (buft) {
printf(" %d %s\n", device, ggml_backend_buft_name(buft));
} else {
printf(" Oops: null buft for debvice %d\n", device);
}
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
printf("model.splits:");
for (auto s : model.splits) printf(" %g", s);
printf("\n");
}
}
static std::vector<int> create_split(int nr, int granularity, const std::vector<float> & splits) {
GGML_ASSERT(nr % granularity == 0);
GGML_ASSERT(!splits.empty());
int nchunk = nr / granularity;
std::vector<int> result(splits.size());
float last_split = 0;
int sum = 0;
for (int i = 0; i < (int)splits.size(); ++i) {
float p = splits[i] - last_split;
result[i] = roundf(p*nchunk);
sum += result[i];
last_split = splits[i];
}
while (sum > nchunk) {
last_split = 0;
float best_err = std::numeric_limits<float>::max();
int ibest = -1;
for (int i = 0; i < (int)splits.size(); ++i) {
if (result[i] > 0) {
float n_want = (splits[i] - last_split)*nchunk;
float err = std::abs(n_want - result[i] + 1);
if (err < best_err) {
best_err = err; ibest = i;
}
}
last_split = splits[i];
}
GGML_ASSERT(ibest >= 0 && result[ibest] > 0);
--result[ibest];
--sum;
}
while (sum < nchunk) {
last_split = 0;
float best_err = std::numeric_limits<float>::max();
int ibest = -1;
for (int i = 0; i < (int)splits.size(); ++i) {
float n_want = (splits[i] - last_split)*nchunk;
float err = std::abs(n_want - result[i] - 1);
if (err < best_err) {
best_err = err; ibest = i;
}
last_split = splits[i];
}
GGML_ASSERT(ibest >= 0);
++result[ibest];
++sum;
}
for (auto & r : result) r *= granularity;
return result;
}
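// Illustration added for this write-up, not part of the commit: a quick sanity
// check of create_split. `splits` holds cumulative fractions, so each device's
// share is the difference to the previous entry, rounded to whole chunks of
// `granularity` rows; the two correction loops above then walk any rounding
// drift back to zero, each step adjusting the device whose count stays closest
// to its ideal share.
static void test_create_split() {
    auto rows = create_split(4096, 128, {0.25f, 1.0f});
    // nchunk = 4096/128 = 32 chunks; device 0: round(0.25*32) = 8 -> 1024 rows,
    // device 1: round(0.75*32) = 24 -> 3072 rows; sum == nchunk, no correction.
    GGML_ASSERT(rows[0] == 1024 && rows[1] == 3072);
}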
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
int flags, ggml_context ** actual_context) {
//auto requested_ctx = ctx;
if (ml.tensor_buft_overrides) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
@@ -220,7 +293,40 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
}
}
if (actual_context) *actual_context = ctx;
return ml.create_tensor(ctx, name, ne, flags);
auto tensor = ml.create_tensor(ctx, name, ne, flags);
//if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
// int i_layer = -1;
// if (auto pos = name.find("blk."); pos == 0) {
// GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
// }
// if (i_layer >= 0) {
// auto & layer = model.layers[i_layer];
// auto & hparams = model.hparams;
// if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
// auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
// printf("%s(%s):", __func__, name.c_str());
// for (auto s : split) printf(" %d", s);
// printf("\n");
// layer.split_wq.tensor_splits.resize(split.size());
// size_t offset = 0;
// for (int i = 0; i < (int)split.size(); ++i) {
// if (split[i] > 0) {
// layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
// auto split_name = name + '.' + std::to_string(i);
// ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
// offset += tensor->nb[1]*split[i];
// } else {
// layer.split_wq.tensor_splits[i] = nullptr;
// }
// }
// layer.split_wq.ggml.n_device = split.size();
// layer.split_wq.ggml.split_dim = 1;
// layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
// }
// }
//}
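// (Comment added for clarity: the disabled block above is an earlier, Q-only
// prototype of the tensor splitting that is now done generically by
// prepare_split_tensors() further down in this file.)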
return tensor;
//return ml.create_tensor(ctx, name, ne, flags);
}
#define LOADING_PRELUDE \
@@ -2638,6 +2744,40 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
return fused_qkv;
}
static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
GGML_ASSERT(split_dim == 0 || split_dim == 1);
GGML_ASSERT(splits.size() > 1);
std::string name{tensor->name};
split_tensor.tensor_splits.resize(splits.size());
if (split_dim == 1) {
size_t offset = 0;
for (int i = 0; i < int(splits.size()); ++i) {
if (splits[i] > 0) {
split_tensor.tensor_splits[i] = ggml_view_3d(ctx, tensor, tensor->ne[0], splits[i], tensor->ne[2], tensor->nb[1], tensor->nb[2], offset);
auto name_i = name + '.' + std::to_string(i);
ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
offset += tensor->nb[1]*splits[i];
} else {
split_tensor.tensor_splits[i] = nullptr;
}
}
} else {
for (int i = 0; i < int(splits.size()); ++i) {
if (splits[i] > 0) {
split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, splits[i], tensor->ne[1], tensor->ne[2]);
auto name_i = name + '.' + std::to_string(i);
ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
} else {
split_tensor.tensor_splits[i] = nullptr;
}
}
}
split_tensor.ggml.n_device = splits.size();
split_tensor.ggml.split_dim = split_dim;
split_tensor.ggml.splits = split_tensor.tensor_splits.data();
tensor->extra = (void *)&split_tensor.ggml;
}
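// Note added for clarity, not in the commit: ggml stores ne[0] contiguously,
// so a split_dim == 1 slice is a contiguous range of rows and can be a plain
// ggml_view_3d into the already-loaded weight, while a split_dim == 0 slice
// cuts through every row and needs a freshly allocated tensor that is filled
// by a strided copy later. A hypothetical sketch of such a copy for an F32
// tensor follows; the helper name and the assumption that the pieces are
// already allocated are mine, not the commit's (assumes <cstring> is
// included, as it is in llama.cpp).
static void fill_dim0_splits_f32(const ggml_tensor * src, llama_split_tensor & split_tensor, const std::vector<int> & splits) {
    GGML_ASSERT(src->type == GGML_TYPE_F32);
    int64_t i0 = 0; // running column offset into src
    for (int i = 0; i < int(splits.size()); ++i) {
        auto dst = split_tensor.tensor_splits[i];
        if (!dst) continue; // splits[i] == 0, nothing to copy
        for (int64_t i2 = 0; i2 < src->ne[2]; ++i2) {
            for (int64_t i1 = 0; i1 < src->ne[1]; ++i1) {
                auto s = (const char *)src->data + i2*src->nb[2] + i1*src->nb[1] + i0*sizeof(float);
                auto d = (char *)dst->data + i2*dst->nb[2] + i1*dst->nb[1];
                memcpy(d, s, splits[i]*sizeof(float)); // one contiguous run per row
            }
        }
        i0 += splits[i];
    }
}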
bool create_tensors_helper::create_tensors() {
const auto tn = LLM_TN(model.arch);
bool use_mmap_buffer = true;
@@ -2761,6 +2901,71 @@ bool create_tensors_helper::create_tensors() {
default:
throw std::runtime_error("unknown architecture");
}
if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
const auto & hparams = model.hparams;
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
printf("GQA ratio: %d\n", gqa_ratio);
for (int il = 0; il < int(model.layers.size()); ++il) {
auto & layer = model.layers[il];
auto ctx_split = ctx_for_layer_split(il);
if (layer.wo && layer.wq && layer.wk && layer.wv) {
int attn_granularity = hparams.n_embd_head_k;
if (ggml_is_quantized(layer.wo->type)) {
auto tt = ggml_internal_get_type_traits(layer.wo->type);
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
}
GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split);
for (auto & s : split) s /= gqa_ratio;
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split);
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split);
}
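// Worked example (comment added for illustration): with n_head = 32,
// n_head_kv = 8 and n_embd_head_k = 128, gqa_ratio = 4 and wo->ne[0] =
// 32*128 = 4096. A Q/O split of, say, {1024, 3072} rows becomes {256, 768}
// for wk/wv, i.e. 2 and 6 KV heads: exactly the heads that the 8 and 24
// query heads on each device attend to.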
if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
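// Example (comment added for illustration): for Q4_K the ggml block size is
// 256, so ffn_granularity grows from the default 16 to 256, keeping every
// dim-0 slice of ffn_down aligned to whole quantization blocks along ne[0].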
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split);
}
if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split);
}
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split);
}
}
if (model.output) {
auto ctx_split = ctx_map[model.buft_output.buft_matrix];
auto split = create_split(model.output->ne[1], 16, model.splits);
prepare_split_tensors(1, ctx_split, model.output, model.split_output, split);
}
}
return use_mmap_buffer;
}