mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-10 22:10:20 +00:00
WIP tensor overrides
Runs, but produces wrong results; I don't yet see where the issue could be.
This commit is contained in:
@@ -1075,36 +1075,83 @@ llm_expert_gating_func_type gating_op,
|
||||
llm_ffn_op_type type_op_shexp,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph) {
|
||||
|
||||
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
|
||||
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
|
||||
auto split_down_exps = (ggml_split_tensor_t *)down_exps->extra;
|
||||
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
|
||||
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
|
||||
auto split_down_exps = (ggml_split_tensor_t *)down_exps->extra;
|
||||
auto split_up_shexp = up_shexp ? (ggml_split_tensor_t *)up_shexp->extra : nullptr;
|
||||
auto split_gate_shexp = gate_shexp ? (ggml_split_tensor_t *)gate_shexp->extra : nullptr;
|
||||
auto split_down_shexp = down_shexp ? (ggml_split_tensor_t *)down_shexp->extra : nullptr;
|
||||
auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr;
|
||||
auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr;
|
||||
auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr;
|
||||
if (!split_up_exps && !split_gate_exps && !split_down_exps) {
|
||||
auto cur = input;
|
||||
if (ffn_norm) {
|
||||
cur = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
|
||||
cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_inp_normed", il);
|
||||
}
|
||||
else if (cur->type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
}
|
||||
auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp;
|
||||
auto the_gate_inp_b = gate_inp_b ? gate_inp_b->extra ? ((ggml_split_tensor_t *)gate_inp_b->extra)->splits[lctx.model.main_gpu] : gate_inp_b : nullptr;
|
||||
auto the_exp_probs_b = exp_probs_b ? exp_probs_b->extra ? ((ggml_split_tensor_t *)exp_probs_b->extra)->splits[lctx.model.main_gpu] : exp_probs_b : nullptr;
|
||||
//printf("Using non-split llm_build_moe_ffn for layer %d\n", il);
|
||||
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
|
||||
gate_inp, gate_inp_b,
|
||||
the_gate_inp, the_gate_inp_b,
|
||||
up_exps, up_exps_b,
|
||||
gate_exps, gate_exps_b,
|
||||
down_exps, down_exps_b,
|
||||
exp_probs_b,
|
||||
the_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
type_op, norm_w, scale_w, w_scale,
|
||||
gating_op, cb, il, graph);
|
||||
cb(routed_out, "routed_out", il);
|
||||
|
||||
if (up_shexp && gate_shexp && down_shexp) {
|
||||
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
|
||||
up_shexp, up_b_shexp, nullptr,
|
||||
gate_shexp, gate_b_shexp, nullptr,
|
||||
down_shexp, down_b_shexp, nullptr,
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
|
||||
cb(shared_out, "ffn_shexp_out", il);
|
||||
|
||||
cur = ggml_add(ctx, routed_out, shared_out);
|
||||
cb(cur, "ffn_out", il);
|
||||
if (split_up_shexp) {
|
||||
//printf("Using split ffn for shared experts in layer %d\n", il);
|
||||
std::vector<ggml_tensor *> results(split_up_shexp->n_device);
|
||||
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_shexp->n_device);
|
||||
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_shexp->n_device);
|
||||
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_shexp->n_device);
|
||||
for (int id = 0; id < split_up_shexp->n_device; ++id) {
|
||||
int il_cb = 1000*id + il;
|
||||
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
|
||||
split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
|
||||
split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr,
|
||||
split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr,
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
|
||||
cb(shared_out, "ffn_shexp_out", il_cb);
|
||||
if (shared_out->ne[1] > 32) {
|
||||
shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
|
||||
}
|
||||
results[id] = shared_out;
|
||||
}
|
||||
auto cur = ggml_add(ctx, results[0], results[1]);
|
||||
cur->op_params[0] = 0xff;
|
||||
cb(cur, "ffn_shared_combined", il);
|
||||
for (int id = 2; id < int(results.size()); ++id) {
|
||||
cur = ggml_add(ctx, cur, results[id]);
|
||||
cb(cur, "ffn_shared_combined", il);
|
||||
}
|
||||
if (cur->type == GGML_TYPE_F16) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
}
|
||||
cur = ggml_add(ctx, routed_out, cur);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
//printf("Using non-split ffn for shared experts in layer %d\n", il);
|
||||
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
|
||||
up_shexp, up_b_shexp, nullptr,
|
||||
gate_shexp, gate_b_shexp, nullptr,
|
||||
down_shexp, down_b_shexp, nullptr,
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
|
||||
cb(shared_out, "ffn_shexp_out", il);
|
||||
cur = ggml_add(ctx, routed_out, shared_out);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
} else {
|
||||
cur = routed_out;
|
||||
}
|
||||
@@ -1113,16 +1160,12 @@ llm_expert_gating_func_type gating_op,
|
||||
GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
|
||||
GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
|
||||
std::vector<ggml_tensor *> results(split_up_exps->n_device);
|
||||
auto split_up_shexp = up_shexp ? (ggml_split_tensor_t *)up_shexp->extra : nullptr;
|
||||
auto split_gate_shexp = gate_shexp ? (ggml_split_tensor_t *)gate_shexp->extra : nullptr;
|
||||
auto split_down_shexp = down_shexp ? (ggml_split_tensor_t *)down_shexp->extra : nullptr;
|
||||
GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
|
||||
( split_up_shexp && split_gate_shexp && split_down_shexp));
|
||||
auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
|
||||
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
|
||||
auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
|
||||
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
|
||||
if (gate_inp_b || up_exps_b || gate_exps_b || down_exps_b) printf("Have expert biases %p, %p, %p, %p\n", (void *)gate_inp_b, (void *)up_exps_b, (void *)gate_exps_b, (void *)down_exps_b);
|
||||
for (int id = 0; id < split_up_exps->n_device; ++id) {
|
||||
int il_cb = 1000*(id + 1) + il;
|
||||
auto cur = input;
|
||||
@@ -1147,9 +1190,6 @@ llm_expert_gating_func_type gating_op,
|
||||
cb(routed_out, "routed_out", il_cb);
|
||||
|
||||
if (split_up_shexp) {
|
||||
auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr;
|
||||
auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr;
|
||||
auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr;
|
||||
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device);
|
||||
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device);
|
||||
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device);
|
||||
@@ -1499,12 +1539,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (q_norm) {
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Qcur);
|
||||
}
|
||||
if (k_norm) {
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Kcur);
|
||||
}
|
||||
@@ -1536,12 +1576,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (q_norm) {
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Qcur);
|
||||
}
|
||||
if (k_norm) {
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Kcur);
|
||||
}
|
||||
@@ -1559,7 +1599,7 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
|
||||
auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, K->ne[0]/n_embd_head, n_tokens);
|
||||
if (k_norm) {
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
}
|
||||
auto Vcur = V;
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <array>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
#include <unordered_set>
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
|
||||
@@ -159,6 +160,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
ggml_context * ctx_output;
|
||||
ggml_context * ctx_output_split;
|
||||
|
||||
std::unordered_set<ggml_tensor *> split_tensors;
|
||||
|
||||
inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) {
|
||||
if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second;
|
||||
|
||||
@@ -292,7 +295,7 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
|
||||
|
||||
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
||||
int flags, ggml_context ** actual_context) {
|
||||
//auto requested_ctx = ctx;
|
||||
auto requested_ctx = ctx;
|
||||
if (ml.tensor_buft_overrides) {
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
@@ -305,39 +308,11 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
|
||||
}
|
||||
if (actual_context) *actual_context = ctx;
|
||||
auto tensor = ml.create_tensor(ctx, name, ne, flags);
|
||||
//if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
|
||||
// int i_layer = -1;
|
||||
// if (auto pos = name.find("blk."); pos == 0) {
|
||||
// GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);
|
||||
// }
|
||||
// if (i_layer >= 0) {
|
||||
// auto & layer = model.layers[i_layer];
|
||||
// auto & hparams = model.hparams;
|
||||
// if (auto pos = name.find("attn_q.weight"); pos != std::string::npos) {
|
||||
// auto split = create_split(tensor->ne[1], hparams.n_embd_head_k, model.splits);
|
||||
// printf("%s(%s):", __func__, name.c_str());
|
||||
// for (auto s : split) printf(" %d", s);
|
||||
// printf("\n");
|
||||
// layer.split_wq.tensor_splits.resize(split.size());
|
||||
// size_t offset = 0;
|
||||
// for (int i = 0; i < (int)split.size(); ++i) {
|
||||
// if (split[i] > 0) {
|
||||
// layer.split_wq.tensor_splits[i] = ggml_view_2d(ctx, tensor, tensor->ne[0], split[i], tensor->nb[1], offset);
|
||||
// auto split_name = name + '.' + std::to_string(i);
|
||||
// ggml_set_name(layer.split_wq.tensor_splits[i], split_name.c_str());
|
||||
// offset += tensor->nb[1]*split[i];
|
||||
// } else {
|
||||
// layer.split_wq.tensor_splits[i] = nullptr;
|
||||
// }
|
||||
// }
|
||||
// layer.split_wq.ggml.n_device = split.size();
|
||||
// layer.split_wq.ggml.split_dim = 1;
|
||||
// layer.split_wq.ggml.splits = layer.split_wq.tensor_splits.data();
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
if (tensor && ctx == requested_ctx) {
|
||||
printf("%s: adding tensor %s to split tensors\n", __func__, tensor->name);
|
||||
split_tensors.insert(tensor);
|
||||
}
|
||||
return tensor;
|
||||
//return ml.create_tensor(ctx, name, ne, flags);
|
||||
}
|
||||
|
||||
#define LOADING_PRELUDE \
|
||||
@@ -2998,41 +2973,45 @@ bool create_tensors_helper::create_tensors() {
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split, mem_used);
|
||||
}
|
||||
|
||||
//bool any_ffn_split = false;
|
||||
if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
|
||||
split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
|
||||
split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
|
||||
if (use_split) {
|
||||
//any_ffn_split = true;
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
|
||||
}
|
||||
|
||||
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
|
||||
//printf("=== Layer %d routed experts, %s, %s, %s:\n", il, ggml_type_name(layer.ffn_down_exps->type), ggml_type_name(layer.ffn_gate_exps->type), ggml_type_name(layer.ffn_up_exps->type));
|
||||
//printf("mem_used:"); for (auto mem : mem_used) printf(" %8.2f", mem/1024./1024.);
|
||||
//printf(" MiB\n");
|
||||
//printf(" down:");
|
||||
//for (auto split : layer.split_ffn_down_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
|
||||
//printf("\n");
|
||||
//printf(" gate:");
|
||||
//for (auto split : layer.split_ffn_gate_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
|
||||
//printf("\n");
|
||||
//printf(" up:");
|
||||
//for (auto split : layer.split_ffn_up_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
|
||||
//printf("\n");
|
||||
bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
|
||||
split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
|
||||
split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
|
||||
|
||||
if (use_split) {
|
||||
//any_ffn_split = true;
|
||||
int ffn_granularity = 16;
|
||||
if (ggml_is_quantized(layer.ffn_down_exps->type)) {
|
||||
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
|
||||
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
|
||||
}
|
||||
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits);
|
||||
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
|
||||
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
|
||||
}
|
||||
}
|
||||
|
||||
//if (any_ffn_split) {
|
||||
if (layer.ffn_gate_inp) {
|
||||
auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits);
|
||||
prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used);
|
||||
@@ -3041,7 +3020,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits);
|
||||
prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used);
|
||||
}
|
||||
}
|
||||
//}
|
||||
}
|
||||
|
||||
if (model.output) {
|
||||
|
||||
Reference in New Issue
Block a user