Faster tensor name formatting (#860)

* Add fused mul+multi_add, along with a CPU implementation

* fused mul+multi_add: add a command-line argument to disable it

* Faster tensor name formatting

We gain ~1% for Ling-mini-2.0 when running on CUDA (a micro-benchmark sketch follows the commit message).

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
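Why a string format shows up in profiles at all: the cb callback below runs for every tensor each time the compute graph is built, and ggml_format_name goes through vsnprintf, which re-parses the format string on every call. The following is a minimal, self-contained micro-benchmark sketch comparing the two approaches; MAX_NAME, format_snprintf, format_manual, and bench_ms are illustrative names, not part of ggml, and the absolute numbers will vary by machine.

    // Hypothetical micro-benchmark (not part of the patch): the snprintf-based
    // path taken by ggml_format_name vs. the manual copy/append from this commit.
    #include <chrono>
    #include <cstdio>
    #include <string>

    constexpr int MAX_NAME = 64;           // stand-in for GGML_MAX_NAME
    static volatile unsigned char g_sink;  // keeps the buffer stores observable

    static void format_snprintf(char * dst, const char * name, int il) {
        std::snprintf(dst, MAX_NAME, "%s-%d", name, il);
    }

    static void format_manual(char * dst, const char * name, int il) {
        int j = 0;
        for (; j < MAX_NAME - 1; ++j) {
            dst[j] = name[j];
            if (!name[j]) break;           // j stops on the copied '\0'
        }
        if (j < MAX_NAME - 3) {            // room for '-', >= 1 digit, '\0'
            dst[j++] = '-';
            const std::string sil = std::to_string(il);
            for (int k = 0; k < (int)sil.size() && j < MAX_NAME - 1; ++k) {
                dst[j++] = sil[k];
            }
        }
        dst[j] = 0;
    }

    template <typename F>
    static double bench_ms(F f) {
        char buf[MAX_NAME];
        const auto t0 = std::chrono::steady_clock::now();
        for (int i = 0; i < 1000000; ++i) {
            f(buf, "ffn_out", i & 63);     // typical tensor name + layer index
            g_sink = (unsigned char)buf[0];
        }
        const auto t1 = std::chrono::steady_clock::now();
        return std::chrono::duration<double, std::milli>(t1 - t0).count();
    }

    int main() {
        std::printf("snprintf: %8.2f ms / 1M calls\n", bench_ms(format_snprintf));
        std::printf("manual  : %8.2f ms / 1M calls\n", bench_ms(format_manual));
        return 0;
    }

Whether the per-call saving is visible end to end depends on how many tensors the graph names per decode step.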
commit 2522c97dc9 (parent db3ba4999f)
Author: Kawrakow
Date: 2025-10-24 07:46:18 +03:00 (committed via GitHub)
2 changed files with 59 additions and 15 deletions

@@ -1342,6 +1342,8 @@ ggml_cgraph * llm_build_context::build_llama() {
     n_tokens = n_outputs;
     cur = ggml_get_rows(ctx0, cur, inp_out_ids);
     inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+    cb(cur, "last_attn", il);
+    cb(inpSA, "last_ffn_inp", il);
 }
 // For Granite architecture
@@ -5942,6 +5944,8 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
     n_tokens = n_outputs;
     cur = ggml_get_rows(ctx0, cur, inp_out_ids);
    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+    cb(cur, "last_attn", il);
+    cb(inpSA, "last_ffn_inp", il);
 }
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8040,7 +8044,20 @@ ggml_cgraph * llm_build_context::llama_build_graph(
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
-            ggml_format_name(cur, "%s-%d", name, il);
+            int j = 0;
+            for (; j < GGML_MAX_NAME - 1; ++j) {
+                cur->name[j] = name[j];
+                if (!name[j]) break;
+            }
+            if (j < GGML_MAX_NAME - 3) {
+                cur->name[j++] = '-';
+                auto sil = std::to_string(il);
+                for (int k = 0; k < (int)sil.size() && j < GGML_MAX_NAME - 1; ++k) {
+                    cur->name[j++] = sil[k];
+                }
+            }
+            cur->name[j] = 0;
+            //ggml_format_name(cur, "%s-%d", name, il);
         } else {
             ggml_set_name(cur, name);
         }
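For reference, the added logic truncates in two places: the name copy stops at GGML_MAX_NAME - 1 bytes, and the "-<il>" suffix is only started when there is room for the dash, at least one digit, and the terminator, so the layer index itself can be cut short. Below is a standalone sketch of the same logic with checks for both paths; format_name and MAX_NAME are illustrative stand-ins for the in-tree code, not ggml API.

    // Standalone copy of the formatting logic from the hunk above, plus
    // edge-case checks. Names and buffer size are illustrative.
    #include <cassert>
    #include <cstring>
    #include <string>

    constexpr int MAX_NAME = 64; // stand-in for GGML_MAX_NAME

    static void format_name(char * dst, const char * name, int il) {
        int j = 0;
        for (; j < MAX_NAME - 1; ++j) {
            dst[j] = name[j];
            if (!name[j]) break;           // j stops on the copied '\0'
        }
        if (j < MAX_NAME - 3) {            // room for '-', >= 1 digit, '\0'
            dst[j++] = '-';                // overwrites the copied '\0'
            const std::string sil = std::to_string(il);
            for (int k = 0; k < (int)sil.size() && j < MAX_NAME - 1; ++k) {
                dst[j++] = sil[k];
            }
        }
        dst[j] = 0;
    }

    int main() {
        char buf[MAX_NAME];

        format_name(buf, "last_ffn_inp", 27);
        assert(std::strcmp(buf, "last_ffn_inp-27") == 0);

        // A name that already fills the buffer is truncated and gets no suffix.
        const std::string longname(2 * MAX_NAME, 'x');
        format_name(buf, longname.c_str(), 7);
        assert(std::strlen(buf) == MAX_NAME - 1);
        assert(std::strchr(buf, '-') == nullptr);

        // When the suffix barely fits, the layer index itself is cut short:
        // a 60-char name leaves room for '-' and only two of the three digits.
        const std::string tight(MAX_NAME - 4, 'x');
        format_name(buf, tight.c_str(), 123);
        assert(std::strcmp(buf + (MAX_NAME - 4), "-12") == 0);
        return 0;
    }

For graph tensor names these limits are comfortable in practice, since names like "last_ffn_inp" plus a two-digit layer index sit well under the buffer size.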