Async compute graph evaluation (2 or more GPUs) (#1089)

* WIP: absorb adding input into std_attn and std_ffn

* WIP: NCCL infra

* WIP: add reduce and fake_cpy ops

* WIP

* WIP: graph appears to work, layer is broken

* WIP: Qwen3-MoE works with graph, layer still broken

* WIP: GLM-4.5 graph works

* WIP: fix sm layer (dense)

* WIP: fix sm layer (MoE)

* WIP: fast PP with bespoke 4-GPU NCCL

I guess I'm not using NCCL the right way, as PP performance is
very low with a single communicator group for 3 or more GPUs.
But if I create 4 communicator groups for pairs of GPUs,
(0,1), (2,3), (0,2), (1,3), and use those, PP is fast: I'm hitting
1500 t/s for L3-70B on the 4x3090 system, which is
~20% better than the previous sm graph without NCCL.
But that cannot be the real solution (I cannot be creating pairwise
communicators and the associated logic for every possible number of GPUs).
A sketch of the pairwise setup is included after this list.

* WIP: Cohere2

* Explicitly set device

* Bespoke 3-GPU case

* WIP

* Do not repeat get_rows multiple times

* Fix 3 GPUs

* OK, let's leave it in

* Simple async

* This sync seems enough

* Only do async for 4 or more backends

With 2 GPUs (so 3 backends), not using async is slightly faster

* Scheduler changes

* Use OpenMP if available

Surprisingly (at least to me), this is quite a bit faster than
std::thread and std::barrier; a sketch of the idea is included after
this list. GLM-4.5-AIR with 4 GPUs is now at 105 t/s at zero context!

* Do not use OpenMP if there are tensor overrides

* Set omp max active levels

* Be more careful to set the device before using a stream

* Command line option to turn on async. Set to false by default for now
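
For reference, a minimal sketch of the pairwise-communicator workaround
described in the "fast PP with bespoke 4-GPU NCCL" item: one 2-rank NCCL
communicator per GPU pair (0,1), (2,3), (0,2), (1,3). This is not the PR's
actual code; the struct and function names are illustrative and error
checking is omitted.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

struct pair_comm {
    int        dev[2];   // the two GPUs in this pair
    ncclComm_t comm[2];  // one communicator handle per rank
};

static std::vector<pair_comm> init_pairwise_comms() {
    const int pairs[4][2] = { {0,1}, {2,3}, {0,2}, {1,3} };
    std::vector<pair_comm> result(4);
    for (int p = 0; p < 4; ++p) {
        ncclUniqueId id;
        ncclGetUniqueId(&id);              // one unique id per 2-rank communicator
        ncclGroupStart();                  // required when one thread creates both ranks
        for (int r = 0; r < 2; ++r) {
            result[p].dev[r] = pairs[p][r];
            cudaSetDevice(pairs[p][r]);    // this rank lives on this device
            ncclCommInitRank(&result[p].comm[r], /*nranks =*/ 2, id, /*rank =*/ r);
        }
        ncclGroupEnd();
    }
    return result;
}
```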
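And a minimal sketch of the OpenMP idea from "Use OpenMP if available":
run one worker per backend with an OpenMP parallel for when OpenMP is
present, and fall back to plain std::thread otherwise. The function name
and its callback are illustrative, not the scheduler's real interface.

```cpp
#include <functional>
#include <thread>
#include <vector>

// Run work(b) once per backend, in parallel.
static void run_per_backend(int n_backends, const std::function<void(int)> & work) {
#ifdef _OPENMP
    // One OpenMP thread per backend; schedule(static, 1) pins one iteration per thread.
    #pragma omp parallel for num_threads(n_backends) schedule(static, 1)
    for (int b = 0; b < n_backends; ++b) {
        work(b);       // e.g. evaluate this backend's portion of the split graph
    }
#else
    std::vector<std::thread> workers;
    workers.reserve(n_backends);
    for (int b = 0; b < n_backends; ++b) {
        workers.emplace_back(work, b);
    }
    for (auto & w : workers) {
        w.join();      // barrier: all backends must finish before we continue
    }
#endif
}
```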

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow
2025-12-27 08:18:06 +01:00
committed by GitHub
parent 7146de451d
commit 519405dc97
10 changed files with 321 additions and 132 deletions

View File

@@ -678,9 +678,6 @@ ggml_tensor * llm_build_context::llm_build_ffn(
auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
GGML_ASSERT(norm->splits[id]);
if (is_norm) {
//cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM, cb, il);
//GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
//cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
cur = ggml_fused_norm(ctx, cur, norm->splits[id], lctx.model.hparams.f_norm_eps);
} else {
cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
@@ -9389,9 +9386,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
auto cur = get_input_tensor_sm_graph(input, id);
if (attn_norm) {
if (is_norm) {
//cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM, cb, il);
//GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
//cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
cur = ggml_fused_norm(ctx0, cur, attn_norm->splits[id], lctx.model.hparams.f_norm_eps);
} else {
cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);

View File

@@ -42,6 +42,7 @@ struct llama_cparams {
bool k_cache_hadamard;
bool split_mode_graph_scheduling;
bool split_mode_f16;
bool scheduler_async;
int min_experts;
float thresh_experts;

View File

@@ -4056,6 +4056,7 @@ struct llama_context_params llama_context_default_params() {
/*.k_cache_hadamard =*/ false,
/*.split_mode_graph_scheduling =*/ false,
/*.split_mode_f16 =*/ true,
/*.scheduler_async =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
/*.offload_policy =*/ nullptr,
@@ -4346,6 +4347,7 @@ struct llama_context * llama_new_context_with_model(
cparams.k_cache_hadamard = params.k_cache_hadamard;
cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
cparams.split_mode_f16 = params.split_mode_f16;
cparams.scheduler_async = params.scheduler_async;
cparams.min_experts = params.min_experts;
cparams.thresh_experts = params.thresh_experts;
cparams.cuda_params = params.cuda_params;
@@ -4436,6 +4438,7 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n", __func__, cparams.k_cache_hadamard);
LLAMA_LOG_INFO("%s: split_mode_graph_scheduling = %d\n", __func__, cparams.split_mode_graph_scheduling);
LLAMA_LOG_INFO("%s: split_mode_f16= %d\n", __func__, cparams.split_mode_f16);
LLAMA_LOG_INFO("%s: sched_async = %d\n", __func__, cparams.scheduler_async);
LLAMA_LOG_INFO("%s: ser = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@@ -4780,13 +4783,13 @@ struct llama_context * llama_new_context_with_model(
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
}
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
ggml_backend_sched_set_split_mode_graph(ctx->sched, true, cparams.scheduler_async);
ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
if (model->has_tensor_overrides() && cparams.split_mode_graph_scheduling) {
LLAMA_LOG_INFO("XXXXXXXX Split Mode Graph Scheduling is FORCED despite tensor overrides due to user choice.\n");
LLAMA_LOG_INFO("XXXXXXXX It may or might NOT infer properly due to unsupported combinations between SMGS and every possible tensor overrides.\n");
}
}
}
return ctx;
}
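
For completeness, a hedged usage sketch of the new `scheduler_async` context
parameter shown in the diff above. The helper name and the model pointer are
illustrative; the field name, its false default, and
llama_new_context_with_model are taken from this commit.

```cpp
#include "llama.h"

// Create a context with the new async graph evaluation enabled
// (it defaults to false, per this commit).
static llama_context * make_async_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.scheduler_async = true;  // field added by this commit
    return llama_new_context_with_model(model, cparams);
}
```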