diff --git a/common/common.cpp b/common/common.cpp
index df5521e7..e695375c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1267,6 +1267,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    else if (arg == "--max-gpu") {
+        CHECK_ARG
+        params.max_gpu = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--split-mode" || arg == "-sm") {
         CHECK_ARG
         std::string arg_next = argv[i];
@@ -2265,6 +2270,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
     options.push_back({ "*",           "-mg, --main-gpu i",             "the GPU to use for the model (with split-mode = none),\n"
                                                                         "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+    options.push_back({ "*",           "--max-gpu i",                   "max. number of GPUs to use at a time with split mode 'graph' (default: %d)", params.max_gpu });
     }

     options.push_back({ "model" });
@@ -2973,6 +2979,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.mla          = params.mla_attn;
     mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
+    mparams.max_gpu      = params.max_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
@@ -4173,6 +4180,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
diff --git a/common/common.h b/common/common.h
index e23e18b3..1ac4d136 100644
--- a/common/common.h
+++ b/common/common.h
@@ -154,6 +154,7 @@ struct gpt_params {
     int32_t n_gpu_layers       = -1;  // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;  // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;   // the GPU that is used for scratch and small tensors
+    int32_t max_gpu            = 0;   // max number of GPUs to use at a time for split mode "graph"
     float   tensor_split[128]  = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n         = 1;   // group-attention factor
     int32_t grp_attn_w         = 512; // group-attention width
diff --git a/include/llama.h b/include/llama.h
index b425bc87..3bb72fe4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -362,6 +362,7 @@ extern "C" {
         // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
         // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
+        int32_t max_gpu;

         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 227d94e6..2f8794cc 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -268,15 +268,14 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
     }
     while (sum > nchunk) {
         last_split = 0;
-        float best_err = std::numeric_limits<float>::max();
+        float best_err = 0;
         int ibest = -1;
         for (int i = 0; i < (int)splits.size(); ++i) {
             if (result[i] > 0) {
                float p = splits[i] - last_split;
                float n_want = p*nchunk;
-                float err = std::abs(n_want - result[i] + 1);
-                //float err = std::abs(n_want - result[i] + 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk;
-                if (err < best_err) {
+                float err = result[i] - n_want;
+                if (err > best_err) {
                     best_err = err; ibest = i;
                 }
             }
@@ -288,14 +287,13 @@

     while (sum < nchunk) {
         last_split = 0;
-        float best_err = std::numeric_limits<float>::max();
+        float best_err = 0;
         int ibest = -1;
         for (int i = 0; i < (int)splits.size(); ++i) {
             float p = splits[i] - last_split;
             float n_want = p*nchunk;
-            float err = std::abs(n_want - result[i] - 1);
-            //float err = std::abs(n_want - result[i] - 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk;
-            if (err < best_err) {
+            float err = n_want - result[i];
+            if (err > best_err) {
                 best_err = err; ibest = i;
             }
             last_split = splits[i];
@@ -2804,6 +2802,29 @@ static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor
     }
 }

+static void adjust_split(std::vector<float> & split, const std::vector<size_t> & mem_used, int max_gpu) {
+    if (max_gpu < 1 || max_gpu >= int(split.size()) || split.size() != mem_used.size()) {
+        return;
+    }
+    size_t tot_mem_used = 1;
+    for (auto & mem : mem_used) tot_mem_used += mem;
+    for (int i = split.size() - 1; i > 0; --i) split[i] -= split[i-1];
+    std::vector<std::pair<float,int>> sorted(split.size());
+    for (int i = 0; i < int(split.size()); ++i) {
+        float mem_ideal = split[i]*tot_mem_used;
+        float err = mem_ideal - mem_used[i];
+        sorted[i] = {err, i};
+    }
+    std::partial_sort(sorted.begin(), sorted.begin() + max_gpu, sorted.end(), std::greater<std::pair<float,int>>{});
+    for (auto & p : split) p = 0;
+    for (int j = 0; j < max_gpu; ++j) split[sorted[j].second] = 1;
+    float sum = 0;
+    for (auto & p : split) {
+        sum += p/max_gpu;
+        p = sum;
+    }
+}
+
 bool create_tensors_helper::create_tensors() {
     const auto tn = LLM_TN(model.arch);
     bool use_mmap_buffer = true;
@@ -2936,23 +2957,36 @@ bool create_tensors_helper::create_tensors() {
             throw std::runtime_error("unknown architecture");
     }
     if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
+        printf("================================ max_gpu = %d\n", model.max_gpu);
         std::vector<size_t> mem_used(model.splits.size(), 0);
         const auto & hparams = model.hparams;
         int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
-        //printf("GQA ratio: %d\n", gqa_ratio);
+        auto cur_splits = model.splits;
+        int adjust_step = std::max(1, int(model.layers.size() / (2*model.splits.size())));
         for (int il = 0; il < int(model.layers.size()); ++il) {
             if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
                 LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il);
                 continue;
             }
+            if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
+                cur_splits = model.splits;
+                adjust_split(cur_splits, mem_used, model.max_gpu);
+                printf("Adjusted split at layer %2d:", il);
+                float last_split = 0;
+                for (auto & p : cur_splits) {
+                    printf(" %g", p - last_split);
+                    last_split = p;
+                }
+                printf("\n");
+            }
             auto & layer = model.layers[il];
             auto ctx_split = ctx_for_layer_split(il);
             if (layer.attn_norm) {
-                auto split = create_split(ggml_nrows(layer.attn_norm), -1, model.splits, mem_used);
+                auto split = create_split(ggml_nrows(layer.attn_norm), -1, cur_splits, mem_used);
                 prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, split, mem_used);
             }
             if (layer.rope_freqs) {
-                auto split = create_split(ggml_nrows(layer.rope_freqs), -1, model.splits, mem_used);
+                auto split = create_split(ggml_nrows(layer.rope_freqs), -1, cur_splits, mem_used);
                 prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used);
             }
             if (layer.wo && layer.wq && layer.wk && layer.wv) {
@@ -2962,7 +2996,7 @@
                     if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
                 }
                 GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
-                auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits, mem_used);
+                auto split = create_split(layer.wo->ne[0], attn_granularity, cur_splits, mem_used);
                 prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used);
                 if (layer.bo) {
@@ -2990,7 +3024,7 @@

             if (layer.ffn_norm) {
                 if (auto it = split_tensors.find(layer.ffn_norm); it != split_tensors.end()) {
-                    auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits, mem_used);
+                    auto split = create_split(ggml_nrows(layer.ffn_norm), -1, cur_splits, mem_used);
                     prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used);
                 }
             }
@@ -3005,7 +3039,7 @@
                     auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
                     if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                 }
-                auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits, mem_used);
+                auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, cur_splits, mem_used);
                 prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split, mem_used);
@@ -3024,7 +3058,7 @@
                     auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
                     if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                 }
-                auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits, mem_used);
+                auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
                 prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
@@ -3043,7 +3077,7 @@
                     auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
                     if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                 }
-                auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits, mem_used);
+                auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
                 //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n");
                 prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
@@ -3053,13 +3087,13 @@

             if (layer.ffn_gate_inp) {
                 if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) {
-                    auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits, mem_used);
+                    auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, cur_splits, mem_used);
                     prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used);
                 }
             }
             if (layer.ffn_exp_probs_b) {
                 if (auto it = split_tensors.find(layer.ffn_exp_probs_b); it != split_tensors.end()) {
-                    auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits, mem_used);
+                    auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, cur_splits, mem_used);
                     prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used);
                 }
             }
diff --git a/src/llama-model.h b/src/llama-model.h
index d6188721..a252ab5e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -350,6 +350,7 @@ struct llama_model {

     llama_split_mode split_mode;
     int main_gpu;
+    int max_gpu = 0; // max. number of GPUs to use per layer for split mode "graph"
     int n_gpu_layers;

     std::vector<std::string> rpc_servers;
diff --git a/src/llama.cpp b/src/llama.cpp
index d0b42cca..e75194aa 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1742,6 +1742,7 @@ static bool llm_load_tensors(
         int mla_attn,
         enum llama_split_mode split_mode,
         int main_gpu,
+        int max_gpu,
         const float * tensor_split,
         bool use_mlock,
         bool validate_quants,
@@ -1763,6 +1764,7 @@ static bool llm_load_tensors(

     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
+    model.max_gpu = max_gpu;
     model.n_gpu_layers = n_gpu_layers;

     const int n_layer = hparams.n_layer;
@@ -2138,7 +2140,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif

     if (!llm_load_tensors(
-        ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
+        ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
         params.use_mlock, params.validate_quants, params.progress_callback, params.progress_callback_user_data
     )) {

@@ -3985,6 +3987,7 @@ struct llama_model_params llama_model_default_params() {
        /*.mla                =*/ 0,
        /*.split_mode         =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu           =*/ 0,
+       /*.max_gpu            =*/ 0,
        /*.tensor_split       =*/ nullptr,
        /*.rpc_servers        =*/ nullptr,
        /*.progress_callback  =*/ nullptr,
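
Note for reviewers: below is a minimal standalone sketch of the device-selection idea that adjust_split() implements, assuming split holds cumulative fractions (as consumed by create_split()) and mem_used holds the bytes already placed on each GPU. The helper name adjust_split_sketch and the example values in main() are illustrative only, not taken from the patch.

// Sketch: keep only the max_gpu devices whose memory use lags furthest behind
// their ideal share, then rewrite the cumulative split so the next layer is
// divided equally among them.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static void adjust_split_sketch(std::vector<float> & split, const std::vector<size_t> & mem_used, int max_gpu) {
    if (max_gpu < 1 || max_gpu >= int(split.size()) || split.size() != mem_used.size()) return;
    size_t tot_mem_used = 1; // start at 1 so the very first layer does not divide by zero
    for (auto mem : mem_used) tot_mem_used += mem;
    // cumulative fractions -> per-GPU fractions
    for (int i = int(split.size()) - 1; i > 0; --i) split[i] -= split[i-1];
    // rank GPUs by (ideal share of total memory) - (memory actually used so far)
    std::vector<std::pair<float,int>> deficit(split.size());
    for (int i = 0; i < int(split.size()); ++i) {
        deficit[i] = { split[i]*tot_mem_used - float(mem_used[i]), i };
    }
    std::partial_sort(deficit.begin(), deficit.begin() + max_gpu, deficit.end(),
                      std::greater<std::pair<float,int>>{});
    // give the layer to the max_gpu most under-filled GPUs in equal parts,
    // then convert back to cumulative form
    for (auto & p : split) p = 0.f;
    for (int j = 0; j < max_gpu; ++j) split[deficit[j].second] = 1.f;
    float sum = 0.f;
    for (auto & p : split) { sum += p/max_gpu; p = sum; }
}

int main() {
    std::vector<float> split = {0.25f, 0.50f, 0.75f, 1.00f}; // target: four equal shares
    std::vector<size_t> mem_used = {400, 100, 300, 200};     // bytes placed so far (made-up numbers)
    adjust_split_sketch(split, mem_used, /*max_gpu =*/ 2);
    for (auto p : split) printf(" %g", p);                   // prints: 0 0.5 0.5 1 -> GPUs 1 and 3 get this layer
    printf("\n");
    return 0;
}

Because the selection always favors the GPUs that are furthest below their target share, total memory use should still drift towards the configured tensor_split proportions over the whole model, even though each individual layer is only split across max_gpu devices and the selection is re-evaluated only every adjust_step layers.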