diff --git a/common/common.cpp b/common/common.cpp index 942b4150..5bdd01b7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1242,12 +1242,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } - else if (arg_next == "row") { - //fprintf(stderr, "\n\n=====================================================================================\n"); - //fprintf(stderr, " Split mode row is no longer supported\n"); - //fprintf(stderr, "=====================================================================================\n\n\n"); - //GGML_ABORT("fatal error"); - params.split_mode = LLAMA_SPLIT_MODE_ROW; + else if (arg_next == "graph") { + params.split_mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; @@ -2220,6 +2216,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n" " - none: use one GPU only\n" + " - graph: split model tensors and computation graph across GPUs\n" " - layer (default): split layers and KV across GPUs\n" }); options.push_back({ "*", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1" }); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3de656ad..f0f62d46 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -217,7 +217,7 @@ static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; + case LLAMA_SPLIT_MODE_GRAPH: return "graph"; default: GGML_ABORT("invalid split mode"); } } @@ -630,13 +630,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; - //fprintf(stderr, "\n\n=======================================================================\n"); - //fprintf(stderr, "Split mode 'row' is no longer supported\n"); - //fprintf(stderr, "=======================================================================\n\n\n"); - //invalid_param = true; - //break; + } else if (m == "graph") { + mode = LLAMA_SPLIT_MODE_GRAPH; } else { invalid_param = true; break; diff --git a/include/llama.h b/include/llama.h index 7682951e..ed3f67e9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -269,7 +269,7 @@ extern "C" { enum llama_split_mode { LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_GRAPH = 2, // splits computations across GPUs }; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index a2a9c327..4d5f2a76 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -230,7 +230,7 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod printf(" Oops: null buft for debvice %d\n", device); } } - if (model.split_mode == LLAMA_SPLIT_MODE_ROW) { + if (model.split_mode == 
LLAMA_SPLIT_MODE_GRAPH) { printf("model.splits:"); for (auto s : model.splits) printf(" %g", s); printf("\n"); @@ -305,7 +305,7 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std } if (actual_context) *actual_context = ctx; auto tensor = ml.create_tensor(ctx, name, ne, flags); - //if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) { + //if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { // int i_layer = -1; // if (auto pos = name.find("blk."); pos == 0) { // GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1); @@ -2929,7 +2929,7 @@ bool create_tensors_helper::create_tensors() { default: throw std::runtime_error("unknown architecture"); } - if (model.split_mode == LLAMA_SPLIT_MODE_ROW) { + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { std::vector mem_used(model.splits.size(), 0); const auto & hparams = model.hparams; int gqa_ratio = hparams.n_head() / hparams.n_head_kv(); diff --git a/src/llama.cpp b/src/llama.cpp index 4a7b00b3..6eb2d820 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -648,7 +648,7 @@ static bool llama_kv_cache_init( } bool split_cache = false; - if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) { + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.arch != LLM_ARCH_DEEPSEEK2 && offload) { cache.split_k_l.reserve(n_layer); cache.split_v_l.reserve(n_layer); split_cache = true; @@ -1785,7 +1785,7 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) { + if (split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) { split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data()); } else { // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported @@ -4424,7 +4424,7 @@ struct llama_context * llama_new_context_with_model( } #elif 
defined(GGML_USE_CUDA) if (model->split_mode == LLAMA_SPLIT_MODE_NONE) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // with split_mode LLAMA_SPLIT_MODE_NONE, only the main GPU backend is used ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu, cparams.cuda_params); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -4434,7 +4434,7 @@ struct llama_context * llama_new_context_with_model( ggml_backend_add_from_device(ctx, backend); } else { - // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_ROW require a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device, cparams.cuda_params); if (backend == nullptr) { @@ -4446,7 +4446,7 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_VULKAN) - if (model->split_mode == LLAMA_SPLIT_MODE_ROW) { + if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { LLAMA_LOG_ERROR("%s: Row split not supported. 
Failed to initialize Vulkan backend\n", __func__); llama_free(ctx); return nullptr; @@ -4471,8 +4471,8 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_SYCL) - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); @@ -4503,9 +4503,9 @@ struct llama_context * llama_new_context_with_model( ggml_backend_add_from_device(ctx, backend); } #elif defined(GGML_USE_CANN) - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used // TODO: ggml_backend_cann is not support split tensor now, just leave code here. - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);