Rename split mode "row" to split mode "graph"
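The multi-GPU split mode previously selected with -sm row is renamed to "graph": the CLI keyword, the llama_split_mode enum entry, and every internal check move from LLAMA_SPLIT_MODE_ROW to LLAMA_SPLIT_MODE_GRAPH, while the numeric value (2) stays the same.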
@@ -1276,12 +1276,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else if (arg_next == "layer") {
             params.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }
-        else if (arg_next == "row") {
-            //fprintf(stderr, "\n\n=====================================================================================\n");
-            //fprintf(stderr, " Split mode row is no longer supported\n");
-            //fprintf(stderr, "=====================================================================================\n\n\n");
-            //GGML_ABORT("fatal error");
-            params.split_mode = LLAMA_SPLIT_MODE_ROW;
+        else if (arg_next == "graph") {
+            params.split_mode = LLAMA_SPLIT_MODE_GRAPH;
         }
         else {
             invalid_param = true;

@@ -2249,6 +2245,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
         "how to split the model across multiple GPUs, one of:\n"
         " - none: use one GPU only\n"
+        " - graph: split model tensors and computation graph across GPUs\n"
         " - layer (default): split layers and KV across GPUs\n" });
     options.push_back({ "*", "-ts, --tensor-split SPLIT",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });

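With the help text updated, a typical invocation becomes, e.g., -sm graph -ts 3,1. Passing "row" is no longer matched and falls through to the invalid-parameter branch shown in the first hunk.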
@@ -217,7 +217,7 @@ static const char * split_mode_str(llama_split_mode mode) {
     switch (mode) {
         case LLAMA_SPLIT_MODE_NONE: return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
-        case LLAMA_SPLIT_MODE_ROW: return "row";
+        case LLAMA_SPLIT_MODE_GRAPH: return "graph";
         default: GGML_ABORT("invalid split mode");
     }
 }

@@ -630,13 +630,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 mode = LLAMA_SPLIT_MODE_NONE;
             } else if (m == "layer") {
                 mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (m == "row") {
-                mode = LLAMA_SPLIT_MODE_ROW;
-                //fprintf(stderr, "\n\n=======================================================================\n");
-                //fprintf(stderr, "Split mode 'row' is no longer supported\n");
-                //fprintf(stderr, "=======================================================================\n\n\n");
-                //invalid_param = true;
-                //break;
+            } else if (m == "graph") {
+                mode = LLAMA_SPLIT_MODE_GRAPH;
             } else {
                 invalid_param = true;
                 break;

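The same string-to-enum mapping is now spelled out twice: in the common argument parser (first hunk) and in llama-bench (hunk above). A minimal sketch of a shared helper both call sites could use; split_mode_from_str is hypothetical and not part of this commit:

    #include <optional>
    #include <string>
    #include "llama.h"

    // Hypothetical consolidation of the two "-sm" parsers touched above.
    static std::optional<llama_split_mode> split_mode_from_str(const std::string & s) {
        if (s == "none")  { return LLAMA_SPLIT_MODE_NONE;  }
        if (s == "layer") { return LLAMA_SPLIT_MODE_LAYER; }
        if (s == "graph") { return LLAMA_SPLIT_MODE_GRAPH; }
        return std::nullopt; // caller flags invalid_param, as both parsers do today
    }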
@@ -275,7 +275,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_GRAPH = 2, // split model tensors and computation graph across GPUs
     };
 
 

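Because the numeric value is unchanged, existing bindings keep working; only the identifier and the CLI keyword are new. A minimal sketch of selecting the renamed mode through the public API, assuming this fork's llama_load_model_from_file loader ("model.gguf" is an illustrative path):

    #include "llama.h"

    int main() {
        // The rename keeps the enum value, so configs built against
        // LLAMA_SPLIT_MODE_ROW (= 2) still select the same mode.
        static_assert(LLAMA_SPLIT_MODE_GRAPH == 2, "split mode value changed");

        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode   = LLAMA_SPLIT_MODE_GRAPH; // was LLAMA_SPLIT_MODE_ROW
        mparams.n_gpu_layers = 999;                    // offload all layers

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) { return 1; }
        llama_free_model(model);
        return 0;
    }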
@@ -230,7 +230,7 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
             printf(" Oops: null buft for device %d\n", device);
         }
     }
-    if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
+    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         printf("model.splits:");
         for (auto s : model.splits) printf(" %g", s);
         printf("\n");

@@ -305,7 +305,7 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
     }
     if (actual_context) *actual_context = ctx;
     auto tensor = ml.create_tensor(ctx, name, ne, flags);
-    //if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_ROW) {
+    //if (tensor && requested_ctx == ctx && model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
     //    int i_layer = -1;
     //    if (auto pos = name.find("blk."); pos == 0) {
     //        GGML_ASSERT(sscanf(name.c_str(), "blk.%d.", &i_layer) == 1);

@@ -2929,7 +2929,7 @@ bool create_tensors_helper::create_tensors() {
         default:
             throw std::runtime_error("unknown architecture");
     }
-    if (model.split_mode == LLAMA_SPLIT_MODE_ROW) {
+    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         std::vector<size_t> mem_used(model.splits.size(), 0);
         const auto & hparams = model.hparams;
         int gqa_ratio = hparams.n_head() / hparams.n_head_kv();

@@ -635,7 +635,7 @@ static bool llama_kv_cache_init(
     }
 
     bool split_cache = false;
-    if (model.split_mode == LLAMA_SPLIT_MODE_ROW && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
+    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
         cache.split_k_l.reserve(n_layer);
         cache.split_v_l.reserve(n_layer);
         split_cache = true;

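In graph mode the KV cache is thus allocated as per-device splits (split_k_l / split_v_l), except for DeepSeek2 models or when offloading is disabled, which keep the single per-layer cache.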
@@ -1772,7 +1772,7 @@ static bool llm_load_tensors(
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW && model.splits.size() > 1) {
+        if (split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) {
             split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], model.splits.data());
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported

@@ -4404,7 +4404,7 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_CUDA)
     if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
         ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu, cparams.cuda_params);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);

@@ -4414,7 +4414,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_backend_add_from_device(ctx, backend);
 
     } else {
-        // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_ROW require a backend for each GPU
+        // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
         for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
             ggml_backend_t backend = ggml_backend_cuda_init(device, cparams.cuda_params);
             if (backend == nullptr) {

@@ -4426,7 +4426,7 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 #elif defined(GGML_USE_VULKAN)
-    if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+    if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
         llama_free(ctx);
         return nullptr;

@@ -4451,8 +4451,8 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 #elif defined(GGML_USE_SYCL)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);

@@ -4483,9 +4483,9 @@ struct llama_context * llama_new_context_with_model(
         ggml_backend_add_from_device(ctx, backend);
     }
 #elif defined(GGML_USE_CANN)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
     // TODO: ggml_backend_cann does not support split tensors yet; just leave the code here.
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
         ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
         if (backend == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);

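Taken together, the context-setup hunks give CUDA one main-GPU backend for "none" and one backend per device for "layer" and "graph", while SYCL and CANN treat "graph" like "none" (main GPU only) and Vulkan rejects it outright. A condensed sketch of the CUDA mapping; n_cuda_backends is illustrative and not part of the commit:

    #include "llama.h"

    // Illustrative only: how many CUDA backends each split mode requests,
    // following the llama_new_context_with_model hunks above.
    static int n_cuda_backends(llama_split_mode mode, int n_devices) {
        switch (mode) {
            case LLAMA_SPLIT_MODE_NONE:  return 1;          // main GPU only
            case LLAMA_SPLIT_MODE_LAYER: return n_devices;  // one backend per device
            case LLAMA_SPLIT_MODE_GRAPH: return n_devices;  // one backend per device
            default:                     return 0;
        }
    }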