diff --git a/common/common.cpp b/common/common.cpp
index b6638cc5..7ad73b9e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -200,6 +200,9 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
 common_webui common_webui_from_name(const std::string& format) {
     if (format == "none") {
         return COMMON_WEBUI_NONE;
@@ -224,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == " " || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
@@ -1066,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1213,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
     if (arg == "-v" || arg == "--verbose") {
         params.verbosity = 1;
         return true;
@@ -2002,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                         "  - row: split rows across GPUs" });
     options.push_back({ "*",           "-ts,  --tensor-split SPLIT",   "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+    options.push_back({ "*",           "-dev, --device dev1,dev2",
+                                                                        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+                                                                        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+    options.push_back({ "*",           "-devd, --device-draft dev1,dev2",
+                                                                        "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+                                                                        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
     options.push_back({ "*",           "-mg,  --main-gpu i",           "the GPU to use for the model (with split-mode = none),\n"
                                                                         "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
 }
@@ -2575,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
-    
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return iparams;
@@ -2692,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector
reverse prompts) std::vector kv_overrides; @@ -440,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path); std::string fs_get_cache_directory(); std::string fs_get_cache_file(const std::string & filename); + // // Model utils // diff --git a/common/speculative.cpp b/common/speculative.cpp index 5b4f8323..326a2df3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -91,10 +91,10 @@ bool llama_speculative_are_compatible( const struct llama_vocab * vocab_dft = llama_get_model_vocab(model_dft); const bool vocab_type_tgt = llama_vocab_type(model_tgt); - LLAMA_LOG_INFO("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); + LLAMA_LOG_DEBUG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); const bool vocab_type_dft = llama_vocab_type(model_dft); - LLAMA_LOG_INFO("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); + LLAMA_LOG_DEBUG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { LLAMA_LOG_INFO("%s: draft model vocab type must match target model to use speculation but ", __func__); @@ -203,13 +203,13 @@ std::vector llama_speculative_gen_draft( std::string text; text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true); text = replace_to_dft(spec, text); - LLAMA_LOG_INFO("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str()); + LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str()); prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true); // convert id_last to draft vocab std::vector id_last_vec(1, id_last); text = llama_detokenize(ctx_tgt, id_last_vec); - LLAMA_LOG_INFO("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str()); + LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str()); id_last = llama_tokenize(ctx_dft, text, false, true)[0]; } // prompt_tgt's tokens will always be compatible with ctx_dft @@ -233,8 +233,7 @@ std::vector llama_speculative_gen_draft( reuse_n = cur; } } - - LLAMA_LOG_INFO("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size()); + LLAMA_LOG_DEBUG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size()); std::vector result; result.reserve(params.n_draft); @@ -344,7 +343,7 @@ std::vector llama_speculative_gen_draft( if (!spec->vocab_dft_compatible) { std::string detokenized = llama_detokenize(ctx_dft, result, true); detokenized = replace_to_tgt(spec, detokenized); - LLAMA_LOG_INFO("draft->main detokenized string: '%s'\n", detokenized.c_str()); + LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str()); result = llama_tokenize(ctx_tgt, detokenized, false, true); if (result.size() > (size_t)params.n_draft) { result.resize(params.n_draft); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bc59480e..eb7740b6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1249,6 +1249,7 @@ struct server_context { LOG_INFO("loading draft model", {{"model", params.model_draft}}); gpt_params params_dft; + params_dft.devices = params.devices_draft; params_dft.model = params.model_draft; params_dft.n_ctx = params.n_ctx_draft == 0 ? 
             params_dft.n_gpu_layers = params.n_gpu_layers_draft;
@@ -1273,7 +1274,7 @@ struct server_context {
             cparams_dft = llama_context_params_from_gpt_params(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            
+
             model_draft = llama_init_dft.model;
             ctx_draft   = llama_init_dft.context;
         }
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 3063f0b6..cb88ea1f 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -71,6 +71,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.devices_draft;
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
     if (params.n_threads_draft > 0) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05964db1..2ed282e2 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <cctype>
 #include
 #include
 #include
@@ -528,6 +529,16 @@ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn ini
     ggml_backend_registry_count++;
 }
 
+// Backend (reg) enumeration
+static bool striequals(const char* a, const char* b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_get_count(void) {
     ggml_backend_registry_init();
 
@@ -539,7 +550,7 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
 
     for (size_t i = 0; i < ggml_backend_registry_count; i++) {
         // TODO: case insensitive in a portable way
-        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
+        if (striequals(ggml_backend_registry[i].name, name)) {
             return i;
         }
     }
diff --git a/include/llama.h b/include/llama.h
index 504f9280..046ac9c4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -342,6 +342,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // comma separated list of devices to use for offloading
+        const char* devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
diff --git a/src/llama-context.h b/src/llama-context.h
index 056be200..b771d459 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -202,4 +202,7 @@ struct llama_context {
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
     struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
+
+    ggml_backend_t ggml_backend_by_name(const char * name);
+
 };
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 528184c8..10a777f9 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -12,6 +12,9 @@ struct llama_cparams {
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
+    std::vector<std::string> devices;
+    std::vector<std::string> devices_draft;
+
     float rope_freq_base;
     float rope_freq_scale;
 
diff --git a/src/llama-impl.h b/src/llama-impl.h
index ac76e23b..36d92db8 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -38,6 +38,11 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG , __VA_ARGS__)
+#ifdef NDEBUG
+// Release mode - make LLAMA_LOG_DEBUG a no-op
+#define LLAMA_LOG_DEBUG(...) ((void)0)
+#else
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#endif
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
diff --git a/src/llama-model.h b/src/llama-model.h
index 8446be98..a26c7cb3 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -264,6 +264,7 @@ struct llama_model {
     int n_gpu_layers;
 
     std::vector<std::string> rpc_servers;
+    std::vector<int> devices;
 
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
diff --git a/src/llama.cpp b/src/llama.cpp
index 2cce5384..d3ca2ace 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,39 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+
+static std::vector<std::string> llama_string_split(const std::string& str, const std::string& delimiter) {
+    std::vector<std::string> parts;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        parts.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    parts.push_back(str.substr(start));
+
+    return parts;
+
+}
+
+// extract ip and port from RPC[ip:port] for rpc and keep other device names
+static std::vector<std::string> extract_ip_from_rpc_device(std::vector<std::string> devices) {
+    std::vector<std::string> rpc_servers;
+    std::regex pattern("RPC\\[(.*?)\\]");
+    std::smatch matches;
+    for (auto device : devices) {
+        if (std::regex_search(device, matches, pattern)) {
+            rpc_servers.push_back(matches[1]);
+        } else {
+            rpc_servers.push_back(device);
+        }
+    }
+    return rpc_servers;
+}
+
 enum llm_chat_template {
     LLM_CHAT_TEMPLATE_CHATML,
     LLM_CHAT_TEMPLATE_LLAMA_2,
@@ -1501,6 +1534,46 @@ static void llm_prepare_mla(llama_model & model, int mla) {
     ggml_free(ctx);
 }
 
+// Backend (reg) enumeration
+static bool striequals(const char* a, const char* b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
+ggml_backend_t llama_context::ggml_backend_by_name(const char* name) {
+    for (auto backend : backends) {
+        const char* backend_name = ggml_backend_name(backend);
+        if (striequals(backend_name, name)) {
+            return backend;
+        }
+    }
+    return nullptr;
+}
+
+static bool item_in_list(const std::vector<std::string>& devices, const char* name) {
+    for (auto& device : devices) {
+        if (striequals(device.c_str(), name)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t backend) {
+    const char* name = ggml_backend_name(backend);
+    if (ctx->cparams.devices.size()) {
+        if (item_in_list(ctx->cparams.devices, name)) {
+            ctx->backends.push_back(backend);
+        }
+    } else {
+        ctx->backends.push_back(backend);
+    }
+}
+
 // Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
         llama_model_loader & ml,
@@ -1538,13 +1611,14 @@ static bool llm_load_tensors(
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count(model);
+        // int device_count = llama_get_device_count(model);
+        int device_count = model.devices.size();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(model, i);
+                splits[i] = llama_get_device_memory(model, model.devices[i]);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -1564,35 +1638,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(model, main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         }
         // assign the repeating layers
         for (int i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(model, main_gpu)
+                llama_default_buffer_type_offload(model, model.devices[main_gpu])
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(model, main_gpu)
+                llama_default_buffer_type_offload(model, model.devices[main_gpu])
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -3696,6 +3770,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices                     =*/ nullptr,
         /*.n_gpu_layers                =*/ 0,
         /*.mla                         =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
@@ -3859,6 +3934,17 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
+static int32_t find_device_idx(const std::string& str) {
+    std::regex pattern(R"((\d+)$)"); // Match digits at the end
+    std::smatch matches;
+    int number = -1;
+    if (std::regex_search(str, matches, pattern)) {
+        number = std::stoi(matches[1]);
+    }
+    return number;
+}
+
+
 struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_model_params params) {
@@ -3882,16 +3968,87 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
-        // split the servers set them into model->rpc_servers
-        std::string servers(params.rpc_servers);
-        size_t pos = 0;
-        while ((pos = servers.find(",")) != std::string::npos) {
-            std::string server = servers.substr(0, pos);
-            model->rpc_servers.push_back(server);
-            servers.erase(0, pos + 1);
+
+    // model->devices hold device indices that are used to offload
+    // use model->devices to determine offload device
+    // if no device is specified, all device are included
+    // if device is specified, only those in the devices are included in the model->devices
+    std::vector<std::string> params_devices = {};
+    if (params.devices != nullptr && !striequals(params.devices, "")) {
+        params_devices = llama_string_split(params.devices, ",");
+        params_devices = extract_ip_from_rpc_device(params_devices);
+    }
+
+    int32_t idx = 0;
+    if (params_devices.size()) {
+        // just the number of GPU on host machine since we have not added any RPC backend
+        int dev_count = (int)llama_get_device_count(*model);
+        // list all buffer type names
+        std::vector<std::string> buffer_names = {};
+        for (int i = 0; i < dev_count; i++) {
+            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
+            const char* name = ggml_backend_buft_name(buft);
+            buffer_names.push_back(std::string(name));
+        }
+
+        // add if device matches backend buffer type
+        for (auto device : params_devices) {
+            if (item_in_list(buffer_names, device.c_str())) {
+                idx = find_device_idx(device);
+                model->devices.push_back(idx);
+            } else {
+                LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
+            }
+
+        }
+    } else {
+        // add all backend buffer to device
+        // just the number of GPU on host machine since we have not added any RPC backend
+        int dev_count = (int)llama_get_device_count(*model);
+        for (idx = 0; idx < dev_count; idx++) {
+            model->devices.push_back(idx);
+        }
+    }
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
+        if (params_devices.size()) {
+            // just the number of GPU on host machine since we have not added any RPC backend
+            idx = (int)llama_get_device_count(*model);
+            // split the servers set them into model->rpc_servers
+            std::vector<std::string> rpc_servers = llama_string_split(params.rpc_servers, ",");
+            for (auto device : params_devices) {
+                if (item_in_list(rpc_servers, device.c_str())) {
+                    model->rpc_servers.push_back(device);
+                    model->devices.push_back(idx);
+                    idx++;
+                } else {
+                    LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
+                }
+            }
+        } else {
+            // just number of GPU on host machine since we have not added any RPC backend
+            idx = (int)llama_get_device_count(*model);
+            model->rpc_servers = llama_string_split(params.rpc_servers, ",");
+            for (auto rpc : model->rpc_servers) {
+                model->devices.push_back(idx);
+                idx++;
+            }
+        }
+    }
+    // no gpu used, so set layers offload to be 0
+    if (!model->devices.size()) {
+        params.n_gpu_layers = 0;
+        LLAMA_LOG_INFO("CPU: using device CPU\n");
+    } else {
+        for (auto i : model->devices) {
+            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
+            const char* name = ggml_backend_buft_name(buft);
+            const char* description = name;
+            size_t description_size = llama_get_device_memory(*model, i);
+            LLAMA_LOG_INFO("%s: using device %s - %zu MiB free\n",
+                name, description,
+                description_size / 1024 / 1024);
         }
-        model->rpc_servers.push_back(servers);
     }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
@@ -3948,9 +4105,18 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model);
 
+    // add devices to ctx->cparams from model
+    for (int i : model->devices) {
+        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
+        const char* name = ggml_backend_buft_name(buft);
+        std::string device(name);
+        ctx->cparams.devices.push_back(device);
+    }
+
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
+
     cparams.n_seq_max        = std::max(1u, params.n_seq_max);
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
@@ -4077,7 +4243,6 @@ struct llama_context * llama_new_context_with_model(
     GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
 
-
     if (!hparams.vocab_only) {
         // initialize backends
 #if defined(GGML_USE_METAL)
@@ -4088,7 +4253,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(ctx->backend_metal);
+            ggml_backend_add_from_device(ctx, ctx->backend_metal);
         }
 #elif defined(GGML_USE_CUDA)
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
@@ -4099,7 +4264,8 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ggml_backend_add_from_device(ctx, backend);
+
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
@@ -4109,7 +4275,8 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ggml_backend_add_from_device(ctx, backend);
+
             }
         }
 #elif defined(GGML_USE_VULKAN)
@@ -4125,7 +4292,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ggml_backend_add_from_device(ctx, backend);
         } else {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_vk_init(device);
@@ -4134,7 +4301,7 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ggml_backend_add_from_device(ctx, backend);
             }
         }
 #elif defined(GGML_USE_SYCL)
@@ -4156,7 +4323,7 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ggml_backend_add_from_device(ctx, backend);
             }
         }
 #elif defined(GGML_USE_KOMPUTE)
@@ -4167,7 +4334,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ggml_backend_add_from_device(ctx, backend);
         }
 #elif defined(GGML_USE_CANN)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
@@ -4179,7 +4346,7 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ggml_backend_add_from_device(ctx, backend);
         } else {
             // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
@@ -4190,7 +4357,7 @@ struct llama_context * llama_new_context_with_model(
                     llama_free(ctx);
                     return nullptr;
                 }
-                ctx->backends.push_back(backend);
+                ggml_backend_add_from_device(ctx, backend);
             }
         }
 #endif
@@ -4200,7 +4367,7 @@ struct llama_context * llama_new_context_with_model(
         if (ctx->backend_blas == nullptr) {
             LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
         } else {
-            ctx->backends.push_back(ctx->backend_blas);
+            ggml_backend_add_from_device(ctx, ctx->backend_blas);
         }
 #endif
 
@@ -4213,10 +4380,23 @@ struct llama_context * llama_new_context_with_model(
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
+            ggml_backend_add_from_device(ctx, backend);
         }
     }
 #endif
+    if (ctx->cparams.devices.size()) {
+        // reorder the backend from devices params
+        std::vector<ggml_backend_t> backends = {};
+        std::vector<std::string> device_list = {};
+        for (auto device : ctx->cparams.devices) {
+            ggml_backend_t backend = ctx->ggml_backend_by_name(device.c_str());
+            if (backend) {
+                backends.push_back(backend);
+            }
+        }
+        ctx->backends = std::move(backends);
+    }
+
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
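
For reference, a standalone sketch of how a --device argument is expected to be interpreted by the helpers added in this patch: the list is split at commas and RPC[ip:port] entries are reduced to ip:port, which is what ends up being matched against backend names and stored in model->rpc_servers. The split helper and regex below are local illustrative copies (not the patch's own llama_string_split / extract_ip_from_rpc_device), and the example string is the one used in the --device help text.

// standalone illustration of the --device parsing, assuming the RPC[ip:port] notation above
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// split a comma-separated device list into its entries
static std::vector<std::string> split_devices(const std::string & str, const std::string & delim) {
    std::vector<std::string> parts;
    size_t start = 0;
    size_t end   = str.find(delim);
    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delim.length();
        end   = str.find(delim, start);
    }
    parts.push_back(str.substr(start));
    return parts;
}

int main() {
    // same example string as in the --device help text
    const std::string arg = "CUDA0,CUDA1,RPC[192.168.0.1:8080]";

    std::vector<std::string> devices = split_devices(arg, ",");

    // keep plain device names, but reduce RPC[ip:port] entries to ip:port
    const std::regex rpc_pattern("RPC\\[(.*?)\\]");
    for (std::string & dev : devices) {
        std::smatch m;
        if (std::regex_search(dev, m, rpc_pattern)) {
            dev = m[1].str();
        }
    }

    for (const std::string & dev : devices) {
        std::printf("device: %s\n", dev.c_str()); // CUDA0, CUDA1, 192.168.0.1:8080
    }
    return 0;
}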