diff --git a/src/llama.cpp b/src/llama.cpp
index 5a374b6d..1a7faa71 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4053,66 +4053,53 @@ struct llama_model * llama_load_model_from_file(
     // if no device is specified, all device are included
     // if device is specified, only those in the devices are included in the model->devices
-    std::vector<std::string> params_devices = {};
+    std::vector<std::string> params_devices;
     if (params.devices && !striequals(params.devices, "")) {
         params_devices = llama_string_split(params.devices, ",");
        params_devices = extract_ip_from_rpc_device(params_devices);
     }
+    std::map<std::string, int> buffer_names;
+    std::vector<std::string> gpu_names;
+    bool has_rpc = params.rpc_servers != nullptr && params.rpc_servers[0] != '\0';
     int32_t idx = 0;
-    if (params_devices.size()) {
-        // just the number of GPU on host machine since we have not added any RPC backend
-        int dev_count = (int)llama_get_device_count(*model);
-        // list all buffer type names
-        std::vector<std::string> buffer_names = {};
-        for (int i = 0; i < dev_count; i++) {
-            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
-            const char* name = ggml_backend_buft_name(buft);
-            buffer_names.push_back(std::string(name));
-        }
-
-        // add if device matches backend buffer type
-        for (auto device : params_devices) {
-            if (item_in_list(buffer_names, device.c_str())) {
-                idx = find_device_idx(device);
-                model->devices.push_back(idx);
-            } else {
-                LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
-            }
-        }
-    } else {
-        // add all backend buffer to device
-        // just the number of GPU on host machine since we have not added any RPC backend
-        int dev_count = (int)llama_get_device_count(*model);
-        for (idx = 0; idx < dev_count; idx++) {
-            model->devices.push_back(idx);
+    int dev_count = (int)llama_get_device_count(*model);
+    // list all buffer type names
+    for (idx = 0; idx < dev_count; idx++) {
+        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, idx);
+        const char* name = ggml_backend_buft_name(buft);
+        buffer_names.insert({ std::string(name), idx });
+        gpu_names.push_back(std::string(name));
+    }
+    if (has_rpc) {
+        model->rpc_servers = llama_string_split(params.rpc_servers, ",");
+        for (auto rpc : model->rpc_servers) {
+            buffer_names.insert({ rpc, idx });
+            idx++;
         }
     }
-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
-        if (params_devices.size()) {
-            // just the number of GPU on host machine since we have not added any RPC backend
-            idx = (int)llama_get_device_count(*model);
-            // split the servers set them into model->rpc_servers
-            std::vector<std::string> rpc_servers = llama_string_split(params.rpc_servers, ",");
-            for (auto device : params_devices) {
-                if (item_in_list(rpc_servers, device.c_str())) {
-                    model->rpc_servers.push_back(device);
-                    model->devices.push_back(idx);
-                    idx++;
-                } else {
-                    LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
-                }
-            }
-        } else {
-            // just number of GPU on host machine since we have not added any RPC backend
-            idx = (int)llama_get_device_count(*model);
-            model->rpc_servers = llama_string_split(params.rpc_servers, ",");
-            for (auto rpc : model->rpc_servers) {
-                model->devices.push_back(idx);
-                idx++;
-            }
-        }
-    }
+    std::vector<std::string> device_names;
+    if (params_devices.size()) {
+        device_names = params_devices;
+    }
+    else {
+        // add RPC servers at the front of the list to minimize the network transfers
+        if (has_rpc) {
+            device_names = model->rpc_servers;
+        }
+        device_names.insert(device_names.end(), gpu_names.begin(), gpu_names.end());
+    }
+
+    for (auto device : device_names) {
+        if (buffer_names.count(device)) {
+            model->devices.push_back(buffer_names[device]);
+        }
+        else {
+            LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
+        }
+    }
+
+    // no gpu used, so set layers offload to be 0
     if (!model->devices.size()) {
         params.n_gpu_layers = 0;
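
For reference, below is a minimal, self-contained sketch of the selection logic the new hunk implements: build a name-to-index map over local GPU buffer types plus RPC endpoints, then resolve either the user-supplied device list or a default order that puts RPC servers first. The helper name `resolve_devices`, the backend names (`CUDA0`, `CUDA1`), and the RPC address are illustrative stand-ins, not llama.cpp APIs.

```cpp
// Sketch only (not llama.cpp code): resolve a requested device list against a
// name -> index table, preferring RPC entries when no explicit list is given.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static std::vector<int> resolve_devices(const std::vector<std::string> & requested,
                                        const std::vector<std::string> & gpu_names,
                                        const std::vector<std::string> & rpc_servers) {
    std::map<std::string, int> buffer_names;
    int idx = 0;
    for (const auto & name : gpu_names) {
        buffer_names.insert({ name, idx++ });   // local GPUs get indices 0..n-1
    }
    for (const auto & rpc : rpc_servers) {
        buffer_names.insert({ rpc, idx++ });    // RPC backends are appended after local GPUs
    }

    // honor the caller's order if devices were named; otherwise list RPC servers
    // first (to minimize network transfers) followed by all local GPUs
    std::vector<std::string> device_names = requested;
    if (device_names.empty()) {
        device_names = rpc_servers;
        device_names.insert(device_names.end(), gpu_names.begin(), gpu_names.end());
    }

    std::vector<int> devices;
    for (const auto & device : device_names) {
        auto it = buffer_names.find(device);
        if (it != buffer_names.end()) {
            devices.push_back(it->second);
        } else {
            fprintf(stderr, "%s backend not available.\n", device.c_str());
        }
    }
    return devices;
}

int main() {
    // e.g. two local GPUs and one RPC server, with no explicit device list:
    // the RPC index (2) comes first, then the local GPUs (0, 1)
    auto devices = resolve_devices({}, { "CUDA0", "CUDA1" }, { "192.168.1.10:50052" });
    for (int d : devices) {
        printf("selected device index: %d\n", d);
    }
}
```

As in the patch, an empty result leaves `model->devices` empty, which is why the code that follows forces `n_gpu_layers` to 0.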