Change default RPC order and fix wrong RPC order in --device arg

firecoperana
2025-11-24 23:00:25 -06:00
parent a3b8efd687
commit 2339d41d2e


@@ -4053,66 +4053,53 @@ struct llama_model * llama_load_model_from_file(
     // if no device is specified, all device are included
     // if device is specified, only those in the devices are included in the model->devices
-    std::vector<std::string> params_devices = {};
+    std::vector<std::string> params_devices;
     if (params.devices && !striequals(params.devices, "")) {
         params_devices = llama_string_split(params.devices, ",");
         params_devices = extract_ip_from_rpc_device(params_devices);
     }
+    std::map<std::string, int32_t> buffer_names;
+    std::vector<std::string> gpu_names;
+    bool has_rpc = params.rpc_servers != nullptr && params.rpc_servers[0] != '\0';
     int32_t idx = 0;
-    if (params_devices.size()) {
-        // just the number of GPU on host machine since we have not added any RPC backend
-        int dev_count = (int)llama_get_device_count(*model);
-        // list all buffer type names
-        std::vector<std::string> buffer_names = {};
-        for (int i = 0; i < dev_count; i++) {
-            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
-            const char* name = ggml_backend_buft_name(buft);
-            buffer_names.push_back(std::string(name));
-        }
-        // add if device matches backend buffer type
-        for (auto device : params_devices) {
-            if (item_in_list(buffer_names, device.c_str())) {
-                idx = find_device_idx(device);
-                model->devices.push_back(idx);
-            } else {
-                LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
-            }
-        }
-    } else {
-        // add all backend buffer to device
-        // just the number of GPU on host machine since we have not added any RPC backend
-        int dev_count = (int)llama_get_device_count(*model);
-        for (idx = 0; idx < dev_count; idx++) {
-            model->devices.push_back(idx);
+    int dev_count = (int)llama_get_device_count(*model);
+    // list all buffer type names
+    for (idx = 0; idx < dev_count; idx++) {
+        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, idx);
+        const char* name = ggml_backend_buft_name(buft);
+        buffer_names.insert({ std::string(name), idx });
+        gpu_names.push_back(std::string(name));
+    }
+    if (has_rpc) {
+        model->rpc_servers = llama_string_split(params.rpc_servers, ",");
+        for (auto rpc : model->rpc_servers) {
+            buffer_names.insert({ rpc, idx });
+            idx++;
         }
     }
-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
-        if (params_devices.size()) {
-            // just the number of GPU on host machine since we have not added any RPC backend
-            idx = (int)llama_get_device_count(*model);
-            // split the servers set them into model->rpc_servers
-            std::vector<std::string> rpc_servers = llama_string_split(params.rpc_servers, ",");
-            for (auto device : params_devices) {
-                if (item_in_list(rpc_servers, device.c_str())) {
-                    model->rpc_servers.push_back(device);
-                    model->devices.push_back(idx);
-                    idx++;
-                } else {
-                    LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
-                }
-            }
-        } else {
-            // just number of GPU on host machine since we have not added any RPC backend
-            idx = (int)llama_get_device_count(*model);
-            model->rpc_servers = llama_string_split(params.rpc_servers, ",");
-            for (auto rpc : model->rpc_servers) {
-                model->devices.push_back(idx);
-                idx++;
-            }
-        }
-    }
+    std::vector<std::string> device_names;
+    if (params_devices.size()) {
+        device_names = params_devices;
+    }
+    else {
+        // add RPC servers at the front of the list to minimize the network transfers
+        if (has_rpc) {
+            device_names = model->rpc_servers;
+        }
+        device_names.insert(device_names.end(), gpu_names.begin(), gpu_names.end());
+    }
+    for (auto device : device_names) {
+        if (buffer_names.count(device)) {
+            model->devices.push_back(buffer_names[device]);
+        }
+        else {
+            LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
+        }
+    }
     // no gpu used, so set layers offload to be 0
     if (!model->devices.size()) {
         params.n_gpu_layers = 0;
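
Taken together, the new hunk builds a name-to-index map covering every local GPU buffer type and every RPC server, then resolves model->devices either from the explicit --device list (in the user's order) or, by default, with the RPC servers placed ahead of the local GPUs. The standalone sketch below illustrates that ordering rule outside of llama.cpp; resolve_device_order and the backend names CUDA0, CUDA1, and 192.168.1.2:50052 are hypothetical and exist only for this illustration.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Sketch of the device-resolution rule introduced by this commit (assumed
// behavior, not the library code): local GPUs get indices 0..n-1, RPC servers
// get the indices after them, and the final order comes either from an
// explicit --device list or from the default "RPC first, then GPUs" policy.
static std::vector<int> resolve_device_order(
        const std::vector<std::string> & gpu_names,     // local backend buffer names
        const std::vector<std::string> & rpc_servers,   // RPC endpoints
        const std::vector<std::string> & requested) {   // parsed --device list (may be empty)
    std::map<std::string, int> index_of;
    int idx = 0;
    for (const auto & name : gpu_names)   { index_of[name] = idx++; }
    for (const auto & name : rpc_servers) { index_of[name] = idx++; }

    std::vector<std::string> order;
    if (!requested.empty()) {
        // Explicit --device: keep exactly the order the user gave.
        order = requested;
    } else {
        // Default: RPC servers go to the front to minimize network transfers.
        order = rpc_servers;
        order.insert(order.end(), gpu_names.begin(), gpu_names.end());
    }

    std::vector<int> devices;
    for (const auto & name : order) {
        auto it = index_of.find(name);
        if (it != index_of.end()) {
            devices.push_back(it->second);
        } else {
            std::fprintf(stderr, "%s backend not available.\n", name.c_str());
        }
    }
    return devices;
}

int main() {
    const std::vector<std::string> gpus = { "CUDA0", "CUDA1" };
    const std::vector<std::string> rpc  = { "192.168.1.2:50052" };

    // Default order: RPC endpoint first -> prints "2 0 1".
    for (int d : resolve_device_order(gpus, rpc, {})) { std::printf("%d ", d); }
    std::printf("\n");

    // Explicit --device CUDA1,192.168.1.2:50052 -> prints "1 2".
    for (int d : resolve_device_order(gpus, rpc, { "CUDA1", "192.168.1.2:50052" })) { std::printf("%d ", d); }
    std::printf("\n");
    return 0;
}

With no --device argument the sketch prints "2 0 1" (RPC endpoint ahead of the local GPUs); with an explicit list it prints "1 2", mirroring the order-preserving map lookup that the patch uses to fix the --device ordering.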