Support --device and --device-draft parameter (#866)
* add --device and --device-draft parameter
* don't print debug message in release mode
* fix
* bug fix to throw exception when no device specified
* add const
---------
Co-authored-by: firecoperana <firecoperana>
@@ -200,6 +200,9 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}

//
// Arg utils
//
common_webui common_webui_from_name(const std::string& format) {
if (format == "none") {
return COMMON_WEBUI_NONE;
@@ -224,6 +227,14 @@ static std::string read_file(const std::string& fname) {
file.close();
return content;
}

static std::string parse_device_list(const std::string& value) {
if (value==" " || value.find("-")!= std::string::npos) {
throw std::invalid_argument("no devices specified");
}
return value;
}
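parse_device_list only validates the raw argument at this stage: a lone space, or a value containing '-' (typically the next flag swallowed by CHECK_ARG when no value was supplied), is rejected with "no devices specified", and the string itself is split later in src/llama.cpp. A minimal standalone sketch of that behavior (the helper is copied here only so the demo compiles on its own; it is not part of the commit):

#include <cstdio>
#include <stdexcept>
#include <string>

// standalone copy of the validation logic shown above
static std::string parse_device_list(const std::string & value) {
    if (value == " " || value.find("-") != std::string::npos) {
        throw std::invalid_argument("no devices specified");
    }
    return value; // splitting into individual device names happens later in llama.cpp
}

int main() {
    std::printf("%s\n", parse_device_list("CUDA0,CUDA1").c_str()); // accepted unchanged
    try {
        parse_device_list("--other-flag"); // rejected: looks like a flag, not a device list
    } catch (const std::invalid_argument & e) {
        std::printf("error: %s\n", e.what());
    }
    return 0;
}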
//
// CLI argument parsing
//
@@ -1066,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
@@ -1213,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "-dev" || arg == "--device") {
CHECK_ARG
std::string value(argv[i]);
params.devices = parse_device_list(value);
return true;
}
if (arg == "-devd" || arg == "--device-draft") {
CHECK_ARG
std::string value(argv[i]);
params.devices_draft = parse_device_list(value);
return true;
}
if (arg == "-v" || arg == "--verbose") {
params.verbosity = 1;
return true;
@@ -2002,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
" - row: split rows across GPUs" });
options.push_back({ "*", "-ts, --tensor-split SPLIT",
"fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
options.push_back({ "*", "-dev, --device dev1,dev2",
"comma-separated list of devices to use for offloading (none = don't offload)\n"
"Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
options.push_back({ "*", "-devd, --device-draft dev1,dev2",
"comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
"Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
}
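For reference, the new flags take the same comma-separated syntax shown in the help text, e.g. -dev CUDA0,RPC[192.168.0.1:8080] for the main model and -devd CUDA1 for the draft model loaded via --model-draft; judging from the matching logic added in src/llama.cpp further down, an RPC[ip:port] entry is only honored when the same ip:port is also passed via --rpc.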
@@ -2575,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}

if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
@@ -2692,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor

struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
auto mparams = llama_model_default_params();
mparams.devices = params.devices.c_str();

if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
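The device strings therefore travel from the CLI into gpt_params and on to llama_model_params as plain C strings; splitting and validation against the available backends happen later inside llama.cpp. A small usage sketch with hypothetical values (an illustration only, assuming the program is built against this branch's common library):

#include "common.h"
#include <cstdio>

int main() {
    gpt_params params;
    params.devices       = "CUDA0,CUDA1";            // normally filled from -dev / --device
    params.devices_draft = "RPC[192.168.0.1:8080]";  // normally filled from -devd / --device-draft

    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    // mparams.devices points at params.devices.c_str(); the list is split and checked
    // against the available backends when llama_load_model_from_file() is called.
    std::printf("offload devices: %s\n", mparams.devices);
    return 0;
}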
@@ -126,6 +126,9 @@ struct model_paths {
};

struct gpt_params {
std::string devices;
std::string devices_draft;

uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = cpu_get_num_math();
@@ -193,6 +196,7 @@ struct gpt_params {
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers

std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -440,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

//
// Model utils
//

@@ -91,10 +91,10 @@ bool llama_speculative_are_compatible(
const struct llama_vocab * vocab_dft = llama_get_model_vocab(model_dft);

const bool vocab_type_tgt = llama_vocab_type(model_tgt);
LLAMA_LOG_INFO("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
LLAMA_LOG_DEBUG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);

const bool vocab_type_dft = llama_vocab_type(model_dft);
LLAMA_LOG_INFO("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
LLAMA_LOG_DEBUG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

if (vocab_type_tgt != vocab_type_dft) {
LLAMA_LOG_INFO("%s: draft model vocab type must match target model to use speculation but ", __func__);
@@ -203,13 +203,13 @@ std::vector<llama_token> llama_speculative_gen_draft(
std::string text;
text = llama_detokenize(ctx_tgt, prompt_tgt_main_model, true);
text = replace_to_dft(spec, text);
LLAMA_LOG_INFO("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
LLAMA_LOG_DEBUG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
prompt_tgt_draft_model = llama_tokenize(ctx_dft, text, false, true);

// convert id_last to draft vocab
std::vector<llama_token> id_last_vec(1, id_last);
text = llama_detokenize(ctx_tgt, id_last_vec);
LLAMA_LOG_INFO("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
LLAMA_LOG_DEBUG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
id_last = llama_tokenize(ctx_dft, text, false, true)[0];
}
// prompt_tgt's tokens will always be compatible with ctx_dft
@@ -233,8 +233,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
reuse_n = cur;
}
}

LLAMA_LOG_INFO("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LLAMA_LOG_DEBUG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());

std::vector<llama_token> result;
result.reserve(params.n_draft);
@@ -344,7 +343,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
if (!spec->vocab_dft_compatible) {
std::string detokenized = llama_detokenize(ctx_dft, result, true);
detokenized = replace_to_tgt(spec, detokenized);
LLAMA_LOG_INFO("draft->main detokenized string: '%s'\n", detokenized.c_str());
LLAMA_LOG_DEBUG("draft->main detokenized string: '%s'\n", detokenized.c_str());
result = llama_tokenize(ctx_tgt, detokenized, false, true);
if (result.size() > (size_t)params.n_draft) {
result.resize(params.n_draft);

@@ -1249,6 +1249,7 @@ struct server_context {
LOG_INFO("loading draft model", {{"model", params.model_draft}});

gpt_params params_dft;
params_dft.devices = params.devices_draft;
params_dft.model = params.model_draft;
params_dft.n_ctx = params.n_ctx_draft == 0 ? params.n_ctx / params.n_parallel : params.n_ctx_draft;
params_dft.n_gpu_layers = params.n_gpu_layers_draft;
@@ -1273,7 +1274,7 @@ struct server_context {

cparams_dft = llama_context_params_from_gpt_params(params_dft);
cparams_dft.n_batch = n_ctx_dft;

model_draft = llama_init_dft.model;
ctx_draft = llama_init_dft.context;
}

@@ -71,6 +71,7 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt.context;

// load the draft model
params.devices = params.devices_draft;
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {

@@ -9,6 +9,7 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <set>

@@ -528,6 +529,16 @@ GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn ini
ggml_backend_registry_count++;
}

// Backend (reg) enumeration
static bool striequals(const char* a, const char* b) {
for (; *a && *b; a++, b++) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
}
return *a == *b;
}

size_t ggml_backend_reg_get_count(void) {
ggml_backend_registry_init();

@@ -539,7 +550,7 @@ size_t ggml_backend_reg_find_by_name(const char * name) {

for (size_t i = 0; i < ggml_backend_registry_count; i++) {
// TODO: case insensitive in a portable way
if (strcmp(ggml_backend_registry[i].name, name) == 0) {
if (striequals(ggml_backend_registry[i].name, name)) {
return i;
}
}
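With this change the backend registry lookup is case-insensitive, so a name such as "cuda0" can still resolve to the registered "CUDA0" backend. A standalone illustration of the comparison (the helper is copied here only so the demo compiles on its own; not part of the commit):

#include <cctype>
#include <cstdio>

// same comparison as the striequals added above
static bool striequals(const char * a, const char * b) {
    for (; *a && *b; a++, b++) {
        if (std::tolower(*a) != std::tolower(*b)) {
            return false;
        }
    }
    return *a == *b; // both strings must also end at the same position
}

int main() {
    std::printf("%d\n", striequals("CUDA0", "cuda0")); // 1: names match regardless of case
    std::printf("%d\n", striequals("CUDA0", "CUDA1")); // 0: different device index
    return 0;
}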
@@ -342,6 +342,9 @@ extern "C" {
};

struct llama_model_params {
// comma separated list of devices to use for offloading
const char* devices;

int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t mla; // MLA implementation to use (only applicable to DeepSeek models at this point)
enum llama_split_mode split_mode; // how to split the model across multiple GPUs

@@ -202,4 +202,7 @@ struct llama_context {
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]

ggml_backend_t ggml_backend_by_name(const char * name);

};

@@ -12,6 +12,9 @@ struct llama_cparams {
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing

std::vector<std::string> devices;
std::vector<std::string> devices_draft;

float rope_freq_base;
float rope_freq_scale;

@@ -38,6 +38,12 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *

#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG , __VA_ARGS__)
#ifdef NDEBUG
// Release mode - make LLAMA_LOG_DEBUG a no-op
#define LLAMA_LOG_DEBUG(...) ((void)0)
#else
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#endif
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
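This is the usual way to compile debug logging out of release builds (where NDEBUG is defined) while keeping the call sites valid expressions; the unconditional definition above it is the one being replaced. The same pattern in isolation, as a generic sketch unrelated to the project's logger:

#include <cstdio>

#ifdef NDEBUG
// release build: debug logging becomes a no-op expression
#define MY_LOG_DEBUG(...) ((void)0)
#else
#define MY_LOG_DEBUG(...) std::fprintf(stderr, __VA_ARGS__)
#endif

int main() {
    MY_LOG_DEBUG("only printed in debug builds: %d\n", 42);
    return 0;
}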
@@ -264,6 +264,7 @@ struct llama_model {
int n_gpu_layers;

std::vector<std::string> rpc_servers;
std::vector<int32_t> devices;

// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;

src/llama.cpp
@@ -154,6 +154,39 @@ static std::string trim(const std::string & str) {
return str.substr(start, end - start);
}

static std::vector<std::string> llama_string_split(const std::string& str, const std::string& delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);

while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}

parts.push_back(str.substr(start));

return parts;
}

// extract ip and port from RPC[ip:port] for rpc and keep other device names
static std::vector<std::string> extract_ip_from_rpc_device(std::vector<std::string> devices) {
std::vector<std::string> rpc_servers;
std::regex pattern("RPC\\[(.*?)\\]");
std::smatch matches;
for (auto device : devices) {
if (std::regex_search(device, matches, pattern)) {
rpc_servers.push_back(matches[1]);
} else {
rpc_servers.push_back(device);
}
}
return rpc_servers;
}
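Together these two helpers turn the raw -dev string into a list of plain device names, with RPC[ip:port] entries reduced to ip:port. A self-contained sketch of their combined effect (the splitter is re-implemented here so the demo compiles on its own; names and addresses are illustrative, not from the commit):

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// minimal re-implementation of the delimiter split shown above
static std::vector<std::string> split(const std::string & str, const std::string & delim) {
    std::vector<std::string> parts;
    size_t start = 0, end = str.find(delim);
    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delim.length();
        end   = str.find(delim, start);
    }
    parts.push_back(str.substr(start));
    return parts;
}

int main() {
    std::regex rpc("RPC\\[(.*?)\\]");
    std::smatch m;
    for (auto dev : split("CUDA0,RPC[192.168.0.1:8080]", ",")) {
        // RPC[ip:port] entries are reduced to "ip:port"; other names pass through unchanged
        std::printf("%s\n", std::regex_search(dev, m, rpc) ? m[1].str().c_str() : dev.c_str());
    }
    return 0; // prints: CUDA0, then 192.168.0.1:8080
}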
enum llm_chat_template {
LLM_CHAT_TEMPLATE_CHATML,
LLM_CHAT_TEMPLATE_LLAMA_2,
@@ -1501,6 +1534,46 @@ static void llm_prepare_mla(llama_model & model, int mla) {
ggml_free(ctx);
}

// Backend (reg) enumeration
static bool striequals(const char* a, const char* b) {
for (; *a && *b; a++, b++) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
}
return *a == *b;
}

ggml_backend_t llama_context::ggml_backend_by_name(const char* name) {
for (auto backend : backends) {
const char* backend_name = ggml_backend_name(backend);
if (striequals(backend_name, name)) {
return backend;
}
}
return nullptr;
}

static bool item_in_list(const std::vector<std::string>& devices, const char* name) {
for (auto& device : devices) {
if (striequals(device.c_str(), name)) {
return true;
}
}
return false;
}

static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t backend) {
const char* name = ggml_backend_name(backend);
if (ctx->cparams.devices.size()) {
if (item_in_list(ctx->cparams.devices, name)) {
ctx->backends.push_back(backend);
}
} else {
ctx->backends.push_back(backend);
}
}

// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
llama_model_loader & ml,
@@ -1538,13 +1611,14 @@ static bool llm_load_tensors(

if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
int device_count = llama_get_device_count(model);
// int device_count = llama_get_device_count(model);
int device_count = model.devices.size();
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
// default split, by free memory
for (int i = 0; i < device_count; ++i) {
splits[i] = llama_get_device_memory(model, i);
splits[i] = llama_get_device_memory(model, model.devices[i]);
}
} else {
std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -1564,35 +1638,35 @@ static bool llm_load_tensors(
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
model.buft_output = llama_default_buffer_type_offload(model, model.devices[layer_gpu]);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu], tensor_split);
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(model, main_gpu);
split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
}
// assign the repeating layers
for (int i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
llama_default_buffer_type_offload(model, model.devices[main_gpu])
};
}
// assign the output layer
if (n_gpu_layers > n_layer) {
model.buft_output = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
llama_default_buffer_type_offload(model, model.devices[main_gpu])
};
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
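Note that in llm_load_tensors the indices used to pick buffer types now go through model.devices: device_count is the size of the selected device list, and main_gpu as well as the computed layer_gpu are treated as positions within that list rather than raw backend indices, so layer offloading follows the -dev selection (the commented-out llama_get_device_count() line above shows the previous behaviour).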
@@ -3696,6 +3770,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
//
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.devices =*/ nullptr,
/*.n_gpu_layers =*/ 0,
/*.mla =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
@@ -3859,6 +3934,17 @@ int64_t llama_time_us(void) {
return ggml_time_us();
}

static int32_t find_device_idx(const std::string& str) {
std::regex pattern(R"((\d+)$)"); // Match digits at the end
std::smatch matches;
int number = -1;
if (std::regex_search(str, matches, pattern)) {
number = std::stoi(matches[1]);
}
return number;
}
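find_device_idx recovers the numeric suffix of a local device name such as CUDA1 or Vulkan0 and returns -1 when there is none; it is only applied to names that matched a local backend buffer type, not to RPC endpoints. A standalone copy for illustration (not part of the commit):

#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>

// same trailing-digits extraction as the helper added above
static int32_t find_device_idx(const std::string & str) {
    std::regex pattern(R"((\d+)$)"); // match digits at the end of the name
    std::smatch matches;
    int number = -1;
    if (std::regex_search(str, matches, pattern)) {
        number = std::stoi(matches[1]);
    }
    return number;
}

int main() {
    std::printf("%d\n", find_device_idx("CUDA1"));   // 1
    std::printf("%d\n", find_device_idx("Vulkan0")); // 0
    std::printf("%d\n", find_device_idx("CPU"));     // -1 (no trailing digits)
    return 0;
}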
struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params) {
@@ -3882,16 +3968,87 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
while ((pos = servers.find(",")) != std::string::npos) {
std::string server = servers.substr(0, pos);
model->rpc_servers.push_back(server);
servers.erase(0, pos + 1);

// model->devices hold device indices that are used to offload
// use model->devices to determine offload device
// if no device is specified, all device are included
// if device is specified, only those in the devices are included in the model->devices

std::vector<std::string> params_devices = {};
if (!striequals(params.devices, "")) {
params_devices = llama_string_split(params.devices, ",");
params_devices = extract_ip_from_rpc_device(params_devices);
}

int32_t idx = 0;
if (params_devices.size()) {
// just the number of GPU on host machine since we have not added any RPC backend
int dev_count = (int)llama_get_device_count(*model);
// list all buffer type names
std::vector<std::string> buffer_names = {};
for (int i = 0; i < dev_count; i++) {
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
const char* name = ggml_backend_buft_name(buft);
buffer_names.push_back(std::string(name));
}

// add if device matches backend buffer type
for (auto device : params_devices) {
if (item_in_list(buffer_names, device.c_str())) {
idx = find_device_idx(device);
model->devices.push_back(idx);
} else {
LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
}
}
} else {
// add all backend buffer to device
// just the number of GPU on host machine since we have not added any RPC backend
int dev_count = (int)llama_get_device_count(*model);
for (idx = 0; idx < dev_count; idx++) {
model->devices.push_back(idx);
}
}
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
if (params_devices.size()) {
// just the number of GPU on host machine since we have not added any RPC backend
idx = (int)llama_get_device_count(*model);
// split the servers set them into model->rpc_servers
std::vector <std::string> rpc_servers = llama_string_split(params.rpc_servers, ",");
for (auto device : params_devices) {
if (item_in_list(rpc_servers, device.c_str())) {
model->rpc_servers.push_back(device);
model->devices.push_back(idx);
idx++;
} else {
LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
}
}
} else {
// just number of GPU on host machine since we have not added any RPC backend
idx = (int)llama_get_device_count(*model);
model->rpc_servers = llama_string_split(params.rpc_servers, ",");
for (auto rpc : model->rpc_servers) {
model->devices.push_back(idx);
idx++;
}
}
}
// no gpu used, so set layers offload to be 0
if (!model->devices.size()) {
params.n_gpu_layers = 0;
LLAMA_LOG_INFO("CPU: using device CPU\n");
} else {
for (auto i : model->devices) {
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
const char* name = ggml_backend_buft_name(buft);
const char* description = name;
size_t description_size = llama_get_device_memory(*model, i);
LLAMA_LOG_INFO("%s: using device %s - %zu MiB free\n",
name, description,
description_size / 1024 / 1024);
}
model->rpc_servers.push_back(servers);
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
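Putting the pieces together: names from -dev that match a local backend buffer type keep the index encoded in their name, while entries that match an --rpc server are appended with indices starting at the local device count. A simplified, self-contained simulation of that mapping (hypothetical host with two CUDA GPUs; the real code additionally matches case-insensitively via striequals and uses find_device_idx, so this is an illustration only):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> buffer_names   = { "CUDA0", "CUDA1" };             // local devices
    std::vector<std::string> rpc_servers    = { "192.168.0.1:8080" };           // from --rpc
    std::vector<std::string> params_devices = { "CUDA1", "192.168.0.1:8080" };  // from -dev, RPC[] already stripped

    std::vector<int32_t> devices; // plays the role of model->devices
    for (const auto & d : params_devices) {
        for (size_t i = 0; i < buffer_names.size(); ++i) {
            if (d == buffer_names[i]) devices.push_back((int32_t) i); // local GPU keeps its own index
        }
    }
    int32_t idx = (int32_t) buffer_names.size(); // RPC devices get indices >= local device count
    for (const auto & d : params_devices) {
        for (const auto & s : rpc_servers) {
            if (d == s) devices.push_back(idx++);
        }
    }
    for (int32_t d : devices) std::printf("%d ", d); // prints: 1 2
    std::printf("\n");
    return 0;
}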
@@ -3948,9 +4105,18 @@ struct llama_context * llama_new_context_with_model(

llama_context * ctx = new llama_context(*model);

// add devices to ctx->cparams from model
for (int i : model->devices) {
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
const char* name = ggml_backend_buft_name(buft);
std::string device(name);
ctx->cparams.devices.push_back(device);
}

const auto & hparams = model->hparams;
auto & cparams = ctx->cparams;

cparams.n_seq_max = std::max(1u, params.n_seq_max);
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
@@ -4077,7 +4243,6 @@ struct llama_context * llama_new_context_with_model(

GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

if (!hparams.vocab_only) {
// initialize backends
#if defined(GGML_USE_METAL)
@@ -4088,7 +4253,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(ctx->backend_metal);
ggml_backend_add_from_device(ctx, ctx->backend_metal);
}
#elif defined(GGML_USE_CUDA)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
@@ -4099,7 +4264,8 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);

} else {
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
@@ -4109,7 +4275,8 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);

}
}
#elif defined(GGML_USE_VULKAN)
@@ -4125,7 +4292,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
} else {
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_vk_init(device);
@@ -4134,7 +4301,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
}
}
#elif defined(GGML_USE_SYCL)
@@ -4156,7 +4323,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
}
}
#elif defined(GGML_USE_KOMPUTE)
@@ -4167,7 +4334,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
}
#elif defined(GGML_USE_CANN)
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
@@ -4179,7 +4346,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
} else {
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
// TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
@@ -4190,7 +4357,7 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
}
}
#endif
@@ -4200,7 +4367,7 @@ struct llama_context * llama_new_context_with_model(
if (ctx->backend_blas == nullptr) {
LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
} else {
ctx->backends.push_back(ctx->backend_blas);
ggml_backend_add_from_device(ctx, ctx->backend_blas);
}
#endif

@@ -4213,10 +4380,23 @@ struct llama_context * llama_new_context_with_model(
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
ggml_backend_add_from_device(ctx, backend);
}
}
#endif
if (ctx->cparams.devices.size()) {
// reorder the backend from devices params
std::vector<ggml_backend_t> backends = {};
std::vector<const char*> device_list = {};
for (auto device : ctx->cparams.devices) {
ggml_backend_t backend = ctx->ggml_backend_by_name(device.c_str());
if (backend) {
backends.push_back(backend);
}
}
ctx->backends = std::move(backends);
}
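Taken together, the context-creation changes above mean that each compile-time GPU backend is now registered through ggml_backend_add_from_device(), which keeps a backend only when its name appears in cparams.devices (or keeps everything when no -dev list was given), and the surviving backends are then reordered to match the order of the user's device list before the CPU backend is initialized below.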
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);