mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-27 08:34:09 +00:00
Disable pipeline parallelism when a tensor override is present or allocation fails (#879)
* disable pipeline parallelism when a tensor override is present * disable pipeline parallelism if compute-buffer allocation failed --------- Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -2658,7 +2658,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
||||
auto mparams = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
|
||||
@@ -1237,6 +1237,10 @@ std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, in
|
||||
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
|
||||
}
|
||||
|
||||
// Record whether any tensor buffer-type overrides were supplied in the model
// params. Presumably the override array is terminated by an entry with a null
// pattern (TODO confirm against llama_model_params docs), so a non-null array
// whose first entry has a non-null pattern means at least one real override.
void llama_model::set_tensor_overrides(const llama_model_params& params) {
    const auto * overrides = params.tensor_buft_overrides;
    tensor_overrides = overrides != nullptr && overrides[0].pattern != nullptr;
}
|
||||
|
||||
std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
|
||||
@@ -305,10 +305,18 @@ struct llama_model {
|
||||
// keep track of loaded lora adapters
|
||||
std::set<llama_lora_adapter *> lora_adapters;
|
||||
|
||||
bool tensor_overrides;
|
||||
|
||||
~llama_model();
|
||||
|
||||
// Not actually needed, but left in place for now
|
||||
size_t max_nodes() const { return 65536; }
|
||||
|
||||
bool has_tensor_overrides() const {
|
||||
return tensor_overrides;
|
||||
};
|
||||
|
||||
void set_tensor_overrides(const llama_model_params& params);
|
||||
};
|
||||
|
||||
struct llama_lora_weight {
|
||||
|
||||
@@ -3969,7 +3969,7 @@ struct llama_model * llama_load_model_from_file(
|
||||
return true;
|
||||
};
|
||||
}
|
||||
|
||||
model->set_tensor_overrides(params);
|
||||
// model->devices hold device indices that are used to offload
|
||||
// use model->devices to determine offload device
|
||||
// if no device is specified, all device are included
|
||||
@@ -4479,7 +4479,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
llama_get_device_count(*model) > 1 &&
|
||||
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
||||
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
params.offload_kqv;
|
||||
params.offload_kqv && !model->has_tensor_overrides();
|
||||
#ifndef GGML_USE_CUDA
|
||||
// pipeline parallelism requires support for async compute and events
|
||||
// currently this is only implemented in the CUDA backend
|
||||
@@ -4498,10 +4498,19 @@ struct llama_context * llama_new_context_with_model(
|
||||
ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
|
||||
|
||||
// initialize scheduler with the worst-case graph
|
||||
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
|
||||
if (!gf_success)
|
||||
{
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
|
||||
gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
|
||||
}
|
||||
if (!gf_success) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ctx->backends.size(); i++) {
|
||||
|
||||
Reference in New Issue
Block a user