Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Synced 2026-03-07 12:30:08 +00:00

Commit: Be able to quantize mmproj files (#1367)
This commit is contained in:
@@ -39,7 +39,7 @@ static inline const char * llm_expert_gating_func_name(llm_expert_gating_func_ty

 void llm_load_hparams(
         llama_model_loader & ml,
-        llama_model & model) {
+        llama_model & model, bool ignore_vocab) {
     auto & hparams = model.hparams;
     const gguf_context * ctx = ml.meta;
@@ -54,11 +54,13 @@ void llm_load_hparams(
         model.gguf_kv.emplace(name, value);
     }

+    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

     // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, !ignore_vocab);

     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -67,7 +69,6 @@ void llm_load_hparams(

     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
-    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -176,7 +176,7 @@ struct llama_model_loader {

 void llm_load_arch(llama_model_loader & ml, llama_model & model);

-void llm_load_hparams(llama_model_loader & ml, llama_model & model);
+void llm_load_hparams(llama_model_loader & ml, llama_model & model, bool ignore_vocab = false);

 struct create_tensors_helper_interface {
     virtual ~create_tensors_helper_interface() = default;
@@ -1026,8 +1026,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ml.init_mappings(false); // no prefetching

     llama_model model;
-    llm_load_arch(ml, model);
-    llm_load_hparams(ml, model);
+    try {
+        llm_load_arch(ml, model);
+    } catch(const std::exception & e) {
+        LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX %s\n", e.what());
+    }
+    try {
+        llm_load_hparams(ml, model, true);
+    } catch(const std::exception & e) {
+        LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX %s\n", e.what());
+    }

     struct quantize_state_internal qs(model, params);
@@ -1159,7 +1167,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models
     // - model.arch == LLM_ARCH_DECI for Deci-Nemotron models
     //
-    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || model.arch == LLM_ARCH_DECI) && "n_attention_wv is unexpected");
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer ||
+                model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_UNKNOWN) && "n_attention_wv is unexpected");

     size_t total_size_org = 0;
     size_t total_size_new = 0;
Reference in New Issue
Block a user