From 20df7b89c80e424e998b4e98900139facbc027c3 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 20 Mar 2025 09:11:33 +0200 Subject: [PATCH] Repack a model with the quantize tool --- examples/quantize/quantize.cpp | 3 ++ ggml/src/ggml.c | 4 ++ ggml/src/iqk/iqk_quantize.cpp | 62 ++++++++++++++++------ ggml/src/iqk/iqk_quantize.h | 3 ++ include/llama.h | 1 + src/llama.cpp | 95 ++++++++++++++++++++++++++++++++-- 6 files changed, 149 insertions(+), 19 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 89de794b..84ea38d4 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -145,6 +145,7 @@ static void usage(const char * executable) { printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n"); printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n"); + printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n"); printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n"); printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n"); printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n"); @@ -331,6 +332,8 @@ int main(int argc, char ** argv) { params.quantize_output_tensor = false; } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) { params.ignore_imatrix_rules = true; + } else if (strcmp(argv[arg_idx], "--repack") == 0) { + params.only_repack = true; } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { if (arg_idx < argc-1) { params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index faf1902d..38ba156f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -10626,6 +10626,10 @@ static void ggml_compute_forward_dup_q( return; } + if (dst->type != GGML_TYPE_F32) { + printf("%s: %s -> %s is of type %s\n", __func__, dst->src[0]->name, dst->name, ggml_type_name(dst->type)); + GGML_ABORT("fatal error"); + } GGML_ASSERT(dst->type == GGML_TYPE_F32); struct ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(src0->ne[0] == dst->ne[0] && src0->nb[0] == ggml_type_size(src0->type)); diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index fb6a5db4..61e70b40 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -6766,9 +6766,7 @@ struct Modify { modify_func_t mod_func; int nrows; }; -} - -bool iqk_modify_tensor(struct ggml_tensor * tensor) { +const Modify * get_modify_info(ggml_type type) { static const std::unordered_map k_mod_map = { #ifdef __ARM_NEON { GGML_TYPE_Q4_0_R8, {modify_q4_0_r8, 8} }, @@ -6779,10 +6777,28 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_Q8_KV_R8, {modify_q8_KV_r8, 8} }, #endif }; - auto it = k_mod_map.find(tensor->type); - if (it == k_mod_map.end()) return false; + auto it = k_mod_map.find(type); + return it != k_mod_map.end() ? &it->second : nullptr; +} +bool is_forbidden_tensor(const std::string& name) { + if (name == "token_embd.weight") return true; + if (auto pos = name.find("attn_kv_b.weight"); pos != std::string::npos) return true; + return false; +} +} - auto& m = it->second; +bool iqk_should_modify_tensor(const struct ggml_tensor * tensor) { + if (is_forbidden_tensor(tensor->name)) return false; + auto mptr = get_modify_info(tensor->type); + return mptr ? true : false; +} + +bool iqk_modify_tensor(struct ggml_tensor * tensor) { + auto mptr = get_modify_info(tensor->type); + if (!mptr) return false; + if (is_forbidden_tensor(tensor->name)) return false; + + auto& m = *mptr; int nrows = ggml_nrows(tensor); int nchunks = nrows/m.nrows; int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2)); @@ -6805,12 +6821,8 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) { return true; } -void iqk_repack_tensor(struct ggml_tensor * tensor) { - constexpr int kChunk = 8; - if (!tensor) return; - if (!ggml_is_contiguous(tensor)) return; - if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return; - if (tensor->ne[1] % 4) return; +namespace { +const Repack * get_repack_info(ggml_type type) { static const std::unordered_map k_map = { { GGML_TYPE_IQ2_K, { GGML_TYPE_IQ2_K_R4, 4, (Repack::repack_func)repack_iq2_k} }, { GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} }, @@ -6841,12 +6853,30 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16} }, #endif }; + auto it = k_map.find(type); + return it != k_map.end() ? &it->second : nullptr; +} +} - auto it = k_map.find(tensor->type); - if (it == k_map.end()) return; - if (tensor->ne[1] % it->second.num_rows) return; +int iqk_repacked_type(const struct ggml_tensor * tensor) { + if (!ggml_is_contiguous(tensor)) return (int)tensor->type; + if (is_forbidden_tensor(tensor->name)) return (int)tensor->type; + auto rptr = get_repack_info(tensor->type); + return rptr && tensor->ne[1] % rptr->num_rows == 0 ? (int)rptr->new_type : (int)tensor->type; +} - auto& r = it->second; +void iqk_repack_tensor(struct ggml_tensor * tensor) { + constexpr int kChunk = 8; + if (!tensor) return; + if (!ggml_is_contiguous(tensor)) return; + if (is_forbidden_tensor(tensor->name)) return; + if (tensor->ne[1] % 4) return; + + auto rptr = get_repack_info(tensor->type); + if (!rptr) return; + if (tensor->ne[1] % rptr->num_rows) return; + + auto& r = *rptr; auto nrows = ggml_nrows(tensor); diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index d447705b..dd148f2e 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -245,6 +245,9 @@ void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT d void iqk_repack_tensor(struct ggml_tensor * tensor); bool iqk_modify_tensor(struct ggml_tensor * tensor); +int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h +bool iqk_should_modify_tensor(const struct ggml_tensor * tensor); + // So we can re-pack Microsoft's BitNet I2_S quants void dequantize_row_ms_i2s(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); diff --git a/include/llama.h b/include/llama.h index 5e86cb68..66e9af02 100644 --- a/include/llama.h +++ b/include/llama.h @@ -416,6 +416,7 @@ extern "C" { bool pure; // quantize all tensors to the default type bool keep_split; // quantize to the same number of shards bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored + bool only_repack; // Only repack tensors void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * custom_quants; // pointer to vector containing custom quantization rules diff --git a/src/llama.cpp b/src/llama.cpp index 03139e41..5d8be461 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17140,11 +17140,48 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa return new_size; } +static llama_ftype repacked_ftype(llama_ftype ftype) { + static std::unordered_map k_map = { + { LLAMA_FTYPE_MOSTLY_Q4_0, LLAMA_FTYPE_MOSTLY_Q4_0_R8 }, + { LLAMA_FTYPE_MOSTLY_Q8_0, LLAMA_FTYPE_MOSTLY_Q8_0_R8 }, + { LLAMA_FTYPE_MOSTLY_Q5_0, LLAMA_FTYPE_MOSTLY_Q5_0_R4 }, + { LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q3_K_S, LLAMA_FTYPE_MOSTLY_Q3_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q3_K_L, LLAMA_FTYPE_MOSTLY_Q3_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q4_K_S, LLAMA_FTYPE_MOSTLY_Q4_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q4_K_M, LLAMA_FTYPE_MOSTLY_Q4_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q5_K_S, LLAMA_FTYPE_MOSTLY_Q5_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q5_K_M, LLAMA_FTYPE_MOSTLY_Q5_K_R4 }, + { LLAMA_FTYPE_MOSTLY_Q6_K, LLAMA_FTYPE_MOSTLY_Q6_K_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ2_XXS, LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ3_XXS, LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ1_S, LLAMA_FTYPE_MOSTLY_IQ1_S_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ4_NL, LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ3_S, LLAMA_FTYPE_MOSTLY_IQ3_S_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ2_M, LLAMA_FTYPE_MOSTLY_IQ2_M_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ4_XS, LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 }, + { LLAMA_FTYPE_MOSTLY_IQ1_M, LLAMA_FTYPE_MOSTLY_IQ1_M_R4 }, + { LLAMA_FTYPE_MOSTLY_Q6_0, LLAMA_FTYPE_MOSTLY_Q6_0_R4 }, + { LLAMA_FTYPE_MOSTLY_BF16, LLAMA_FTYPE_MOSTLY_BF16_R16 }, + { LLAMA_FTYPE_MOSTLY_IQ2_BN, LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ2_K, LLAMA_FTYPE_MOSTLY_IQ2_K_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ3_K, LLAMA_FTYPE_MOSTLY_IQ3_K_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ4_K, LLAMA_FTYPE_MOSTLY_IQ4_K_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ5_K, LLAMA_FTYPE_MOSTLY_IQ5_K_R4 }, + { LLAMA_FTYPE_MOSTLY_IQ4_KS, LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 }, + { LLAMA_FTYPE_MOSTLY_Q8_KV, LLAMA_FTYPE_MOSTLY_Q8_KV_R8 }, + }; + if (auto it = k_map.find(ftype); it != k_map.end()) return it->second; + return ftype; +} + static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; - switch (params->ftype) { + switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; @@ -17256,7 +17293,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ftype = model.ftype; } const std::unordered_map> * imatrix_data = nullptr; - if (params->imatrix) { + if (!params->only_repack && params->imatrix) { imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); @@ -17303,9 +17340,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + bool is_repacked = ml.ftype >= LLAMA_FTYPE_MOSTLY_Q4_0_R8 && ml.ftype <= LLAMA_FTYPE_MOSTLY_Q8_K_R8; + int n_to_repack = 0, n_to_modify = 0; for (int i = 0; i < ml.n_tensors; ++i) { const struct ggml_tensor * meta = ml.get_tensor_meta(i); + if (params->only_repack) { + auto repacked_type = (ggml_type)iqk_repacked_type(meta); + if (repacked_type != meta->type) { + ++n_to_repack; + } else if (!is_repacked) { + if (iqk_should_modify_tensor(meta)) ++n_to_modify; + } + } + const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -17317,6 +17365,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + if (params->only_repack) { + if (n_to_repack == 0 && n_to_modify == 0) { + printf("=========================== %s: nothing to do for only_repack option\n", __func__); + return; + } + ftype = repacked_ftype(ftype); + } + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks @@ -17457,6 +17513,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s void * new_data; size_t new_size; + if (params->only_repack) { + ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor); + bool modify = !is_repacked && iqk_should_modify_tensor(tensor); + if (modify || repacked_type != tensor->type) { + new_type = repacked_type; + new_size = ggml_nbytes(tensor); + if ((int)work.size() < new_size) work.resize(new_size); + new_data = work.data(); + + auto aux_tensor = *tensor; + aux_tensor.data = work.data(); + std::memcpy(aux_tensor.data, tensor->data, new_size); + + if (repacked_type != tensor->type) { + iqk_repack_tensor(&aux_tensor); + GGML_ASSERT(aux_tensor.type == repacked_type); + } else { + bool did_modify = iqk_modify_tensor(&aux_tensor); + GGML_ASSERT(did_modify); + } + } + else { + new_type = tensor->type; + new_size = ggml_nbytes(tensor); + new_data = tensor->data; + } + LLAMA_LOG_INFO("size = %8.3f MB, type = %s\n", new_size/1024.0/1024.0, ggml_type_name(new_type)); + goto QuantizationDone; + } + if (quantize) { new_type = default_type; if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) { @@ -17562,7 +17648,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ1_S_R4|| new_type == GGML_TYPE_IQ1_M_R4|| (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || - (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) { + (new_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); @@ -17727,6 +17813,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } + +QuantizationDone:; total_size_org += ggml_nbytes(tensor); total_size_new += new_size; @@ -18051,6 +18139,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.pure =*/ false, /*.keep_split =*/ false, /*.ignore_imatrix_rules =*/ false, + /*.only_repack =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.custom_quants =*/ nullptr,