Repack a model with the quantize tool

This commit is contained in:
Iwan Kawrakow
2025-03-20 09:11:33 +02:00
parent 127c6ee649
commit 20df7b89c8
6 changed files with 149 additions and 19 deletions

View File

@@ -17140,11 +17140,48 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
return new_size;
}
static llama_ftype repacked_ftype(llama_ftype ftype) {
static std::unordered_map<llama_ftype, llama_ftype> k_map = {
{ LLAMA_FTYPE_MOSTLY_Q4_0, LLAMA_FTYPE_MOSTLY_Q4_0_R8 },
{ LLAMA_FTYPE_MOSTLY_Q8_0, LLAMA_FTYPE_MOSTLY_Q8_0_R8 },
{ LLAMA_FTYPE_MOSTLY_Q5_0, LLAMA_FTYPE_MOSTLY_Q5_0_R4 },
{ LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_S, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_L, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q4_K_S, LLAMA_FTYPE_MOSTLY_Q4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q4_K_M, LLAMA_FTYPE_MOSTLY_Q4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q5_K_S, LLAMA_FTYPE_MOSTLY_Q5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q5_K_M, LLAMA_FTYPE_MOSTLY_Q5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q6_K, LLAMA_FTYPE_MOSTLY_Q6_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_XXS, LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_XXS, LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ1_S, LLAMA_FTYPE_MOSTLY_IQ1_S_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_NL, LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_S, LLAMA_FTYPE_MOSTLY_IQ3_S_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_M, LLAMA_FTYPE_MOSTLY_IQ2_M_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_XS, LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 },
{ LLAMA_FTYPE_MOSTLY_IQ1_M, LLAMA_FTYPE_MOSTLY_IQ1_M_R4 },
{ LLAMA_FTYPE_MOSTLY_Q6_0, LLAMA_FTYPE_MOSTLY_Q6_0_R4 },
{ LLAMA_FTYPE_MOSTLY_BF16, LLAMA_FTYPE_MOSTLY_BF16_R16 },
{ LLAMA_FTYPE_MOSTLY_IQ2_BN, LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_K, LLAMA_FTYPE_MOSTLY_IQ2_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_K, LLAMA_FTYPE_MOSTLY_IQ3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_K, LLAMA_FTYPE_MOSTLY_IQ4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ5_K, LLAMA_FTYPE_MOSTLY_IQ5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_KS, LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 },
{ LLAMA_FTYPE_MOSTLY_Q8_KV, LLAMA_FTYPE_MOSTLY_Q8_KV_R8 },
};
if (auto it = k_map.find(ftype); it != k_map.end()) return it->second;
return ftype;
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
switch (params->ftype) {
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
@@ -17256,7 +17293,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ftype = model.ftype;
}
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
if (!params->only_repack && params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
@@ -17303,9 +17340,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
bool is_repacked = ml.ftype >= LLAMA_FTYPE_MOSTLY_Q4_0_R8 && ml.ftype <= LLAMA_FTYPE_MOSTLY_Q8_K_R8;
int n_to_repack = 0, n_to_modify = 0;
for (int i = 0; i < ml.n_tensors; ++i) {
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
if (params->only_repack) {
auto repacked_type = (ggml_type)iqk_repacked_type(meta);
if (repacked_type != meta->type) {
++n_to_repack;
} else if (!is_repacked) {
if (iqk_should_modify_tensor(meta)) ++n_to_modify;
}
}
const std::string name = ggml_get_name(meta);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -17317,6 +17365,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
if (params->only_repack) {
if (n_to_repack == 0 && n_to_modify == 0) {
printf("=========================== %s: nothing to do for only_repack option\n", __func__);
return;
}
ftype = repacked_ftype(ftype);
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks
@@ -17457,6 +17513,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
void * new_data;
size_t new_size;
if (params->only_repack) {
ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor);
bool modify = !is_repacked && iqk_should_modify_tensor(tensor);
if (modify || repacked_type != tensor->type) {
new_type = repacked_type;
new_size = ggml_nbytes(tensor);
if ((int)work.size() < new_size) work.resize(new_size);
new_data = work.data();
auto aux_tensor = *tensor;
aux_tensor.data = work.data();
std::memcpy(aux_tensor.data, tensor->data, new_size);
if (repacked_type != tensor->type) {
iqk_repack_tensor(&aux_tensor);
GGML_ASSERT(aux_tensor.type == repacked_type);
} else {
bool did_modify = iqk_modify_tensor(&aux_tensor);
GGML_ASSERT(did_modify);
}
}
else {
new_type = tensor->type;
new_size = ggml_nbytes(tensor);
new_data = tensor->data;
}
LLAMA_LOG_INFO("size = %8.3f MB, type = %s\n", new_size/1024.0/1024.0, ggml_type_name(new_type));
goto QuantizationDone;
}
if (quantize) {
new_type = default_type;
if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
@@ -17562,7 +17648,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type == GGML_TYPE_IQ1_S_R4||
new_type == GGML_TYPE_IQ1_M_R4||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
(new_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -17727,6 +17813,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
QuantizationDone:;
total_size_org += ggml_nbytes(tensor);
total_size_new += new_size;
@@ -18051,6 +18139,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.pure =*/ false,
/*.keep_split =*/ false,
/*.ignore_imatrix_rules =*/ false,
/*.only_repack =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.custom_quants =*/ nullptr,