Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-02-07 23:10:10 +00:00)
Additional guards for interleaved quants (#299)

* Make sure no interleaved quants are being used for token embeddings also with `--pure` and/or `--custom-q`.

* Simplify

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
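For readers skimming the diff below: the change maps every row-interleaved quant type to its base (non-interleaved) type plus the number of rows packed together, and falls back to the base type whenever interleaving is not possible, in particular for `token_embd.weight` and for tensors whose row count is not a multiple of the packing factor. Here is a minimal, self-contained sketch of that behaviour; the enum values, names and numbers are illustrative stand-ins, not code from `src/llama.cpp` (the real `interleaved_properties` helper and the guards follow in the diff).

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <utility>

// Illustrative stand-ins for a handful of ggml types; the real enum lives in ggml.h.
enum fake_type { T_Q4_0, T_Q4_0_R8, T_Q4_K, T_Q4_K_R4, T_BF16, T_BF16_R16 };

// Map a row-interleaved type to { base type, rows packed together }.
// Types that are not interleaved map to themselves with a factor of 1.
static std::pair<fake_type, int> interleave_info(fake_type t) {
    static const std::unordered_map<int, std::pair<fake_type, int>> k_map = {
        { T_Q4_0_R8,  { T_Q4_0,  8 } },
        { T_Q4_K_R4,  { T_Q4_K,  4 } },
        { T_BF16_R16, { T_BF16, 16 } },
    };
    auto it = k_map.find(t);
    return it != k_map.end() ? it->second : std::pair<fake_type, int>{ t, 1 };
}

// Decide which type is actually used for a tensor with ne1 rows.
static fake_type pick_type(fake_type wanted, int64_t ne1, bool is_token_embd) {
    auto [base, rows] = interleave_info(wanted);
    if (is_token_embd)   return base;   // token embeddings are never row-interleaved
    if (ne1 % rows != 0) return base;   // row count not divisible by the packing factor
    return wanted;                      // interleaving is fine for this tensor
}

int main() {
    // token_embd.weight requested as Q4_K_R4 (e.g. via --pure) -> falls back to Q4_K
    std::printf("%d\n", pick_type(T_Q4_K_R4, 128256, true)  == T_Q4_K);
    // an ordinary 4096-row weight keeps the interleaved type
    std::printf("%d\n", pick_type(T_Q4_K_R4, 4096,  false) == T_Q4_K_R4);
    // a tensor whose row count is not a multiple of 16 cannot use BF16_R16
    std::printf("%d\n", pick_type(T_BF16_R16, 100,  false) == T_BF16);
}
```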
src/llama.cpp (248 changed lines)
@@ -16762,7 +16762,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
+static ggml_type change_type_if_necessary(ggml_type new_type, int nx, int ny) {
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
@@ -16834,6 +16834,43 @@ static ggml_type change_type_if_necessar(ggml_type new_type, int nx, int ny) {
     return new_type;
 }
 
+static std::pair<ggml_type, int> interleaved_properties(ggml_type type) {
+    static std::unordered_map<ggml_type, std::pair<ggml_type, int>> k_map = {
+        { GGML_TYPE_Q4_0_4_4,   { GGML_TYPE_Q4_0,    4} },
+        { GGML_TYPE_Q4_0_4_8,   { GGML_TYPE_Q4_0,    4} },
+        { GGML_TYPE_Q4_0_8_8,   { GGML_TYPE_Q4_0,    8} },
+        { GGML_TYPE_Q4_0_R8,    { GGML_TYPE_Q4_0,    8} },
+        { GGML_TYPE_Q5_0_R4,    { GGML_TYPE_Q5_0,    4} },
+        { GGML_TYPE_Q6_0_R4,    { GGML_TYPE_Q6_0,    4} },
+        { GGML_TYPE_Q8_0_R8,    { GGML_TYPE_Q8_0,    8} },
+        { GGML_TYPE_Q2_K_R4,    { GGML_TYPE_Q2_K,    4} },
+        { GGML_TYPE_Q3_K_R4,    { GGML_TYPE_Q3_K,    4} },
+        { GGML_TYPE_Q4_K_R4,    { GGML_TYPE_Q4_K,    4} },
+        { GGML_TYPE_Q5_K_R4,    { GGML_TYPE_Q5_K,    4} },
+        { GGML_TYPE_Q6_K_R4,    { GGML_TYPE_Q6_K,    4} },
+        { GGML_TYPE_IQ2_XXS_R4, { GGML_TYPE_IQ2_XXS, 4} },
+        { GGML_TYPE_IQ2_XS_R4,  { GGML_TYPE_IQ2_XS,  4} },
+        { GGML_TYPE_IQ2_S_R4,   { GGML_TYPE_IQ2_S,   4} },
+        { GGML_TYPE_IQ3_XXS_R4, { GGML_TYPE_IQ3_XXS, 4} },
+        { GGML_TYPE_IQ3_S_R4,   { GGML_TYPE_IQ3_S,   4} },
+        { GGML_TYPE_IQ4_XS_R8,  { GGML_TYPE_IQ4_XS,  8} },
+        { GGML_TYPE_IQ4_NL_R4,  { GGML_TYPE_IQ4_NL,  4} },
+        { GGML_TYPE_IQ1_S_R4,   { GGML_TYPE_IQ1_S,   4} },
+        { GGML_TYPE_IQ1_M_R4,   { GGML_TYPE_IQ1_M,   4} },
+        { GGML_TYPE_IQ2_BN_R4,  { GGML_TYPE_IQ2_BN,  4} },
+        { GGML_TYPE_IQ2_K_R4,   { GGML_TYPE_IQ2_K,   4} },
+        { GGML_TYPE_IQ3_K_R4,   { GGML_TYPE_IQ3_K,   4} },
+        { GGML_TYPE_IQ4_K_R4,   { GGML_TYPE_IQ4_K,   4} },
+        { GGML_TYPE_IQ4_KS_R4,  { GGML_TYPE_IQ4_KS,  4} },
+        { GGML_TYPE_IQ5_K_R4,   { GGML_TYPE_IQ5_K,   4} },
+        { GGML_TYPE_Q8_KV_R8,   { GGML_TYPE_Q8_KV,   8} },
+        { GGML_TYPE_Q8_K_R8,    { GGML_TYPE_Q8_K,    8} },
+        { GGML_TYPE_BF16_R16,   { GGML_TYPE_BF16,   16} },
+    };
+    if (auto it = k_map.find(type); it != k_map.end()) return it->second;
+    return {type, 1};
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
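For orientation, these are the results the new helper produces for a few representative inputs; they follow directly from the table above. Types that are not in the map come back unchanged with a row factor of 1. The snippet is illustrative and assumes it is compiled inside `src/llama.cpp`, where `interleaved_properties` is visible.

```cpp
// Example queries; the expected values in the comments follow from k_map above.
auto [t1, r1] = interleaved_properties(GGML_TYPE_Q4_K_R4);   // t1 == GGML_TYPE_Q4_K,  r1 == 4
auto [t2, r2] = interleaved_properties(GGML_TYPE_BF16_R16);  // t2 == GGML_TYPE_BF16,  r2 == 16
auto [t3, r3] = interleaved_properties(GGML_TYPE_Q4_0);      // not interleaved: t3 == GGML_TYPE_Q4_0, r3 == 1
```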
@@ -16939,70 +16976,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN_R4) {
                 new_type = GGML_TYPE_IQ4_NL;
             }
-            else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-                     new_type == GGML_TYPE_Q4_0_8_8) {
-                new_type = GGML_TYPE_Q4_0;
-            }
-            else if (new_type == GGML_TYPE_IQ4_NL_R4) {
-                new_type = GGML_TYPE_IQ4_NL;
-            }
-            else if (new_type == GGML_TYPE_IQ4_XS_R8) {
-                new_type = GGML_TYPE_IQ4_XS;
-            }
-            else if (new_type == GGML_TYPE_Q2_K_R4) {
-                new_type = GGML_TYPE_Q2_K;
-            }
-            else if (new_type == GGML_TYPE_Q3_K_R4) {
-                new_type = GGML_TYPE_Q3_K;
-            }
-            else if (new_type == GGML_TYPE_Q4_K_R4) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (new_type == GGML_TYPE_Q5_K_R4) {
-                new_type = GGML_TYPE_Q5_K;
-            }
-            else if (new_type == GGML_TYPE_Q6_K_R4) {
-                new_type = GGML_TYPE_Q6_K;
-            }
-            else if (new_type == GGML_TYPE_Q8_K_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_Q8_KV_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_IQ2_K_R4) {
-                new_type = GGML_TYPE_IQ2_K;
-            }
-            else if (new_type == GGML_TYPE_IQ3_K_R4) {
-                new_type = GGML_TYPE_IQ3_K;
-            }
-            else if (new_type == GGML_TYPE_IQ3_S_R4) {
-                new_type = GGML_TYPE_IQ3_S;
-            }
-            else if (new_type == GGML_TYPE_IQ4_K_R4) {
-                new_type = GGML_TYPE_IQ4_K;
-            }
-            else if (new_type == GGML_TYPE_IQ5_K_R4) {
-                new_type = GGML_TYPE_IQ5_K;
-            }
-            else if (new_type == GGML_TYPE_IQ4_KS_R4) {
-                new_type = GGML_TYPE_IQ4_KS;
-            }
-            else if (new_type == GGML_TYPE_Q4_0_R8) {
-                new_type = GGML_TYPE_Q4_0;
-            }
-            else if (new_type == GGML_TYPE_Q5_0_R4) {
-                new_type = GGML_TYPE_Q5_0;
-            }
-            else if (new_type == GGML_TYPE_Q6_0_R4) {
-                new_type = GGML_TYPE_Q6_0;
-            }
-            else if (new_type == GGML_TYPE_Q8_0_R8) {
-                new_type = GGML_TYPE_Q8_0;
-            }
-            else if (new_type == GGML_TYPE_BF16_R16) {
-                new_type = GGML_TYPE_BF16;
-            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) {
         if (name.find("attn_v.weight") != std::string::npos) {
@@ -17332,12 +17305,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         LLAMA_LOG_INFO("Using custom type %s for tensor %s\n", ggml_type_name(new_type), name.c_str());
     }
 
-    auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+    auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
     if (working_type != new_type) {
         ++qs.n_fallback;
         new_type = working_type;
     }
 
+    if (name == "token_embd.weight") {
+        auto working_type = interleaved_properties(new_type).first;
+        if (working_type != new_type) {
+            printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+            printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+            new_type = working_type;
+        }
+    }
+
     return new_type;
 }
 
@@ -17834,14 +17816,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         if (quantize) {
 
             new_type = default_type;
-            if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
-                new_type = GGML_TYPE_BF16;
-            }
+
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (params->pure) {
-                auto working_type = change_type_if_necessar(new_type, tensor->ne[0], tensor->ne[1]);
+                auto working_type = change_type_if_necessary(new_type, tensor->ne[0], tensor->ne[1]);
                 if (working_type != new_type) {
                     ++qs.n_fallback;
                     new_type = working_type;
@@ -17881,6 +17861,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_type = params->ffn_up_type;
             }
 
+            if (strcmp(tensor->name, "token_embd.weight") == 0) {
+                // token embeddings cannot be quantized with row-interleaved quants
+                auto working_type = interleaved_properties(new_type).first;
+                if (working_type != new_type) {
+                    printf("\n============ Token embeddings cannot be quantized with row-interleaved quants\n");
+                    printf("---> Changed %s to %s\n", ggml_type_name(new_type), ggml_type_name(working_type));
+                    new_type = working_type;
+                }
+            }
+
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
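Together with the guard added to `llama_tensor_get_type` earlier in this diff, this covers every way a type can be requested for the token embeddings: `--pure`, `--custom-q`, and the per-tensor overrides handled just above. A hypothetical sanity check, not part of the commit, stating the resulting invariant:

```cpp
// Hypothetical sanity check (not in the commit): whatever type was requested for
// token_embd.weight, the guarded result is never a row-interleaved type.
static void check_token_embd_guard() {
    const ggml_type requested[] = {
        GGML_TYPE_Q4_K_R4, GGML_TYPE_Q8_0_R8, GGML_TYPE_BF16_R16,  // interleaved: must fall back
        GGML_TYPE_Q4_K,    GGML_TYPE_Q8_0,    GGML_TYPE_BF16,      // plain types: kept as-is
    };
    for (ggml_type type : requested) {
        const ggml_type used = interleaved_properties(type).first;  // what the guard picks
        GGML_ASSERT(interleaved_properties(used).second == 1);      // i.e. not interleaved
    }
}
```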
@@ -17965,119 +17955,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             int chunk_size_multiplier = 1;
-            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_NL_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_XS_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_IQ4_XS;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q4_0_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q4_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q5_0_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q6_0_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q8_0_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q2_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q2_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q3_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q3_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q4_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q5_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q6_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_Q8_K_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_Q8_KV_R8) {
-                if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0;
-                else chunk_size_multiplier = 8;
-            }
-            else if (new_type == GGML_TYPE_IQ2_BN_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ5_K_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ5_K;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ4_KS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_KS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_XXS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XXS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_XS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_XS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ2_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_XXS_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_XXS;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ3_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ1_S_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_IQ1_M_R4) {
-                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M;
-                else chunk_size_multiplier = 4;
-            }
-            else if (new_type == GGML_TYPE_BF16_R16) {
-                if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
-                else chunk_size_multiplier = 16;
-            }
+            auto [working_type, num_rows] = interleaved_properties(new_type);
+            if (tensor->ne[1] % num_rows != 0) {
+                new_type = working_type;
+            } else {
+                chunk_size_multiplier = num_rows;
+            }
 
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
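One practical effect of the refactor shows in this last hunk: roughly a hundred lines of per-type branching collapse into a single table lookup, so the row-count fallback and the chunk-size multiplier are now derived from the same data as the token-embedding guard. A future interleaved type would then only need one new entry in `k_map`. For example, a hypothetical `GGML_TYPE_Q5_K_R8` (not a type that exists in this commit) would be registered as:

```cpp
// Hypothetical: GGML_TYPE_Q5_K_R8 does not exist in this commit; it is shown only
// to illustrate that a new interleaved type now needs a single entry in k_map.
{ GGML_TYPE_Q5_K_R8,    { GGML_TYPE_Q5_K,    8} },
```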