Update GGML_QUANT_SIZES

This commit is contained in:
Saood Karim
2025-04-23 23:06:26 -05:00
parent a7f026eebb
commit adb6b6fb3f

View File

@@ -1349,39 +1349,89 @@ class GGUFValueType(IntEnum):
# Items here are (block size, type size)
QK_K = 256
# Values generated programmatically from the ggml block layouts:
# block size = number of elements per quantization block,
# type size  = bytes occupied by one block (scales + packed quants).
# NOTE: the previous revision of this table duplicated every pre-existing key
# with hand-written byte formulas; those dead duplicate entries have been
# removed — the values below are what the dict evaluated to anyway (later
# duplicate keys win in a Python dict literal). The only real value change
# versus the old formulas is Q8_1: 36 bytes (fp16 d + fp16 s + 32 quants),
# matching current ggml, instead of the old 4 + 4 + 32 = 40.
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32        : (  1,   4),
    GGMLQuantizationType.F16        : (  1,   2),
    GGMLQuantizationType.Q4_0       : ( 32,  18),
    GGMLQuantizationType.Q4_1       : ( 32,  20),
    GGMLQuantizationType.Q5_0       : ( 32,  22),
    GGMLQuantizationType.Q5_1       : ( 32,  24),
    GGMLQuantizationType.Q8_0       : ( 32,  34),
    GGMLQuantizationType.Q8_1       : ( 32,  36),
    GGMLQuantizationType.Q2_K       : (256,  84),
    GGMLQuantizationType.Q3_K       : (256, 110),
    GGMLQuantizationType.Q4_K       : (256, 144),
    GGMLQuantizationType.Q5_K       : (256, 176),
    GGMLQuantizationType.Q6_K       : (256, 210),
    GGMLQuantizationType.Q8_K       : (256, 292),
    GGMLQuantizationType.IQ2_XXS   : (256,  66),
    GGMLQuantizationType.IQ2_XS    : (256,  74),
    GGMLQuantizationType.IQ3_XXS   : (256,  98),
    GGMLQuantizationType.IQ1_S     : (256,  50),
    GGMLQuantizationType.IQ4_NL    : ( 32,  18),
    GGMLQuantizationType.IQ3_S     : (256, 110),
    GGMLQuantizationType.IQ2_S     : (256,  82),
    GGMLQuantizationType.IQ4_XS    : (256, 136),
    GGMLQuantizationType.I8        : (  1,   1),
    GGMLQuantizationType.I16       : (  1,   2),
    GGMLQuantizationType.I32       : (  1,   4),
    GGMLQuantizationType.I64       : (  1,   8),
    GGMLQuantizationType.F64       : (  1,   8),
    GGMLQuantizationType.IQ1_M     : (256,  56),
    GGMLQuantizationType.BF16      : (  1,   2),
    GGMLQuantizationType.Q4_0_4_4  : ( 32,  18),
    GGMLQuantizationType.Q4_0_4_8  : ( 32,  18),
    GGMLQuantizationType.Q4_0_8_8  : ( 32,  18),
    GGMLQuantizationType.I2_S      : (  1,   1),
    GGMLQuantizationType.Q8_0_X4   : ( 32,  34),
    GGMLQuantizationType.Q8_1_X4   : ( 32,  36),
    GGMLQuantizationType.Q8_2_X4   : ( 32,  36),
    GGMLQuantizationType.Q6_0      : ( 32,  26),
    GGMLQuantizationType.IQ1_BN    : ( 64,  13),
    GGMLQuantizationType.IQ2_BN    : ( 64,  16),
    GGMLQuantizationType.Q8_K64    : ( 64,  68),
    GGMLQuantizationType.IQ2_K     : (256,  76),
    GGMLQuantizationType.IQ3_K     : (256, 110),
    GGMLQuantizationType.IQ4_K     : (256, 144),
    GGMLQuantizationType.IQ5_K     : (256, 176),
    GGMLQuantizationType.IQ6_K     : (256, 212),
    GGMLQuantizationType.IQ4_KS    : (256, 136),
    GGMLQuantizationType.IQ2_KS    : (256,  70),
    GGMLQuantizationType.IQ4_KSS   : (256, 128),
    GGMLQuantizationType.Q8_K16    : ( 64,  64),
    GGMLQuantizationType.Q8_K32    : (256, 292),
    GGMLQuantizationType.Q8_KR8    : (256, 292),
    GGMLQuantizationType.Q8_K128   : (128, 140),
    GGMLQuantizationType.Q8_KV     : ( 32,  32),
    GGMLQuantizationType.Q4_0_R8   : ( 32,  18),
    GGMLQuantizationType.Q5_0_R4   : ( 32,  22),
    GGMLQuantizationType.Q8_0_R8   : ( 32,  34),
    GGMLQuantizationType.Q2_K_R4   : (256,  84),
    GGMLQuantizationType.Q3_K_R4   : (256, 110),
    GGMLQuantizationType.Q4_K_R4   : (256, 144),
    GGMLQuantizationType.Q5_K_R4   : (256, 176),
    GGMLQuantizationType.Q6_K_R4   : (256, 210),
    GGMLQuantizationType.IQ2_XXS_R4: (256,  66),
    GGMLQuantizationType.IQ2_XS_R4 : (256,  74),
    GGMLQuantizationType.IQ3_XXS_R4: (256,  98),
    GGMLQuantizationType.IQ1_S_R4  : ( 32,   6),
    GGMLQuantizationType.IQ4_NL_R4 : ( 32,  18),
    GGMLQuantizationType.IQ3_S_R4  : (256, 110),
    GGMLQuantizationType.IQ2_S_R4  : (256,  82),
    GGMLQuantizationType.IQ4_XS_R8 : (256, 136),
    GGMLQuantizationType.IQ1_M_R4  : ( 32,   7),
    GGMLQuantizationType.BF16_R16  : (  1,   2),
    GGMLQuantizationType.Q6_0_R4   : ( 32,  26),
    GGMLQuantizationType.IQ2_BN_R4 : ( 64,  16),
    GGMLQuantizationType.IQ2_K_R4  : (256,  76),
    GGMLQuantizationType.IQ3_K_R4  : (256, 110),
    GGMLQuantizationType.IQ4_K_R4  : (256, 144),
    GGMLQuantizationType.IQ5_K_R4  : (256, 176),
    GGMLQuantizationType.IQ4_KS_R4 : (256, 136),
    GGMLQuantizationType.Q8_KV_R8  : ( 32,  32),
    GGMLQuantizationType.Q8_K_R8   : (256, 258),
}