mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 11:21:56 +00:00
convert_hf_to_gguf.py : conversion from hf weights to Q6_0 (#483)
* Direct conversion from fp16 to Q6_0 * forgotten comma * More precise info
This commit is contained in:
@@ -313,6 +313,7 @@ class Model:
|
|||||||
gguf.MODEL_TENSOR.OUTPUT,
|
gguf.MODEL_TENSOR.OUTPUT,
|
||||||
gguf.MODEL_TENSOR.ATTN_V,
|
gguf.MODEL_TENSOR.ATTN_V,
|
||||||
gguf.MODEL_TENSOR.ATTN_K,
|
gguf.MODEL_TENSOR.ATTN_K,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_QKV,
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
if self.ftype in (
|
if self.ftype in (
|
||||||
@@ -323,9 +324,8 @@ class Model:
|
|||||||
elif self.ftype in (
|
elif self.ftype in (
|
||||||
gguf.LlamaFileType.MOSTLY_Q5_0,
|
gguf.LlamaFileType.MOSTLY_Q5_0,
|
||||||
gguf.LlamaFileType.MOSTLY_Q5_1,
|
gguf.LlamaFileType.MOSTLY_Q5_1,
|
||||||
# gguf.LlamaFileType.MOSTLY_Q6_0,
|
|
||||||
):
|
):
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
data_qtype = gguf.GGMLQuantizationType.Q6_0
|
||||||
|
|
||||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||||
if isinstance(data_qtype, bool):
|
if isinstance(data_qtype, bool):
|
||||||
@@ -343,8 +343,8 @@ class Model:
|
|||||||
data_qtype = gguf.GGMLQuantizationType.Q5_0
|
data_qtype = gguf.GGMLQuantizationType.Q5_0
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q5_1
|
data_qtype = gguf.GGMLQuantizationType.Q5_1
|
||||||
# elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:
|
||||||
# data_qtype = gguf.GGMLQuantizationType.Q6_0
|
data_qtype = gguf.GGMLQuantizationType.Q6_0
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
else:
|
else:
|
||||||
@@ -419,12 +419,12 @@ class Model:
|
|||||||
logger.info("Set model quantization version")
|
logger.info("Set model quantization version")
|
||||||
self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
||||||
|
|
||||||
logger.info("****************************************************************************************")
|
logger.info("***********************************************************************************************")
|
||||||
logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
|
logger.info("** Converting to `q4_0`,`q4_1`,`q5_0`, `q5_1` or `q6_0` is not equiv to using `llama-quantize`!")
|
||||||
logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
|
logger.info("** Ftype `q4_0`,`q4_1` are here converting embeddings, output, attn_k and attn_v/qkv in q5_0.")
|
||||||
logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
|
logger.info("** Ftype `q5_0`,`q5_1` are here converting embeddings, output, attn_k and attn_v/qkv in q6_0.")
|
||||||
logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
|
logger.info("** This, in order to create a small but viable conv. to then for example make an iMatrix file.")
|
||||||
logger.info("****************************************************************************************")
|
logger.info("***********************************************************************************************")
|
||||||
|
|
||||||
def write(self):
|
def write(self):
|
||||||
self.prepare_tensors()
|
self.prepare_tensors()
|
||||||
@@ -4113,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
|
|||||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
|
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "auto"], default="f16",
|
||||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bigendian", action="store_true",
|
"--bigendian", action="store_true",
|
||||||
@@ -4204,7 +4204,7 @@ def main() -> None:
|
|||||||
"q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
|
"q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
|
||||||
"q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
|
"q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
|
||||||
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
|
"q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
|
||||||
# "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
|
"q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
|
||||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.LlamaFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -377,6 +377,32 @@ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
|
|||||||
return (d * qs) + m
|
return (d * qs) + m
|
||||||
|
|
||||||
|
|
||||||
|
class Q6_0(__Quant, qtype=GGMLQuantizationType.Q6_0):
    """Q6_0 quantization: 6-bit values packed as one fp16 scale per block,
    a 2-bit high-bit plane (``qh``) and 4-bit low nibbles (``qs``)."""

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Quantize float blocks of shape ``(n_blocks, block_size)`` into the
        packed Q6_0 byte layout ``[d (2 bytes) | qh (bs//4) | qs (bs//2)]``.

        Mirrors the reference scalar quantizer: scale from the max-magnitude
        element, round-to-nearest into the unsigned 6-bit range [0, 63].
        """
        n_blocks = blocks.shape[0]

        # Scale is derived from the element with the largest magnitude;
        # dividing by -32 pins that element to one end of the 6-bit range.
        imax = abs(blocks).argmax(axis=-1, keepdims=True)
        block_max = np.take_along_axis(blocks, imax, axis=-1)

        d = block_max / -32
        # All-zero blocks give d == 0; map their inverse scale to 0 so the
        # quantized values come out 0 instead of inf/nan.
        with np.errstate(divide="ignore"):
            inv_d = np.where(d == 0, 0, 1 / d)
        # Adapted from Q5_0: offset by 32.5 then truncate (round-to-nearest
        # in fp64), clamped to the 6-bit range.
        q = np.trunc((np.float64(blocks) * np.float64(inv_d)) + np.float64(32.5), dtype=np.float32).astype(np.uint8).clip(0, 63)

        # Low nibbles: first half of the block in the low 4 bits, second half
        # in the high 4 bits (the uint8 left-shift wraps away q's top 2 bits).
        qs = q.reshape((n_blocks, 2, cls.block_size // 2))
        qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))

        # High 2 bits of each 6-bit value, packed four per byte of qh.
        qh = np.zeros((n_blocks, cls.block_size // 4), dtype=np.uint8)
        for j in range(cls.block_size // 2):
            h = ((q[:, j] >> 4) | ((q[:, j + cls.block_size // 2] >> 4) << 2)).astype(np.uint8)
            qh[:, j % (cls.block_size // 4)] |= (h << 4 * (j // (cls.block_size // 4)))

        # fp16 scale serialized as its two raw bytes.
        d = d.astype(np.float16).view(np.uint8)

        return np.concatenate([d, qh, qs], axis=-1)
|
||||||
|
|
||||||
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
|
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
|
||||||
@classmethod
|
@classmethod
|
||||||
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ class GGMLQuants:
|
|||||||
self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
|
self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
|
||||||
|
|
||||||
for t in (
|
for t in (
|
||||||
"q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
|
"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q6_0",
|
||||||
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
|
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
|
||||||
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
|
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
|
||||||
"iq4_nl", "iq4_xs",
|
"iq4_nl", "iq4_xs",
|
||||||
|
|||||||
Reference in New Issue
Block a user