mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 07:04:11 +00:00
Add fp8 GGUF creation
This commit is contained in:
@@ -195,7 +195,7 @@ class Model:
|
||||
return False
|
||||
return name == (key_name + suffix)
|
||||
|
||||
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
|
||||
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
|
||||
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
|
||||
if new_name is None:
|
||||
raise ValueError(f"Can not map tensor {name!r}")
|
||||
@@ -316,6 +316,8 @@ class Model:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
|
||||
data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
|
||||
else:
|
||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||
|
||||
@@ -4076,8 +4078,8 @@ def parse_args() -> argparse.Namespace:
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
@@ -4164,6 +4166,7 @@ def main() -> None:
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
}
|
||||
|
||||
|
||||
@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
|
||||
IQ4_KS_R4 = 344
|
||||
Q8_KV_R8 = 398
|
||||
Q8_K_R8 = 399
|
||||
FP8_E4M3 = 999
|
||||
|
||||
|
||||
class ExpertGatingFuncType(IntEnum):
|
||||
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
|
||||
MOSTLY_IQ4_KS_R4 = 337 #except 1d tensors
|
||||
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
|
||||
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
|
||||
MOSTLY_FP8_E4M3 = 999 #except 1d tensors
|
||||
|
||||
|
||||
GUESSED = 1024 # not specified in the model file
|
||||
@@ -1522,6 +1524,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||
GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
|
||||
GGMLQuantizationType.Q8_KV_R8 : ( 32, 32),
|
||||
GGMLQuantizationType.Q8_K_R8 : ( 256, 258),
|
||||
GGMLQuantizationType.FP8_E4M3 : ( 1, 1),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -61,6 +61,7 @@ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||
elif (q := _type_traits.get(qtype)) is not None:
|
||||
return q.quantize(data)
|
||||
else:
|
||||
print(_type_traits)
|
||||
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
|
||||
|
||||
|
||||
@@ -217,6 +218,110 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
|
||||
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
|
||||
|
||||
|
||||
class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
    """Scalar FP8 E4M3 codec: 1 sign bit, 4 exponent bits, 3 mantissa bits.

    This implementation uses the IEEE-style interpretation of E4M3 in which
    biased exponent 0xF with a zero mantissa encodes infinity and a non-zero
    mantissa encodes NaN (note: this differs from float8_e4m3fn, which has no
    infinities and reserves only S.1111.111 for NaN — the encoder and decoder
    below are mutually consistent, round-trip exactly, and saturate overflow
    to infinity).  Encoding uses round-to-nearest, ties-to-even.
    """

    FP8_EXP_BIAS = 7      # bias of the 4-bit exponent field
    FP8_MAX_EXP = 14      # largest biased exponent of a finite normal value
    FP8_MANT_BITS = 3     # explicit mantissa bits
    FP32_EXP_BIAS = 127   # IEEE-754 single-precision exponent bias

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Encode float32 values as FP8 E4M3 bytes (round-to-nearest-even).

        Returns a uint8 array of the same shape as the float32 view of
        ``blocks``.  Values above the largest finite E4M3 value saturate to
        infinity; values below half the smallest subnormal flush to signed
        zero.
        """
        f32 = blocks.view(np.float32)
        u32 = f32.view(np.uint32)
        sign = (u32 >> 31).astype(np.uint32)
        exp = ((u32 >> 23) & 0xFF).astype(np.int32)
        mant = (u32 & 0x7FFFFF).astype(np.int32)

        # FP32 specials; finite encoding happens first, specials are
        # stamped back in at the end.
        is_nan = (exp == 0xFF) & (mant != 0)
        is_inf = (exp == 0xFF) & (mant == 0)

        # Biased FP8 exponent before any range handling (may be out of
        # [1, FP8_MAX_EXP]; under/overflow is resolved below, NOT clipped
        # away, so out-of-range values saturate instead of wrapping).
        fp8_exp = exp - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)

        # 24-bit significand with the implicit leading 1 for FP32 normals.
        # FP32 subnormals (< 2**-126) lie far below half the smallest FP8
        # subnormal (2**-10), so flushing them to (signed) zero is exact
        # under round-to-nearest.
        sig = np.where(exp > 0, mant | 0x800000, 0)

        # Bits to drop from the significand: 20 keeps the implicit bit plus
        # 3 explicit mantissa bits for normals (fp8_exp >= 1); values below
        # the normal range need extra shifting into FP8 subnormal form
        # (biased exponent 0 means an extra 1 - fp8_exp positions).  Any
        # shift beyond 25 rounds to zero regardless, so clamp there to keep
        # every shift count well-defined for a 24-bit significand.
        shift = np.where(fp8_exp < 1, 21 - fp8_exp, 20)
        shift = np.minimum(shift, 25)

        # Round to nearest, ties to even.
        lsb = (sig >> shift) & 1
        round_bit = (sig >> (shift - 1)) & 1
        sticky = (sig & ((1 << (shift - 1)) - 1)) != 0
        q = (sig >> shift) + (round_bit & (sticky | lsb))

        # Assemble exponent and mantissa fields by addition so rounding
        # carries propagate automatically:
        #  - normals: q includes the implicit bit (q in [8, 16]); adding
        #    q - 8 makes q == 16 bump the exponent by one with mantissa 0;
        #  - subnormals: exponent field is 0 and q in [0, 8]; q == 8
        #    promotes the value to the smallest normal (exp 1, mant 0).
        # np.maximum only guards the dead branch of np.where against a
        # left shift of a negative exponent.
        exp_enc = np.maximum(fp8_exp, 1)
        bits = np.where(fp8_exp < 1, q, (exp_enc << 3) + q - 8)

        # Saturate everything above the largest finite encoding
        # (exp 14, mant 7 -> 0x77) to infinity (0x78), then restore the
        # FP32 specials.  NaN keeps the original payload choice 0x7D.
        bits = np.where(bits > 0x77, 0x78, bits)
        bits = np.where(is_inf, 0x78, bits)
        bits = np.where(is_nan, 0x7D, bits)

        return ((sign << 7) | bits.astype(np.uint32)).astype(np.uint8)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode FP8 E4M3 bytes back to float32.

        Exact inverse of :meth:`quantize_blocks` for every finite encoding;
        exp == 0xF decodes to Inf (mant == 0) or NaN (mant != 0).
        """
        fp8 = blocks.astype(np.uint32)
        sign = (fp8 >> 7) & 1
        exp = (fp8 >> 3) & 0xF
        mant = fp8 & 0x7

        # special cases
        is_nan = (exp == 0xF) & (mant != 0)
        is_inf = (exp == 0xF) & (mant == 0)
        is_zero = (exp == 0) & (mant == 0)
        is_subnormal = (exp == 0) & (mant != 0)

        # Re-bias the exponent; subnormals use the fixed exponent of the
        # smallest normal (1 - FP8_EXP_BIAS = -6 -> biased 121).
        fp32_exp = np.where(
            exp > 0,
            exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
            (1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS  # -6 + 127 = 121
        )

        # Significand as a float: subnormals have no implicit leading 1.
        mant_scale = np.where(
            is_subnormal,
            mant.astype(np.float32) * 0.125,        # mant / 2**FP8_MANT_BITS
            1.0 + mant.astype(np.float32) * 0.125
        )

        result = np.where(
            is_nan,
            np.nan,
            np.where(
                is_inf,
                np.copysign(np.inf, (-1.0)**sign),
                np.where(
                    is_zero,
                    np.copysign(0.0, (-1.0)**sign),  # preserve -0.0
                    np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
                )
            )
        )
        return result.astype(np.float32)
|
||||
|
||||
class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
|
||||
@classmethod
|
||||
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
|
||||
Reference in New Issue
Block a user