diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 966cfcd3..624d4174 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -195,7 +195,7 @@ class Model:
                 return False
         return name == (key_name + suffix)
 
-    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
             raise ValueError(f"Can not map tensor {name!r}")
@@ -316,6 +316,8 @@ class Model:
                     data_qtype = gguf.GGMLQuantizationType.BF16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
+                    data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")
 
@@ -4076,8 +4078,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8 (E4M3), q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4164,6 +4166,7 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6819979f..b3a071fe 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
     IQ4_KS_R4 = 344
     Q8_KV_R8  = 398
     Q8_K_R8   = 399
+    FP8_E4M3  = 999
 
 
 class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_KS_R4 = 337  # except 1d tensors
     MOSTLY_Q8_KV_R8  = 398  # except 1d tensors
     MOSTLY_Q8_K_R8   = 399  # except 1d tensors
+    MOSTLY_FP8_E4M3  = 999  # except 1d tensors
 
     GUESSED = 1024  # not specified in the model file
@@ -1522,6 +1524,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
     GGMLQuantizationType.Q8_KV_R8  : (  32,  32),
     GGMLQuantizationType.Q8_K_R8   : ( 256, 258),
+    GGMLQuantizationType.FP8_E4M3  : (   1,   1),
 }
 
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index ff589b85..1d0de0f4 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -217,6 +217,117 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 
 
+class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
+    FP8_EXP_BIAS = 7
+    FP8_MAX_EXP = 14
+    FP8_MANT_BITS = 3
+    FP32_EXP_BIAS = 127
+
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        f32 = blocks.view(np.float32)
+        u32 = f32.view(np.uint32)
+        sign = (u32 >> 31).astype(np.uint32)
+        exp = (u32 >> 23) & 0xFF
+        mant = u32 & 0x7FFFFF
+
+        # special cases
+        is_nan = (exp == 0xFF) & (mant != 0)
+        is_inf = (exp == 0xFF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+
+        # normalize FP32 subnormals: shift the fraction's highest set bit up
+        # to the implicit-bit position (bit 23) and adjust the exponent
+        is_subnormal_fp32 = (exp == 0) & (mant != 0)
+        msb = np.floor(np.log2(np.maximum(mant, 1))).astype(np.int32)
+        mant = np.where(is_subnormal_fp32, mant << (23 - msb), mant)
+        exp = np.where(is_subnormal_fp32, msb - 22, exp.astype(np.int32))
+
+        # calculate the unclipped exponent (FP8 exponent field 0 is subnormal)
+        fp8_exp_raw = exp - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
+        underflow = fp8_exp_raw < 1
+        fp8_exp = np.clip(fp8_exp_raw, 0, cls.FP8_MAX_EXP)
+
+        # calculate the subnormal shift, capped so that deep underflow
+        # degenerates to round-to-zero instead of an out-of-range shift
+        shift = np.where(underflow, np.minimum(1 - fp8_exp_raw, 5), 0)
+
+        # align and round the mantissa (round-to-nearest-even)
+        mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
+        total_shift = 20 + shift
+        mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
+        round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
+        sticky_mask = (1 << (total_shift - 1)) - 1
+        sticky = (mant_plus_implicit & sticky_mask) != 0
+        rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)
+
+        # handle mantissa overflow (rounding carried past 1.111)
+        mant_overflow = rounded >= 16  # 1 << (FP8_MANT_BITS + 1)
+        fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
+        rounded = np.where(mant_overflow, 8, rounded)  # reset to 1.000
+        # a subnormal that rounds up to 1.000 becomes the smallest normal
+        fp8_exp = np.where(underflow & (rounded == 8), 1, fp8_exp)
+
+        # handle exponent overflow, whether present before or caused by rounding
+        overflow = (fp8_exp_raw > cls.FP8_MAX_EXP) | (fp8_exp > cls.FP8_MAX_EXP)
+        fp8_exp = np.where(overflow, 0xF, fp8_exp)
+        rounded = np.where(overflow, 0, rounded)
+
+        # assemble the FP8
+        fp8 = (
+            (sign << 7) |
+            ((fp8_exp << 3) & 0x78) |
+            (rounded & 0x7)
+        )
+        fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8)  # NaN
+        fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8)  # Inf
+        fp8 = np.where(is_zero, sign << 7, fp8)          # zero
+
+        return fp8.astype(np.uint8)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        fp8 = blocks.astype(np.uint32)
+        sign = (fp8 >> 7) & 1
+        exp = (fp8 >> 3) & 0xF
+        mant = fp8 & 0x7
+
+        # special cases
+        is_nan = (exp == 0xF) & (mant != 0)
+        is_inf = (exp == 0xF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+        is_subnormal = (exp == 0) & (mant != 0)
+
+        # rebias the exponent; subnormals use the fixed scale 2**-6
+        fp32_exp = np.where(
+            exp > 0,
+            exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
+            (1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS  # -6 + 127 = 121
+        ).astype(np.int32)  # signed, since the ldexp argument below can be negative
+
+        # subnormals have no implicit leading 1
+        mant_scale = np.where(
+            is_subnormal,
+            mant.astype(np.float32) * 0.125,       # 0.mmm
+            1.0 + mant.astype(np.float32) * 0.125  # 1.mmm
+        )
+
+        result = np.where(
+            is_nan,
+            np.nan,
+            np.where(
+                is_inf,
+                np.copysign(np.inf, (-1.0)**sign),
+                np.where(
+                    is_zero,
+                    np.copysign(0.0, (-1.0)**sign),
+                    np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
+                )
+            )
+        )
+        return result.astype(np.float32)
+
+
 class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
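To exercise the new codec end to end, here is a minimal round-trip sanity check. It assumes the patched `gguf-py` from this diff is importable; the sample values and the half-ULP tolerance are illustrative choices, not part of the patch.

```python
import numpy as np

from gguf import GGMLQuantizationType
from gguf.quants import dequantize, quantize

rng = np.random.default_rng(0)
data = rng.standard_normal(4096, dtype=np.float32)

packed = quantize(data, GGMLQuantizationType.FP8_E4M3)
assert packed.dtype == np.uint8 and packed.size == data.size  # 1 byte per value

restored = dequantize(packed, GGMLQuantizationType.FP8_E4M3)

# With 3 mantissa bits, round-to-nearest-even keeps the relative error of
# values in the normal range (|x| >= 2**-6) within half a ULP, i.e. 2**-4.
normal = np.abs(data) >= 2.0**-6
rel_err = np.abs(restored[normal] - data[normal]) / np.abs(data[normal])
print(f"max relative error: {rel_err.max():.5f}")  # expect <= 0.0625
assert rel_err.max() <= 2.0**-4

# exactly representable values must survive the round trip unchanged
exact = np.array([0.0, 1.0, -1.5, 240.0, 2.0**-6, 2.0**-9], dtype=np.float32)
packed_exact = quantize(exact, GGMLQuantizationType.FP8_E4M3)
assert np.array_equal(dequantize(packed_exact, GGMLQuantizationType.FP8_E4M3), exact)
```

Since the format has only 256 code points, it is also cheap to enumerate every byte value, dequantize it, and re-quantize the result to confirm the encode/decode pair is exactly inverse on all finite codes.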