Add fp8 GGUF creation

Saood Karim
2025-05-24 03:48:06 -05:00
parent 9fb82af3a8
commit 16597a3ee2
3 changed files with 114 additions and 3 deletions


@@ -195,7 +195,7 @@ class Model:
                 return False
         return name == (key_name + suffix)
 
-    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
             raise ValueError(f"Can not map tensor {name!r}")
@@ -316,6 +316,8 @@ class Model:
                     data_qtype = gguf.GGMLQuantizationType.BF16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
+                    data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")
@@ -4076,8 +4078,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4164,6 +4166,7 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
         "auto": gguf.LlamaFileType.GUESSED,
     }
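
Taken together, the converter changes above thread the new "fp8" output type from the command line down to the per-tensor writer: --outtype fp8 selects LlamaFileType.MOSTLY_FP8_E4M3, and that file type in turn makes each eligible tensor be quantized as GGMLQuantizationType.FP8_E4M3. A minimal sketch of that mapping, assuming the gguf-py package from this tree (with this patch applied) is importable; the converter script's own file name is not shown in this diff:

import gguf

# Mirrors the ftype_map entry and the data_qtype branch added above.
outtype = "fp8"  # hypothetical stand-in for the parsed --outtype value
ftype = {"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "fp8":  gguf.LlamaFileType.MOSTLY_FP8_E4M3}[outtype]

if ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
    data_qtype = gguf.GGMLQuantizationType.FP8_E4M3  # one e4m3 byte per element

print(data_qtype.name)  # "FP8_E4M3"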


@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
     IQ4_KS_R4  = 344
     Q8_KV_R8   = 398
     Q8_K_R8    = 399
+    FP8_E4M3   = 999
 
 
 class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
MOSTLY_IQ4_KS_R4 = 337 #except 1d tensors
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
MOSTLY_FP8_E4M3 = 999 #except 1d tensors
GUESSED = 1024 # not specified in the model file
@@ -1522,6 +1524,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
     GGMLQuantizationType.Q8_KV_R8  : (  32,  32),
     GGMLQuantizationType.Q8_K_R8   : ( 256, 258),
+    GGMLQuantizationType.FP8_E4M3  : (   1,   1),
 }
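
The new (1, 1) entry in GGML_QUANT_SIZES declares a block size of 1 element and a type size of 1 byte, i.e. FP8_E4M3 stores exactly one byte per weight with no per-block scales or zero points. A quick size check, assuming the patched gguf-py constants above are importable:

from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.FP8_E4M3]  # (1, 1)
n_elements = 4096 * 4096
n_bytes = n_elements // block_size * type_size
print(n_bytes)  # 16777216 bytes: 16 MiB for a 4096x4096 tensor, half of F16's 32 MiB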


@@ -61,6 +61,7 @@ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
     elif (q := _type_traits.get(qtype)) is not None:
         return q.quantize(data)
     else:
+        print(_type_traits)
         raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
@@ -217,6 +218,110 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 
 
+class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
+    FP8_EXP_BIAS = 7
+    FP8_MAX_EXP = 14
+    FP8_MANT_BITS = 3
+    FP32_EXP_BIAS = 127
+
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        f32 = blocks.view(np.float32)
+        u32 = f32.view(np.uint32)
+        sign = (u32 >> 31).astype(np.uint32)
+        exp = (u32 >> 23) & 0xFF
+        mant = u32 & 0x7FFFFF
+
+        # special cases
+        is_nan = (exp == 0xFF) & (mant != 0)
+        is_inf = (exp == 0xFF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+
+        # normalize FP32 subnormals
+        is_subnormal_fp32 = (exp == 0) & (mant != 0)
+        leading_zeros = 22 - np.log2(np.maximum(mant, 1)).astype(int)
+        mant = np.where(is_subnormal_fp32, mant << leading_zeros, mant)
+        exp = np.where(is_subnormal_fp32, 1 - leading_zeros, exp)
+
+        # calculate unclipped exponent
+        fp8_exp_raw = exp.astype(np.int32) - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
+        underflow = fp8_exp_raw < 0
+        fp8_exp = np.clip(fp8_exp_raw, 0, cls.FP8_MAX_EXP)
+
+        # calculate subnormal shift
+        shift = np.where(underflow, 1 - fp8_exp_raw, 0)
+
+        # align and round mantissa (RNE)
+        mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
+        total_shift = 20 + shift
+        mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
+        round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
+        sticky_mask = (1 << (total_shift - 1)) - 1
+        sticky = (mant_plus_implicit & sticky_mask) != 0
+        rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)
+
+        # handle mantissa overflow
+        mant_overflow = rounded >= 16  # 1 << (3+1)
+        fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
+        rounded = np.where(mant_overflow, 8, rounded)  # Reset to 1.000
+
+        # handle exponent overflow
+        overflow = fp8_exp > cls.FP8_MAX_EXP
+        fp8_exp = np.where(overflow, 0xF, fp8_exp)
+        rounded = np.where(overflow, 0, rounded)
+
+        # make the FP8
+        fp8 = (
+            (sign << 7) |
+            ((fp8_exp << 3) & 0x78) |
+            (rounded & 0x7)
+        )
+        fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8)  # NaN
+        fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8)  # Inf
+        fp8 = np.where(is_zero, sign << 7, fp8)          # Zero
+        return fp8.astype(np.uint8)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        fp8 = blocks.astype(np.uint32)
+        sign = (fp8 >> 7) & 1
+        exp = (fp8 >> 3) & 0xF
+        mant = fp8 & 0x7
+
+        # special cases
+        is_nan = (exp == 0xF) & (mant != 0)
+        is_inf = (exp == 0xF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+        is_subnormal = (exp == 0) & (mant != 0)
+
+        fp32_exp = np.where(
+            exp > 0,
+            exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
+            (1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS  # -6 + 127 = 121
+        )
+        mant_scale = np.where(
+            is_subnormal,
+            mant.astype(np.float32) * 0.125,  # 1/8
+            1.0 + mant.astype(np.float32) * 0.125
+        )
+        result = np.where(
+            is_nan,
+            np.nan,
+            np.where(
+                is_inf,
+                np.copysign(np.inf, (-1.0)**sign),
+                np.where(
+                    is_zero,
+                    np.copysign(0.0, (-1.0)**sign),
+                    np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
+                )
+            )
+        )
+        return result.astype(np.float32)
+
+
 class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
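
For reference, a round-trip sketch through the new codec, assuming the patched gguf-py package is importable and using the module-level quantize/dequantize dispatchers from gguf.quants. E4M3 keeps 1 sign bit, 4 exponent bits (bias 7) and 3 mantissa bits, so for example 0.3 = 1.2 * 2^-2 rounds to nearest-even as (1 + 2/8) * 2^-2 = 0.3125 (byte 0x2A), and 3.0 encodes exactly as 0x44 (exponent field 1 + 7 = 8, mantissa 0b100):

import numpy as np
from gguf import quants
from gguf.constants import GGMLQuantizationType

x = np.array([0.3, -1.0, 3.0, 1e-5], dtype=np.float32)
q = quants.quantize(x, GGMLQuantizationType.FP8_E4M3)    # one uint8 per value
y = quants.dequantize(q, GGMLQuantizationType.FP8_E4M3)

print(q)  # expected roughly [ 42 184  68   0], i.e. 0x2A, 0xB8, 0x44, 0x00
print(y)  # expected roughly [ 0.3125 -1.  3.  0. ]; 1e-5 is below the e4m3 subnormal range and flushes to zero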