Add fp8 GGUF creation

Saood Karim
2025-05-24 03:48:06 -05:00
parent 9fb82af3a8
commit 16597a3ee2
3 changed files with 114 additions and 3 deletions


@@ -195,7 +195,7 @@ class Model:
                 return False
         return name == (key_name + suffix)
 
-    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias", ".weight_scale_inv")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
             raise ValueError(f"Can not map tensor {name!r}")
@@ -316,6 +316,8 @@ class Model:
                     data_qtype = gguf.GGMLQuantizationType.BF16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
+                    data_qtype = gguf.GGMLQuantizationType.FP8_E4M3
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")
@@ -4076,8 +4078,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "fp8", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, fp8 for float8_e4m3fn, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4164,6 +4166,7 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "fp8": gguf.LlamaFileType.MOSTLY_FP8_E4M3,
         "auto": gguf.LlamaFileType.GUESSED,
     }
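
Taken together, the converter changes above thread the new "fp8" output type from the command line down to the per-tensor writer: --outtype fp8 selects LlamaFileType.MOSTLY_FP8_E4M3, and that file type in turn makes each eligible tensor be quantized as GGMLQuantizationType.FP8_E4M3. A minimal sketch of that mapping, assuming the gguf-py package from this tree (with this patch applied) is importable; the converter script's own file name is not shown in this diff:

import gguf

# Mirrors the ftype_map entry and the data_qtype branch added above.
outtype = "fp8"  # hypothetical stand-in for the parsed --outtype value
ftype = {"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "fp8":  gguf.LlamaFileType.MOSTLY_FP8_E4M3}[outtype]

if ftype == gguf.LlamaFileType.MOSTLY_FP8_E4M3:
    data_qtype = gguf.GGMLQuantizationType.FP8_E4M3  # one e4m3 byte per element

print(data_qtype.name)  # "FP8_E4M3"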


@@ -1317,6 +1317,7 @@ class GGMLQuantizationType(IntEnum):
     IQ4_KS_R4  = 344
     Q8_KV_R8   = 398
     Q8_K_R8    = 399
+    FP8_E4M3   = 999
 
 
 class ExpertGatingFuncType(IntEnum):
@@ -1395,6 +1396,7 @@ class LlamaFileType(IntEnum):
MOSTLY_IQ4_KS_R4 = 337 #except 1d tensors
MOSTLY_Q8_KV_R8 = 398 #except 1d tensors
MOSTLY_Q8_K_R8 = 399 #except 1d tensors
MOSTLY_FP8_E4M3 = 999 #except 1d tensors
GUESSED = 1024 # not specified in the model file
@@ -1522,6 +1524,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.IQ4_KS_R4 : ( 256, 136),
     GGMLQuantizationType.Q8_KV_R8  : (  32,  32),
     GGMLQuantizationType.Q8_K_R8   : ( 256, 258),
+    GGMLQuantizationType.FP8_E4M3  : (   1,   1),
 }
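
The new (1, 1) entry in GGML_QUANT_SIZES declares a block size of 1 element and a type size of 1 byte, i.e. FP8_E4M3 stores exactly one byte per weight with no per-block scales or zero points. A quick size check, assuming the patched gguf-py constants above are importable:

from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.FP8_E4M3]  # (1, 1)
n_elements = 4096 * 4096
n_bytes = n_elements // block_size * type_size
print(n_bytes)  # 16777216 bytes: 16 MiB for a 4096x4096 tensor, half of F16's 32 MiB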


@@ -61,6 +61,7 @@ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
     elif (q := _type_traits.get(qtype)) is not None:
         return q.quantize(data)
     else:
+        print(_type_traits)
         raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
@@ -217,6 +218,110 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 
 
+class FP8_E4M3(__Quant, qtype=GGMLQuantizationType.FP8_E4M3):
+    FP8_EXP_BIAS = 7
+    FP8_MAX_EXP = 14
+    FP8_MANT_BITS = 3
+    FP32_EXP_BIAS = 127
+
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        f32 = blocks.view(np.float32)
+        u32 = f32.view(np.uint32)
+        sign = (u32 >> 31).astype(np.uint32)
+        exp = (u32 >> 23) & 0xFF
+        mant = u32 & 0x7FFFFF
+
+        # special cases
+        is_nan = (exp == 0xFF) & (mant != 0)
+        is_inf = (exp == 0xFF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+
+        # normalize FP32 subnormals
+        is_subnormal_fp32 = (exp == 0) & (mant != 0)
+        leading_zeros = 22 - np.log2(np.maximum(mant, 1)).astype(int)
+        mant = np.where(is_subnormal_fp32, mant << leading_zeros, mant)
+        exp = np.where(is_subnormal_fp32, 1 - leading_zeros, exp)
+
+        # calculate unclipped exponent
+        fp8_exp_raw = exp.astype(np.int32) - (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS)
+        underflow = fp8_exp_raw < 0
+        fp8_exp = np.clip(fp8_exp_raw, 0, cls.FP8_MAX_EXP)
+
+        # calculate subnormal shift
+        shift = np.where(underflow, 1 - fp8_exp_raw, 0)
+
+        # align and round mantissa (RNE)
+        mant_plus_implicit = np.where(exp > 0, mant | 0x800000, mant)
+        total_shift = 20 + shift
+        mant_shifted = np.right_shift(mant_plus_implicit, total_shift)
+        round_bit = np.right_shift(mant_plus_implicit, total_shift - 1) & 1
+        sticky_mask = (1 << (total_shift - 1)) - 1
+        sticky = (mant_plus_implicit & sticky_mask) != 0
+        rounded = mant_shifted + ((round_bit & (sticky | (mant_shifted & 1))) != 0)
+
+        # handle mantissa overflow
+        mant_overflow = rounded >= 16  # 1 << (3+1)
+        fp8_exp = np.where(mant_overflow, fp8_exp + 1, fp8_exp)
+        rounded = np.where(mant_overflow, 8, rounded)  # Reset to 1.000
+
+        # handle exponent overflow
+        overflow = fp8_exp > cls.FP8_MAX_EXP
+        fp8_exp = np.where(overflow, 0xF, fp8_exp)
+        rounded = np.where(overflow, 0, rounded)
+
+        # make the FP8
+        fp8 = (
+            (sign << 7) |
+            ((fp8_exp << 3) & 0x78) |
+            (rounded & 0x7)
+        )
+        fp8 = np.where(is_nan, (sign << 7) | 0x7D, fp8)  # NaN
+        fp8 = np.where(is_inf, (sign << 7) | 0x78, fp8)  # Inf
+        fp8 = np.where(is_zero, sign << 7, fp8)          # Zero
+        return fp8.astype(np.uint8)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        fp8 = blocks.astype(np.uint32)
+        sign = (fp8 >> 7) & 1
+        exp = (fp8 >> 3) & 0xF
+        mant = fp8 & 0x7
+
+        # special cases
+        is_nan = (exp == 0xF) & (mant != 0)
+        is_inf = (exp == 0xF) & (mant == 0)
+        is_zero = (exp == 0) & (mant == 0)
+        is_subnormal = (exp == 0) & (mant != 0)
+
+        fp32_exp = np.where(
+            exp > 0,
+            exp + (cls.FP32_EXP_BIAS - cls.FP8_EXP_BIAS),
+            (1 - cls.FP8_EXP_BIAS) + cls.FP32_EXP_BIAS  # -6 + 127 = 121
+        )
+        mant_scale = np.where(
+            is_subnormal,
+            mant.astype(np.float32) * 0.125,  # 1/8
+            1.0 + mant.astype(np.float32) * 0.125
+        )
+        result = np.where(
+            is_nan,
+            np.nan,
+            np.where(
+                is_inf,
+                np.copysign(np.inf, (-1.0)**sign),
+                np.where(
+                    is_zero,
+                    np.copysign(0.0, (-1.0)**sign),
+                    np.ldexp(mant_scale * (-1.0)**sign, fp32_exp - cls.FP32_EXP_BIAS)
+                )
+            )
+        )
+        return result.astype(np.float32)
+
+
 class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
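
For reference, a round-trip sketch through the new codec, assuming the patched gguf-py package is importable and using the module-level quantize/dequantize dispatchers from gguf.quants. E4M3 keeps 1 sign bit, 4 exponent bits (bias 7) and 3 mantissa bits, so for example 0.3 = 1.2 * 2^-2 rounds to nearest-even as (1 + 2/8) * 2^-2 = 0.3125 (byte 0x2A), and 3.0 encodes exactly as 0x44 (exponent field 1 + 7 = 8, mantissa 0b100):

import numpy as np
from gguf import quants
from gguf.constants import GGMLQuantizationType

x = np.array([0.3, -1.0, 3.0, 1e-5], dtype=np.float32)
q = quants.quantize(x, GGMLQuantizationType.FP8_E4M3)    # one uint8 per value
y = quants.dequantize(q, GGMLQuantizationType.FP8_E4M3)

print(q)  # expected roughly [ 42 184  68   0], i.e. 0x2A, 0xB8, 0x44, 0x00
print(y)  # expected roughly [ 0.3125 -1.  3.  0. ]; 1e-5 is below the e4m3 subnormal range and flushes to zero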