From 042ea88616364b4d56e2c237e133968247c1c4a3 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Sat, 14 Dec 2024 19:06:19 +0200
Subject: [PATCH] Rename bf16_r4 to bf16_r16

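We are interleaving 16 rows now, so the _r4 suffix no longer matches
what the type does. Rename the ggml type, the ggml/llama ftypes, the
type name string, and the repack/set_mul_mat helpers from bf16_r4 to
bf16_r16. The matrix multiplication kernel is renamed from
mul_mat_bf16_r8_bf16 to mul_mat_bf16_r16_bf16.

Changes that are not pure renames:
- MulMat::prepare now requires ne00 to be a multiple of 16 instead
  of 32 for this type.
- Commented-out code in set_mul_mat_bf16_r4 and MulMat::prepare is
  removed.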
---
 examples/quantize/quantize.cpp |  2 +-
 ggml/include/ggml.h            |  4 ++--
 ggml/src/ggml-quants.c         |  2 +-
 ggml/src/ggml.c                | 12 +++++------
 ggml/src/iqk/iqk_mul_mat.cpp   | 37 +++++++++++++---------------------
 ggml/src/iqk/iqk_quantize.cpp  |  4 ++--
 ggml/src/iqk/iqk_quantize.h    |  4 ++--
 include/llama.h                |  2 +-
 src/llama.cpp                  | 12 +++++------
 9 files changed, 35 insertions(+), 44 deletions(-)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 4401eb1d..5e5dd7c0 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -77,7 +77,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, -0.0020 ppl @ Mistral-7B", },
     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "BF16_R4",  LLAMA_FTYPE_MOSTLY_BF16_R4,  "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "BF16_R16", LLAMA_FTYPE_MOSTLY_BF16_R16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index febb8960..7b7fde0d 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -420,7 +420,7 @@ extern "C" {
         GGML_TYPE_Q6_K_R4   = 214,
         GGML_TYPE_IQ4_NL_R4 = 220,
         GGML_TYPE_IQ4_XS_R4 = 223,
-        GGML_TYPE_BF16_R4   = 230,
+        GGML_TYPE_BF16_R16  = 230,
         GGML_TYPE_Q6_0_R4   = 233,
         GGML_TYPE_IQ2_BN_R4 = 335,
         GGML_TYPE_IQ4_K_R4  = 339,
@@ -494,7 +494,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q6_K_R4   = 214, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
-        GGML_FTYPE_MOSTLY_BF16_R4   = 224, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16_R16  = 224, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_0_R4   = 227, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_K_R4  = 332, // except 1d tensors
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index d76d41d9..0b157295 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15209,7 +15209,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_Q6_K_R4: break;
         case GGML_TYPE_IQ4_K_R4: break;
         case GGML_TYPE_Q8_K_R8: break;
-        case GGML_TYPE_BF16_R4: break;
+        case GGML_TYPE_BF16_R16: break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
             {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1bdc7e92..51ef6eb2 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1231,8 +1231,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
         .row_meta_size            = 0,
     },
-    [GGML_TYPE_BF16_R4] = {
-        .type_name                = "bf16_r4",
+    [GGML_TYPE_BF16_R16] = {
+        .type_name                = "bf16_r16",
         .blck_size                = 1,
         .type_size                = sizeof(ggml_bf16_t),
         .is_quantized             = false,
@@ -4123,7 +4123,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
         case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
         case GGML_FTYPE_MOSTLY_BF16:          wtype = GGML_TYPE_BF16;  break;
-        case GGML_FTYPE_MOSTLY_BF16_R4:       wtype = GGML_TYPE_BF16_R4;break;
+        case GGML_FTYPE_MOSTLY_BF16_R16:      wtype = GGML_TYPE_BF16_R16;break;
         case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
         case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
         case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
@@ -15762,7 +15762,7 @@ static void ggml_compute_forward_clamp(
             } break;
         case GGML_TYPE_F16:
         case GGML_TYPE_BF16:
-        case GGML_TYPE_BF16_R4:
+        case GGML_TYPE_BF16_R16:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
@@ -22666,9 +22666,9 @@ size_t ggml_quantize_chunk(
                 ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
-        case GGML_TYPE_BF16_R4:
+        case GGML_TYPE_BF16_R16:
             {
-                repack_f32_bf16_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row);
+                repack_f32_bf16_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row);
                 result = nrows * row_size;
             } break;
         case GGML_TYPE_F32:
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 9e954e72..8fdf7163 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -184,7 +184,7 @@ struct MulMat {
             case GGML_TYPE_IQ4_XS_R4:
             case GGML_TYPE_IQ2_BN_R4: return 4;
             case GGML_TYPE_Q8_K_R8: return 8;
-            case GGML_TYPE_BF16_R4: return 16;
+            case GGML_TYPE_BF16_R16: return 16;
             default: return 1;
         }
     }
@@ -3884,7 +3884,7 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
 
 #ifdef __AVX512BF16__
 template <int nrc_y>
-static void mul_mat_bf16_r8_bf16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+static void mul_mat_bf16_r16_bf16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%16 == 0);
     const ggml_bf16_t * y[nrc_y];
     for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const ggml_bf16_t *)info.src1_row(iy);
@@ -5994,24 +5994,16 @@ void set_mul_mat_bf16(MulMat& mm) {
     mm.funcs[3] = mul_mat_fX_fY_T<4>;
     mm.funcs[4] = mul_mat_fX_fY_T<5>;
 }
-void set_mul_mat_bf16_r4(MulMat& mm) {
+void set_mul_mat_bf16_r16(MulMat& mm) {
     for (auto& f : mm.funcs) f = nullptr;
-    mm.funcs[0] = mul_mat_bf16_r8_bf16<1>;
-    mm.funcs[1] = mul_mat_bf16_r8_bf16<2>;
-    mm.funcs[2] = mul_mat_bf16_r8_bf16<3>;
-    mm.funcs[3] = mul_mat_bf16_r8_bf16<4>;
-    mm.funcs[4] = mul_mat_bf16_r8_bf16<5>;
-    mm.funcs[5] = mul_mat_bf16_r8_bf16<6>;
-    mm.funcs[6] = mul_mat_bf16_r8_bf16<7>;
-    mm.funcs[7] = mul_mat_bf16_r8_bf16<8>;
-    //mm.funcs[0] = mul_mat_fX_fY_r4<1>;
-    //mm.funcs[1] = mul_mat_fX_fY_r4<2>;
-    //mm.funcs[2] = mul_mat_fX_fY_r4<3>;
-    //mm.funcs[3] = mul_mat_fX_fY_r4<4>;
-    ////mm.funcs[4] = mul_mat_fX_fY_r4<5>;
-    ////mm.funcs[5] = mul_mat_fX_fY_r4<6>;
-    ////mm.funcs[6] = mul_mat_fX_fY_r4<7>;
-    ////mm.funcs[7] = mul_mat_fX_fY_r4<8>;
+    mm.funcs[0] = mul_mat_bf16_r16_bf16<1>;
+    mm.funcs[1] = mul_mat_bf16_r16_bf16<2>;
+    mm.funcs[2] = mul_mat_bf16_r16_bf16<3>;
+    mm.funcs[3] = mul_mat_bf16_r16_bf16<4>;
+    mm.funcs[4] = mul_mat_bf16_r16_bf16<5>;
+    mm.funcs[5] = mul_mat_bf16_r16_bf16<6>;
+    mm.funcs[6] = mul_mat_bf16_r16_bf16<7>;
+    mm.funcs[7] = mul_mat_bf16_r16_bf16<8>;
 }
 #endif
 
@@ -6030,12 +6022,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
         return true;
     }
 
-    if (typeA == GGML_TYPE_BF16_R4) {
-        //printf("%s: %s\n", __func__, ggml_type_name((ggml_type)typeB));
-        if (ne00 % 32) return false;
+    if (typeA == GGML_TYPE_BF16_R16) {
+        if (ne00 % 16) return false;
         switch (typeB) {
 #ifdef __AVX512BF16__
-            case GGML_TYPE_BF16: set_mul_mat_bf16_r4(mm); break;
+            case GGML_TYPE_BF16: set_mul_mat_bf16_r16(mm); break;
 #endif
             default: return false;
         }
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 3a0a2565..abe81858 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -4787,11 +4787,11 @@ void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) {
 }
 }
 
-void repack_f32_bf16_r4(const void * src, void * dst, int64_t nrows, int64_t n_per_row) {
+void repack_f32_bf16_r16(const void * src, void * dst, int64_t nrows, int64_t n_per_row) {
     repack_bf16(nrows, n_per_row, (const float *)src, (ggml_bf16_t *)dst);
 }
 
-void repack_bf16_bf16_r4(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row) {
+void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row) {
     repack_bf16(nrows, n_per_row, (const ggml_bf16_t *)src, (ggml_bf16_t *)dst);
 }
 
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 10754f21..e8721a5e 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -158,8 +158,8 @@ void quantize_row_q8_K16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
 void quantize_row_q8_K32(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
-void repack_f32_bf16_r4 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
-void repack_bf16_bf16_r4(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
+void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
+void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
 
 #ifdef __cplusplus
 }
diff --git a/include/llama.h b/include/llama.h
index 10ed9cc5..988ffec7 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -191,7 +191,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_NL_R4     = 225, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS_R4     = 230, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_0_R4       = 335, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_BF16_R4       = 232, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16_R16      = 232, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_BN_R4     = 337, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_K_R4      = 340, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q8_K_R8       = 399, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index 8de6f5f3..536b2f97 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3828,7 +3828,7 @@ struct llama_model_loader {
                 case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
                 case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
                 case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
-                case GGML_TYPE_BF16_R4: ftype = LLAMA_FTYPE_MOSTLY_BF16_R4; break;
+                case GGML_TYPE_BF16_R16:ftype = LLAMA_FTYPE_MOSTLY_BF16_R16;break;
                 case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
                 case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
                 case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
@@ -4541,7 +4541,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:         return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:      return "F16";
         case LLAMA_FTYPE_MOSTLY_BF16:     return "BF16";
-        case LLAMA_FTYPE_MOSTLY_BF16_R4:  return "BF16_R4";
+        case LLAMA_FTYPE_MOSTLY_BF16_R16: return "BF16_R16";
         case LLAMA_FTYPE_MOSTLY_Q4_0:     return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1:     return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q5_0:     return "Q5_0";
@@ -15835,7 +15835,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q8_0_R4) {
                 new_type = GGML_TYPE_Q8_0;
             }
-            else if (new_type == GGML_TYPE_BF16_R4) {
+            else if (new_type == GGML_TYPE_BF16_R16) {
                 new_type = GGML_TYPE_BF16;
             }
         }
@@ -16233,7 +16233,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
-        case LLAMA_FTYPE_MOSTLY_BF16_R4: default_type = GGML_TYPE_BF16_R4; break;
+        case LLAMA_FTYPE_MOSTLY_BF16_R16: default_type = GGML_TYPE_BF16_R16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants
@@ -16526,7 +16526,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (quantize) {
             new_type = default_type;
-            if (new_type == GGML_TYPE_BF16_R4 && strcmp(tensor->name, "token_embd.weight") == 0) {
+            if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = GGML_TYPE_BF16;
             }
 
@@ -16689,7 +16689,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K;
                 else chunk_size_multiplier = 4;
             }
-            else if (new_type == GGML_TYPE_BF16_R4) {
+            else if (new_type == GGML_TYPE_BF16_R16) {
                 if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
                 else chunk_size_multiplier = 16;
             }