diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index bae071ce..5c311e3b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -28,6 +28,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",            },
     { "IQ1_BN",  LLAMA_FTYPE_MOSTLY_IQ1_BN,  " 1.62 bpw quantization (Bitnet)",   },
     { "IQ2_BN",  LLAMA_FTYPE_MOSTLY_IQ2_BN,  " 2.00 bpw quantization (Bitnet)",   },
+    { "IQ2_TN",  LLAMA_FTYPE_MOSTLY_IQ2_TN,  " 2.06 bpw quantization (TriLM)",    },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",            },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 94ffae7e..144e87f5 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -393,6 +393,7 @@ extern "C" {
         GGML_TYPE_IQ3_K  = 38,
         GGML_TYPE_IQ4_K  = 39,
         GGML_TYPE_IQ5_K  = 40,
+        GGML_TYPE_IQ2_TN = 41,
         GGML_TYPE_COUNT,
     };
@@ -443,6 +444,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_K  = 31, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_K  = 32, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ5_K  = 33, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_TN = 34, // except 1d tensors
     };
 
     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 423797b6..5847d903 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -407,7 +407,7 @@ typedef struct {
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
 
 //
-// Bitnet - implemented as 1.75 bpw
+// Bitnet - implemented as 1.625 bpw
 // The block scale is a waste, but it allows us to plug it in without any additional
 // changes to ggml.
 //
@@ -418,13 +418,21 @@ typedef struct {
 } block_iq1_bn;
 static_assert(sizeof(block_iq1_bn) == 13, "wrong iq1_bn block size/padding");
 //
-// Bitnet - implemented as 2.25 bpw
+// Bitnet - implemented as 2.0 bpw
 //
 #define QK_IQ2BN 64
 typedef struct {
     uint8_t qs[QK_IQ2BN/4];
 } block_iq2_bn;
 static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
+//
+// TriLM - implemented as 2.0625 bpw
+//
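+// The bpw figure follows from the layout below: with QK_K = 256, a block is
+// one fp16 scale plus QK_K/4 = 64 bytes of 2-bit ternary digits, i.e.
+// (2 + 64) bytes * 8 bits / 256 weights = 2.0625 bpw.
+//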
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK_K/4];
+} block_iq2_tn;
+static_assert(sizeof(block_iq2_tn) == sizeof(ggml_half) + QK_K/4, "wrong iq2_tn block size/padding");
 
 // Used by IQ1_M quants
 typedef union {
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 415249fb..9b3fddbc 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -14996,6 +14996,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
         case GGML_TYPE_IQ5_K: break;
+        case GGML_TYPE_IQ2_TN: break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
             {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4ce9948d..5c817030 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -882,6 +882,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K64,
         .nrows                    = 1,
     },
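+    // IQ2_TN pairs with the generic Q8_K activation quantization for matrix
+    // multiplications (one fp32 scale, QK_K int8 values and 16-element group
+    // sums per block), so unlike the Bitnet types above it does not need the
+    // dedicated Q8_K64 type.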
+    [GGML_TYPE_IQ2_TN] = {
+        .type_name                = "iq2_tn",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_tn),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_tn,
+        .from_float               = quantize_row_iq2_tn,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_tn_ref,
+        .vec_dot                  = vec_dot_iq2_tn_q8_k,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .type_name                = "iq4_nl",
         .blck_size                = QK4_NL,
@@ -3375,6 +3387,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ1_M:   wtype = GGML_TYPE_IQ1_M;   break;
         case GGML_FTYPE_MOSTLY_IQ1_BN:  wtype = GGML_TYPE_IQ1_BN;  break;
         case GGML_FTYPE_MOSTLY_IQ2_BN:  wtype = GGML_TYPE_IQ2_BN;  break;
+        case GGML_FTYPE_MOSTLY_IQ2_TN:  wtype = GGML_TYPE_IQ2_TN;  break;
         case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
         case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ2_K:   wtype = GGML_TYPE_IQ2_K;   break;
@@ -9628,6 +9641,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10012,6 +10026,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10146,6 +10161,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -13069,6 +13085,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -13263,6 +13280,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -13531,6 +13549,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14126,6 +14145,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -20865,6 +20885,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_BN:  result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_BN:  result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_TN:  result = quantize_iq2_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K:   result = quantize_iq2_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index c840fabf..1cba1532 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -1514,3 +1514,110 @@ size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_pe
     }
     return nrows * nblock * sizeof(block_iq5_k);
 }
+
+//
+// ========================== IQ2_TN
+//
+
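+// Quantization scheme: each QK_K-sized block stores d = max|x| and maps every
+// value to a ternary digit q in {0, 1, 2} by comparing against +/- d/2 (the
+// lambda below); four digits are packed per byte, and dequantization computes
+// d*q - d, so the effective weights are {-d, 0, +d}.
+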
1 : 2; + }; + + for (int ibl = 0; ibl < nb; ++ibl) { + auto xb = x + QK_K*ibl; + float max = xb[0]; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(xb[j]); + max = std::max(ax, max); + } + y[ibl].d = GGML_FP32_TO_FP16(max); + auto qs = y[ibl].qs; + for (int l = 0; l < QK_K/128; ++l) { + for (int j = 0; j < 32; ++j) { + qs[j] = quantize(max, xb[j]) | (quantize(max, xb[j+32]) << 2) | (quantize(max, xb[j+64]) << 4) | (quantize(max, xb[j+96]) << 6); + } + xb += 128; + qs += 32; + } + } +} + +void quantize_row_iq2_tn(const float * x, void * y, int64_t k) { + quantize_row_iq2_tn_ref(x, (block_iq2_tn *)y, k); +} + +size_t quantize_iq2_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * /*imatrix*/) { + auto row_size = ggml_row_size(GGML_TYPE_IQ2_TN, n_per_row); + char * qrow = (char *)dst; + for (int row = 0; row < nrows; ++row) { + quantize_row_iq2_tn_ref(src, (block_iq2_tn *)qrow, n_per_row); + qrow += row_size; + src += n_per_row; + } + return row_size*nrows; +} + +void dequantize_row_iq2_tn(const block_iq2_tn * x, float * y, int64_t k) { + GGML_ASSERT(k%QK_K == 0); + int nb = k/QK_K; + for (int ibl = 0; ibl < nb; ++ibl) { + float d = GGML_FP16_TO_FP32(x[ibl].d); + auto qs = x[ibl].qs; + for (int l = 0; l < QK_K/128; ++l) { + for (int j = 0; j < 32; ++j) { + y[j+ 0] = d*((qs[j] >> 0) & 3) - d; + y[j+32] = d*((qs[j] >> 2) & 3) - d; + y[j+64] = d*((qs[j] >> 4) & 3) - d; + y[j+96] = d*((qs[j] >> 6) & 3) - d; + } + y += 128; + qs += 32; + } + } +} + +void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + GGML_UNUSED(nrc); + GGML_UNUSED(bx); + GGML_UNUSED(by); + GGML_UNUSED(bs); + + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_TN, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } + + const int nb = n / QK_K; + + const block_iq2_tn * x = (const block_iq2_tn *)vx; + const block_q8_K * y = (const block_q8_K *)vy; + + float sumf = 0; + + for (int i = 0; i < nb; i++) { + float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + auto qs = x[i].qs; + auto q8 = y[i].qs; + int sumi1 = 0, sumi2 = 0, sumi3 = 0,sumi4 = 0; + for (int j = 0; j < QK_K/16; ++j) sumi1 -= y[i].bsums[j]; + for (int l = 0; l < QK_K/128; ++l) { + for (int j = 0; j < 32; ++j) { + sumi1 += q8[j+ 0] * (qs[j] & 0x03); + sumi2 += q8[j+32] * (qs[j] & 0x0c); + sumi3 += q8[j+64] * (qs[j] & 0x30); + sumi4 += q8[j+96] * (qs[j] & 0xc0); + } + q8 += 128; + qs += 32; + } + sumf += d * (sumi1 + 0.25f*sumi2 + 0.0625f*sumi3 + 0.015625f*sumi4); + } + *s = sumf; +} + diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 0295eb99..80a9012b 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -37,6 +37,12 @@ size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, void dequantize_row_iq5_k(const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq5_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq2_tn_ref(const float * GGML_RESTRICT x, block_iq2_tn * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq2_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq2_tn(const block_iq2_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, 
+void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_TN, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+
+    const int nb = n / QK_K;
+
+    const block_iq2_tn * x = (const block_iq2_tn *)vx;
+    const block_q8_K   * y = (const block_q8_K   *)vy;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; i++) {
+        float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        auto qs = x[i].qs;
+        auto q8 = y[i].qs;
+        int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi1 -= y[i].bsums[j];
+        for (int l = 0; l < QK_K/128; ++l) {
+            for (int j = 0; j < 32; ++j) {
+                sumi1 += q8[j+ 0] * (qs[j] & 0x03);
+                sumi2 += q8[j+32] * (qs[j] & 0x0c);
+                sumi3 += q8[j+64] * (qs[j] & 0x30);
+                sumi4 += q8[j+96] * (qs[j] & 0xc0);
+            }
+            q8 += 128;
+            qs += 32;
+        }
+        sumf += d * (sumi1 + 0.25f*sumi2 + 0.0625f*sumi3 + 0.015625f*sumi4);
+    }
+    *s = sumf;
+}
+
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 0295eb99..80a9012b 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -37,6 +37,12 @@ size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
 void   dequantize_row_iq5_k(const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq5_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void   quantize_row_iq2_tn_ref(const float * GGML_RESTRICT x, block_iq2_tn * GGML_RESTRICT y, int64_t k);
+void   quantize_row_iq2_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq2_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void   dequantize_row_iq2_tn(const block_iq2_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void   vec_dot_iq2_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/llama.h b/include/llama.h
index 15ff915b..a5a2deb1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_K  = 39, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_K  = 40, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ5_K  = 41, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_TN = 42, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index e530f528..7a28314e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3759,6 +3759,7 @@ struct llama_model_loader {
         case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
         case GGML_TYPE_IQ1_BN:  ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN;  break;
         case GGML_TYPE_IQ2_BN:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN;  break;
+        case GGML_TYPE_IQ2_TN:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_TN;  break;
         case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
         case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
         case GGML_TYPE_IQ2_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_K;   break;
@@ -4471,6 +4472,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
         case LLAMA_FTYPE_MOSTLY_IQ1_BN:   return "IQ1_BN - 1.625 bpw Bitnet";
         case LLAMA_FTYPE_MOSTLY_IQ2_BN:   return "IQ2_BN - 2.00 bpw Bitnet";
+        case LLAMA_FTYPE_MOSTLY_IQ2_TN:   return "IQ2_TN - 2.06 bpw TriLM";
 
         default: return "unknown, may not work";
     }
@@ -15437,6 +15439,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN) {
         new_type = GGML_TYPE_IQ4_NL;
     }
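+    // Mirrors the Bitnet handling above: tensors routed through this branch
+    // fall back to a regular higher-precision type for an IQ2_TN model
+    // (Q4_K here, versus IQ4_NL for IQ1_BN/IQ2_BN).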
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_TN) {
+        new_type = GGML_TYPE_Q4_K;
+    }
     else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
              new_type == GGML_TYPE_Q4_0_8_8) {
         new_type = GGML_TYPE_Q4_0;
@@ -15640,7 +15645,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
              new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
              new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
              new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
-             new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K) {
+             new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_IQ2_TN) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -15665,6 +15670,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_IQ2_K:
@@ -15773,6 +15779,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_M:  default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_TN: default_type = GGML_TYPE_IQ2_TN; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_K:  default_type = GGML_TYPE_IQ2_K;  break;