Add low-precision POWER10 gemm kernels (#467)

Details: - This commit adds a new BLIS sandbox that (1) provides implementations based on low-precision gemm kernels, and (2) extends the BLIS typed API for those new implementations. Currently, these new kernels can only be used for the POWER10 microarchitecture; however, they may provide a template for developing similar kernels for other microarchitectures (even those beyond POWER), as changes would likely be limited to select places in the microkernel and possibly the packing routines. The new low-precision operations that are now supported include: shgemm, sbgemm, i16gemm, i8gemm, i4gemm. For more information, refer to the POWER10.md document that is included in 'sandbox/power10'.
2026-06-06 04:34:02 +00:00 · 2021-03-05 13:53:43 -06:00
parent b8dcc5bc75
commit 670bc7b60f
24 changed files with 3363 additions and 371 deletions
--- a/config/power10/bli_cntx_init_power10.c
+++ b/config/power10/bli_cntx_init_power10.c
@@ -122,9 +122,9 @@ void bli_cntx_init_power10( cntx_t* cntx )
 	//                                           s      d      c      z
 	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     8,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   576,   576,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1408,  1408,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8192,  8184,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   832,   320,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1026,   960,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,    -1,    -1 );


 	// Update the context with the current architecture's register and cache
--- a/config/power10/make_defs.mk
+++ b/config/power10/make_defs.mk
@@ -66,8 +66,12 @@ CKOPTFLAGS     := $(COPTFLAGS) -O3
 ifeq ($(CC_VENDOR),gcc)
 CKVECFLAGS     := -mcpu=power10 -mtune=power10
 else
+ifeq ($(CC_VENDOR),clang)
+CKVECFLAGS     := -mcpu=power10 -mtune=power10
+else
 $(info $(CC_VENDOR)) 
-$(error gcc is required for this configuration.)
+$(error gcc, clang is required for this configuration.)
+endif
 endif

 # Flags specific to reference kernels.
@@ -77,4 +81,3 @@ CRVECFLAGS     := $(CKVECFLAGS)
 # Store all of the variables here to new variables containing the
 # configuration name.
 $(eval $(call store-make-defs,$(THIS_CONFIG)))
-
--- a/frame/3/bli_l3_ukr_prot.h
+++ b/frame/3/bli_l3_ukr_prot.h
@@ -36,16 +36,18 @@
 // Define template prototypes for level-3 micro-kernels.
 //

-#define GEMM_UKR_PROT( ctype, ch, opname ) \
+#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)
+
+#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
 \
 void PASTEMAC(ch,opname) \
     ( \
       dim_t               k, \
-       ctype*     restrict alpha, \
-       ctype*     restrict a, \
-       ctype*     restrict b, \
-       ctype*     restrict beta, \
-       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
+       ctype_out* restrict alpha, \
+       ctype_in*  restrict a, \
+       ctype_in*  restrict b, \
+       ctype_out* restrict beta, \
+       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );
--- a/kernels/power10/3/bli_dgemm_power10_mma.c
+++ b/kernels/power10/3/bli_dgemm_power10_mma.c
@@ -0,0 +1,192 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+#include "vector_int_macros.h"
+
+#define D_ASSEMBLE_VEC_PAIR \
+        __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
+        __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); 
+/* Add the eight outer products of the current k step into acc0..acc7. */
+#define D_ACCUMULATE \
+        __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
+        __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
+        __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
+        __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
+        __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
+        __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
+        __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
+        __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); 
+/* Step A0 and B0 past the 8 doubles (ca[0..3], rb[0..3]) used per k step. */
+#define D_INCREMENT \
+        A0+=8; \
+        B0+=8;
+/* One k step. LOAD_VECTORS is defined elsewhere; presumably re-points ca/rb. */
+#define D_AB_PRODUCT \
+        LOAD_VECTORS \
+        D_ASSEMBLE_VEC_PAIR \
+        D_INCREMENT \
+        D_ACCUMULATE 
+
+
+void bli_dgemm_power10_mma_8x8
+    (
+        dim_t               k0,
+        double*    restrict alpha,
+        double*    restrict a,
+        double*    restrict b,
+        double*    restrict beta,
+        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // 8x8 double-precision microkernel using POWER10 MMA outer products.
+    // Local integer copies are typecast in case dim_t and inc_t are a
+    // different size than is expected by load instructions. One k step is
+    // peeled ahead of the loops (hence k0-1 below), so k0 must be >= 1.
+    uint64_t k_iter = (k0-1) / 4;
+    uint64_t k_left = (k0-1) % 4;
+
+    uint64_t rs_c   = rs_c0;
+    // (cs_c0, data and cntx are not referenced directly in this body.)
+    double* restrict A0 = a;
+    double* restrict B0 = b;
+    double* restrict C0 = c;
+
+    double alpha_ = *alpha,
+           beta_ = *beta;
+    // Scratch presumably used by the SAVE_ACC* macros (vector_int_macros.h).
+    dv4sf_t result[4];
+    dv4sf_t *rowC;
+
+    /* 8 accumulator registers that will be used to store the result.
+
+       Each accumulator register is mapped to 4 vector registers.
+       Illustration:
+
+            acc0 = [  vs0
+                      vs1
+                      vs2
+                      vs3  ]
+
+        These registers are used to store the result of an outer product
+        instruction (general outer product instruction syntax: xv???ger??). */
+    __vector_quad acc0, acc1, acc2, acc3, 
+                  acc4, acc5, acc6, acc7;
+
+    /* 2 vector pairs are necessary for a double precision outer product 
+       instruction. */
+    __vector_pair colA_1, 
+                  colA_2;
+
+    /* Prefetch C so that it stays in cache */
+    PREFETCH1 (C0, 0);
+    PREFETCH1 (C0 + rs_c, 0);
+    PREFETCH1 (C0 + rs_c + rs_c, 0);
+    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
+    PREFETCH1 (C0, 128);
+    PREFETCH1 (C0 + rs_c, 128);
+    PREFETCH1 (C0 + rs_c + rs_c, 128);
+    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);
+
+    /* Load elements into vector registers */
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0; 
+
+    /* Each accumulator represents a matrix of size 
+       4 x ( 16 / (datatype size in bytes) )  (vector register size = 16B)
+
+       Thus in the case of double, the accumulate registers represent a 4x2 
+       matrix. However, a vector register can hold at most 2 doubles. Thus, if
+       we performed an outer product using 2 vector register, we can only get a 
+       2x2 matrix. Therefore, we must create a vector register pair in order
+       to get the desired 4x2 matrix.
+    
+    */
+    D_ASSEMBLE_VEC_PAIR
+
+    /* Peeled first k step: the non-accumulating form overwrites acc0..acc7 */
+    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
+    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
+    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
+    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
+    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
+    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
+    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
+    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);
+
+    /* Move A and B pointers */
+    D_INCREMENT
+
+    // k loop (unrolled by 4)
+    for (int k = 0; k<k_iter; k++)
+    {
+        D_AB_PRODUCT
+        D_AB_PRODUCT
+        D_AB_PRODUCT
+        D_AB_PRODUCT
+    }
+    
+    // edge loop
+    for (int k = 0; k<k_left; k++)
+    {
+        D_AB_PRODUCT
+    }
+    // Each accumulator is a 4x2 block of doubles, so column offsets step by
+    // 2 (0,2,4,6); acc4..acc7 start 4 rows down. Handle beta cases:
+    if (beta_ != 0.0)
+    {
+        SAVE_ACC(dv4sf_t, &acc0, rs_c, 0       );
+        SAVE_ACC(dv4sf_t, &acc1, rs_c, 2       );
+        SAVE_ACC(dv4sf_t, &acc2, rs_c, 4       );
+        SAVE_ACC(dv4sf_t, &acc3, rs_c, 6       );
+        SAVE_ACC(dv4sf_t, &acc4, rs_c,   4*rs_c);
+        SAVE_ACC(dv4sf_t, &acc5, rs_c, 2+4*rs_c);
+        SAVE_ACC(dv4sf_t, &acc6, rs_c, 4+4*rs_c);
+        SAVE_ACC(dv4sf_t, &acc7, rs_c, 6+4*rs_c);
+    }
+    else
+    {
+        SAVE_ACC_bz(dv4sf_t, &acc0, rs_c, 0       );
+        SAVE_ACC_bz(dv4sf_t, &acc1, rs_c, 2       );
+        SAVE_ACC_bz(dv4sf_t, &acc2, rs_c, 4       );
+        SAVE_ACC_bz(dv4sf_t, &acc3, rs_c, 6       );
+        SAVE_ACC_bz(dv4sf_t, &acc4, rs_c,   4*rs_c);
+        SAVE_ACC_bz(dv4sf_t, &acc5, rs_c, 2+4*rs_c);
+        SAVE_ACC_bz(dv4sf_t, &acc6, rs_c, 4+4*rs_c);
+        SAVE_ACC_bz(dv4sf_t, &acc7, rs_c, 6+4*rs_c);
+    }
+
+}
--- a/kernels/power10/3/bli_gemm_power10_mma.c
+++ b/kernels/power10/3/bli_gemm_power10_mma.c
@@ -1,359 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-typedef double dv4sf_t __attribute__ ((vector_size (16)));
-typedef unsigned char vec_t __attribute__ ((vector_size (16)));
-
-/*  disassemble the acc accumulator into a result array of vectors
-	store the result accordingly  */
-#define dgemm_SAVE_ACC_(ACC, rs_c, j)                   \
-    __builtin_mma_disassemble_acc (result, ACC);      \
-    rowC = (dv4sf_t *) &C0[j];                        \
-    rowC[0] = alpha_ * result[0] + beta_ * rowC[0];   \
-    rowC = (dv4sf_t *) &C0[rs_c+j];                     \
-    rowC[0] = alpha_ * result[1] + beta_ * rowC[0];   \
-    rowC = (dv4sf_t *) &C0[2*rs_c+j];                   \
-    rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ;  \
-    rowC = (dv4sf_t *) &C0[3*rs_c+j];                   \
-    rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
-
-#define dgemm_SAVE_ACC_bz(ACC, rs_c, j)                 \
-    __builtin_mma_disassemble_acc (result, ACC);      \
-    rowC = (dv4sf_t *) &C0[j];                        \
-    rowC[0] = alpha_ * result[0];                     \
-    rowC = (dv4sf_t *) &C0[rs_c+j];                     \
-    rowC[0] = alpha_ * result[1];                     \
-    rowC = (dv4sf_t *) &C0[2*rs_c+j];                   \
-    rowC[0] = alpha_ * result[2];                     \
-    rowC = (dv4sf_t *) &C0[3*rs_c+j];                   \
-    rowC[0] = alpha_ * result[3];
-
-#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
-
-#define LOAD_VECTORS \
-		ca = (vec_t *) A0; \
-		rb = (vec_t *) B0; 
-
-#define D_ASSEMBLE_VEC_PAIR \
-		__builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
-		__builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); 
-
-#define D_ACCUMULATE \
-		__builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
-		__builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
-		__builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
-		__builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
-		__builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
-		__builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
-		__builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
-		__builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); 
-
-#define D_INCREMENT \
-		A0+=8; \
-		B0+=8;
-
-#define D_AB_PRODUCT \
-		LOAD_VECTORS \
-		D_ASSEMBLE_VEC_PAIR \
-		D_INCREMENT \
-		D_ACCUMULATE 
-
-
-void bli_dgemm_power10_mma_8x8
-	(
-		dim_t               k0,
-		double*    restrict alpha,
-		double*    restrict a,
-		double*    restrict b,
-		double*    restrict beta,
-		double*    restrict c, inc_t rs_c0, inc_t cs_c0,
-		auxinfo_t* restrict data,
-		cntx_t*    restrict cntx
-	)
-{
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	// (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-	uint64_t k_iter = (k0-1) / 4;
-	uint64_t k_left = (k0-1) % 4;
-
-	uint64_t rs_c   = rs_c0;
-
-	double* restrict A0 = a;
-	double* restrict B0 = b;
-	double* restrict C0 = c;
-
-	double alpha_ = *alpha,
-	       beta_ = *beta;
-
-	dv4sf_t result[4];
-  	dv4sf_t *rowC;
-
-	/* 8 accumulator registers that will be used to store the result.
-	   
-	   Each accumulator register is mapped to 4 vector registers.
-	   Illustration:
-					  
-			acc0 = [  vs0
-					  vs1
-			          vs3
-					  vs4  ]
-
-		These registers are used to store the result of an outer product 
-		instruction (general outer product instruction syntax: xv???ger??). */
-	__vector_quad acc0, acc1, acc2, acc3, 
-	              acc4, acc5, acc6, acc7;
-
-	/* 2 vector pairs are necessary for a double precision outer product 
-	   instruction. */
-	__vector_pair colA_1, 
-	              colA_2;
-
-	/* Prefetch C so that it stays in cache */
-	PREFETCH1 (C0, 0);
-	PREFETCH1 (C0 + rs_c, 0);
-	PREFETCH1 (C0 + rs_c + rs_c, 0);
-	PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
-	PREFETCH1 (C0, 128);
-	PREFETCH1 (C0 + rs_c, 128);
-	PREFETCH1 (C0 + rs_c + rs_c, 128);
-	PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);
-
-	/* Load elements into vector registers */
-	vec_t *ca = (vec_t *) A0;
-	vec_t *rb = (vec_t *) B0; 
-
-	/* Each accumulator represents a matrix of size 
-	   4 x ((datatype size in bytes) / 16)  (vector register size = 128b)
-
-	   Thus in the case of double, the accumulate registers represent a 4x2 
-	   matrix. However, a vector register can hold at most 2 doubles. Thus, if
-	   we performed an outer product using 2 vector register, we can only get a 
-	   2x2 matrix. Therefore, we must create a vector register pair in order
-	   to get the desired 4x2 matrix.
-	
-	*/
-	D_ASSEMBLE_VEC_PAIR
-
-	/* Compute accumulate outer products and override accumulators with result */
-	__builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
-	__builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
-	__builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
-	__builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
-	__builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
-	__builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
-	__builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
-	__builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);
-
-	/* Move A and B pointers */
-	D_INCREMENT
-
-	// k loop (unrolled by 4)
-	for (int k = 0; k<k_iter; k++)
-	{
-		D_AB_PRODUCT
-		D_AB_PRODUCT
-		D_AB_PRODUCT
-		D_AB_PRODUCT
-	}
-	
-	// edge loop
-	for (int k = 0; k<k_left; k++)
-	{
-		D_AB_PRODUCT
-	}
-
-	// handle beta cases
-	if (beta_ != 0.0)
-	{
-		dgemm_SAVE_ACC_(&acc0, rs_c, 0       );
-		dgemm_SAVE_ACC_(&acc1, rs_c, 2       );
-		dgemm_SAVE_ACC_(&acc2, rs_c, 4       );
-		dgemm_SAVE_ACC_(&acc3, rs_c, 6       );
-		dgemm_SAVE_ACC_(&acc4, rs_c,   4*rs_c);
-		dgemm_SAVE_ACC_(&acc5, rs_c, 2+4*rs_c);
-		dgemm_SAVE_ACC_(&acc6, rs_c, 4+4*rs_c);
-		dgemm_SAVE_ACC_(&acc7, rs_c, 6+4*rs_c);
-	}
-	else
-	{
-		dgemm_SAVE_ACC_bz(&acc0, rs_c, 0       );
-		dgemm_SAVE_ACC_bz(&acc1, rs_c, 2       );
-		dgemm_SAVE_ACC_bz(&acc2, rs_c, 4       );
-		dgemm_SAVE_ACC_bz(&acc3, rs_c, 6       );
-		dgemm_SAVE_ACC_bz(&acc4, rs_c,   4*rs_c);
-		dgemm_SAVE_ACC_bz(&acc5, rs_c, 2+4*rs_c);
-		dgemm_SAVE_ACC_bz(&acc6, rs_c, 4+4*rs_c);
-		dgemm_SAVE_ACC_bz(&acc7, rs_c, 6+4*rs_c);
-	}
-
-}
-
-
-typedef float fv4sf_t __attribute__ ((vector_size (16)));
-
-#define sgemm_SAVE_ACC_(ACC, rs_c, j)                \
-    __builtin_mma_disassemble_acc (result, ACC);       \
-    rowC = (fv4sf_t *) &C0[j];                        \
-    rowC[0] = alpha_ * result[0] + beta_ * rowC[0];    \
-    rowC = (fv4sf_t *) &C0[rs_c+j];                     \
-    rowC[0] = alpha_ * result[1] + beta_ * rowC[0];    \
-    rowC = (fv4sf_t *) &C0[2*rs_c+j];                   \
-    rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ;   \
-    rowC = (fv4sf_t *) &C0[3*rs_c+j];                   \
-    rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
-
-#define sgemm_SAVE_ACC_bz(ACC, rs_c, j)                     \
-    __builtin_mma_disassemble_acc (result, ACC);     \
-    rowC = (fv4sf_t *) &C0[j];                      \
-    rowC[0] = alpha_ * result[0];                      \
-    rowC = (fv4sf_t *) &C0[rs_c+j];                     \
-    rowC[0] = alpha_ * result[1];                      \
-    rowC = (fv4sf_t *) &C0[2*rs_c+j];                   \
-    rowC[0] = alpha_ * result[2];                      \
-    rowC = (fv4sf_t *) &C0[3*rs_c+j];                   \
-    rowC[0] = alpha_ * result[3];
-
-#define S_ACCUMULATE \
-		__builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
-		__builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
-		__builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
-		__builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
-		__builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
-		__builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
-		__builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
-		__builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]); 
-
-#define S_INCREMENT \
-		A0+=8; \
-		B0+=16;
-
-#define S_AB_PRODUCT \
-		LOAD_VECTORS \
-		S_INCREMENT \
-		S_ACCUMULATE 
-
-void bli_sgemm_power10_mma_8x16
-	(
-		dim_t               k0,
-		float*     restrict alpha,
-		float*     restrict a,
-		float*     restrict b,
-		float*     restrict beta,
-		float*     restrict c, inc_t rs_c0, inc_t cs_c0,
-		auxinfo_t* restrict data,
-		cntx_t*    restrict cntx
-	)
-{
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-	// (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
-	uint64_t k_iter = (k0-1) / 4;
-	uint64_t k_left = (k0-1) % 4;
-	
-	uint64_t rs_c   = rs_c0;
-
-	fv4sf_t result[4];
-  	fv4sf_t *rowC;
-
-	// accumulators that will hold the matrix product
-	__vector_quad acc0, acc1, acc2, acc3, 
-	              acc4, acc5, acc6, acc7;
-
-	float* restrict A0 = a;
-	float* restrict B0 = b;
-	float* restrict C0 = c;
-
-	float alpha_ = *alpha,
-	      beta_  = *beta;
-
-	/* Load elements into vector registers */
-	vec_t *ca = (vec_t *) A0;
-	vec_t *rb = (vec_t *) B0;
-
-	/* Compute accumulate outer products and override accumulators with result */
-	__builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
-	__builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
-	__builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
-	__builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
-	__builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
-	__builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
-	__builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
-	__builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);
-
-	S_INCREMENT
-
-	// k loop (unrolled by 4)
-	for (int k = 0; k<k_iter; k++)
-	{
-		S_AB_PRODUCT
-		S_AB_PRODUCT
-		S_AB_PRODUCT
-		S_AB_PRODUCT
-	}
-	
-	// edge loop
-	for (int k = 0; k<k_left; k++)
-	{
-		S_AB_PRODUCT
-	}
-
-	// handle beta cases
-	if (beta_ != 0.0)
-	{
-		sgemm_SAVE_ACC_(&acc0, rs_c, 0      );
-		sgemm_SAVE_ACC_(&acc1, rs_c, 4      );
-		sgemm_SAVE_ACC_(&acc2, rs_c, 8      );
-		sgemm_SAVE_ACC_(&acc3, rs_c, 12     );
-		sgemm_SAVE_ACC_(&acc4, rs_c,    4*rs_c);
-		sgemm_SAVE_ACC_(&acc5, rs_c,  4+4*rs_c);
-		sgemm_SAVE_ACC_(&acc6, rs_c,  8+4*rs_c);
-		sgemm_SAVE_ACC_(&acc7, rs_c, 12+4*rs_c);
-	}
-	else
-	{
-		sgemm_SAVE_ACC_bz( &acc0, rs_c,  0     );
-		sgemm_SAVE_ACC_bz( &acc1, rs_c,  4     );
-		sgemm_SAVE_ACC_bz( &acc2, rs_c,  8     );
-		sgemm_SAVE_ACC_bz( &acc3, rs_c, 12     );
-		sgemm_SAVE_ACC_bz( &acc4, rs_c,    4*rs_c);
-		sgemm_SAVE_ACC_bz( &acc5, rs_c,  4+4*rs_c);
-		sgemm_SAVE_ACC_bz( &acc6, rs_c,  8+4*rs_c);
-		sgemm_SAVE_ACC_bz( &acc7, rs_c, 12+4*rs_c);
-	}
-}
-
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -0,0 +1,140 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define I16_ACCUMULATE \
+    __builtin_mma_xvi16ger2pp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvi16ger2pp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvi16ger2pp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvi16ger2pp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvi16ger2pp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvi16ger2pp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvi16ger2pp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvi16ger2pp (&acc7, ca[1], rb[3]);
+/* Step A0 past ca[0..1] (16 shorts) and B0 past rb[0..3] (32 shorts). */
+#define I16_INCREMENT \
+    A0+=16; \
+    B0+=32;
+/* One step. LOAD_VECTORS is defined elsewhere; presumably re-points ca/rb. */
+#define I16_AB_PRODUCT \
+    LOAD_VECTORS \
+    I16_INCREMENT \
+    I16_ACCUMULATE
+
+void bli_i16gemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        int32_t*       restrict alpha,
+        short*     restrict a,
+        short*     restrict b,
+        int32_t*       restrict beta,
+        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // short inputs, int32_t output; one step is peeled before the loops.
+    uint64_t k_iter = (k0-1) / 4;   // passes through the 4x-unrolled loop
+    uint64_t k_left = (k0-1) % 4;   // leftover single steps
+    // NOTE(review): assumes k0 >= 1; k0 == 0 would still run the peeled step.
+    uint64_t rs_c   = rs_c0;
+
+    short* restrict A0 = a;
+    short* restrict B0 = b;
+    int*   restrict C0 = c;
+
+    int alpha_ = *alpha,
+        beta_ = *beta;
+    // Scratch presumably used by the SAVE_ACC* macros (vector_int_macros.h).
+    iv4sf_t result[4];
+    iv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3, 
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+    // Peeled first step: the non-accumulating form initializes acc0..acc7.
+    __builtin_mma_xvi16ger2 (&acc0, ca[0], rb[0]);
+    __builtin_mma_xvi16ger2 (&acc1, ca[0], rb[1]);
+    __builtin_mma_xvi16ger2 (&acc2, ca[0], rb[2]);
+    __builtin_mma_xvi16ger2 (&acc3, ca[0], rb[3]);
+    __builtin_mma_xvi16ger2 (&acc4, ca[1], rb[0]);
+    __builtin_mma_xvi16ger2 (&acc5, ca[1], rb[1]);
+    __builtin_mma_xvi16ger2 (&acc6, ca[1], rb[2]);
+    __builtin_mma_xvi16ger2 (&acc7, ca[1], rb[3]);
+
+    I16_INCREMENT
+
+    // k loop (unrolled by 4)
+    for (int k = 0; k<k_iter; k++)
+    {
+        I16_AB_PRODUCT
+        I16_AB_PRODUCT
+        I16_AB_PRODUCT
+        I16_AB_PRODUCT
+    }
+
+    // edge loop (remaining k_left steps)
+    for (int k = 0; k<k_left; k++)
+    {
+        I16_AB_PRODUCT
+    }
+
+    // handle beta cases (beta_ is an int; the _bz macros are used when it is 0)
+    if (beta_ != 0.0)
+    {
+        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -0,0 +1,140 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define I16S_ACCUMULATE \
+    __builtin_mma_xvi16ger2spp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvi16ger2spp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvi16ger2spp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvi16ger2spp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvi16ger2spp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvi16ger2spp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvi16ger2spp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvi16ger2spp (&acc7, ca[1], rb[3]);
+/* Step A0 past ca[0..1] (16 shorts) and B0 past rb[0..3] (32 shorts). */
+#define I16S_INCREMENT \
+    A0+=16; \
+    B0+=32;
+/* One step. LOAD_VECTORS is defined elsewhere; presumably re-points ca/rb. */
+#define I16S_AB_PRODUCT \
+    LOAD_VECTORS \
+    I16S_INCREMENT \
+    I16S_ACCUMULATE
+
+void bli_i16sgemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        int32_t*       restrict alpha,
+        short*     restrict a,
+        short*     restrict b,
+        int32_t*       restrict beta,
+        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // short inputs, int32_t output, using the "s" (presumably saturating) ger2.
+    uint64_t k_iter = (k0-1) / 4;   // passes through the 4x-unrolled loop
+    uint64_t k_left = (k0-1) % 4;   // leftover single steps
+    // NOTE(review): assumes k0 >= 1; k0 == 0 would still run the peeled step.
+    uint64_t rs_c   = rs_c0;
+
+    short* restrict A0 = a;
+    short* restrict B0 = b;
+    int*   restrict C0 = c;
+
+    int alpha_ = *alpha,
+        beta_ = *beta;
+    // Scratch presumably used by the SAVE_ACC* macros (vector_int_macros.h).
+    iv4sf_t result[4];
+    iv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3, 
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+    // Peeled first step: the non-accumulating form initializes acc0..acc7.
+    __builtin_mma_xvi16ger2s (&acc0, ca[0], rb[0]);
+    __builtin_mma_xvi16ger2s (&acc1, ca[0], rb[1]);
+    __builtin_mma_xvi16ger2s (&acc2, ca[0], rb[2]);
+    __builtin_mma_xvi16ger2s (&acc3, ca[0], rb[3]);
+    __builtin_mma_xvi16ger2s (&acc4, ca[1], rb[0]);
+    __builtin_mma_xvi16ger2s (&acc5, ca[1], rb[1]);
+    __builtin_mma_xvi16ger2s (&acc6, ca[1], rb[2]);
+    __builtin_mma_xvi16ger2s (&acc7, ca[1], rb[3]);
+
+    I16S_INCREMENT
+
+    // k loop (unrolled by 4)
+    for (int k = 0; k<k_iter; k++)
+    {
+        I16S_AB_PRODUCT
+        I16S_AB_PRODUCT
+        I16S_AB_PRODUCT
+        I16S_AB_PRODUCT
+    }
+
+    // edge loop (remaining k_left steps)
+    for (int k = 0; k<k_left; k++)
+    {
+        I16S_AB_PRODUCT
+    }
+
+    // handle beta cases (beta_ is an int; the _bz macros are used when it is 0)
+    if (beta_ != 0.0)
+    {
+        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_i4gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i4gemm_power10_mma.c
@@ -0,0 +1,140 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define I4_ACCUMULATE \
+    __builtin_mma_xvi4ger8pp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvi4ger8pp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvi4ger8pp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvi4ger8pp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvi4ger8pp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvi4ger8pp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvi4ger8pp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvi4ger8pp (&acc7, ca[1], rb[3]);
+
+#define I4_INCREMENT \
+    A0+=32; \
+    B0+=64;
+
+#define I4_AB_PRODUCT \
+    LOAD_VECTORS \
+    I4_INCREMENT \
+    I4_ACCUMULATE
+
+// 8x16 gemm microkernel: 4-bit integer inputs, 32-bit integer output, built
+// on the POWER10 MMA builtins.
+//
+// Updates one 8x16 tile of C as
+//
+//     C := alpha * (A * B) + beta * C
+//
+// Parameters:
+//   k0     Number of update steps. Each step consumes 32 elements of a and
+//          64 elements of b (see I4_INCREMENT). A value <= 0 is treated as
+//          an empty product, so C is only scaled by beta.
+//   alpha  Scalar multiplying the A*B product.
+//   a      Packed micro-panel of A.
+//   b      Packed micro-panel of B.
+//   beta   Scalar multiplying the previous contents of C. If it is zero, C
+//          is overwritten without being read.
+//   c      Tile of C. Every row is accessed as four 4-element vectors, so
+//          the 16 elements of a row must be contiguous in memory.
+//   rs_c0  Distance, in elements, between consecutive rows of C.
+//   cs_c0  Unused.
+//   data   Unused.
+//   cntx   Unused.
+void bli_i4gemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        int32_t*       restrict alpha,
+        nibbles*   restrict a,
+        nibbles*   restrict b,
+        int32_t*       restrict beta,
+        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // One step is peeled out of the loops to initialise the accumulators, so
+    // only k0-1 steps are left for them. The count is clamped at zero so that
+    // k0 <= 0 cannot wrap around when converted to an unsigned value.
+    const uint64_t k_rest = ( k0 > 0 ) ? (uint64_t)( k0 - 1 ) : 0;
+    const uint64_t k_iter = k_rest / 4;
+    const uint64_t k_left = k_rest % 4;
+
+    uint64_t rs_c   = rs_c0;
+
+    nibbles* restrict A0 = a;
+    nibbles* restrict B0 = b;
+    int*     restrict C0 = c;
+
+    // alpha_, beta_, result, rowC and C0 are referenced by name from the
+    // SAVE_ACC / SAVE_ACC_bz macros.
+    int alpha_ = *alpha,
+        beta_  = *beta;
+
+    iv4sf_t result[4];
+    iv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3,
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+
+    if ( k0 > 0 )
+    {
+        // Peeled first step: the non-accumulating form overwrites the
+        // accumulators, so they do not have to be zeroed beforehand.
+        __builtin_mma_xvi4ger8 (&acc0, ca[0], rb[0]);
+        __builtin_mma_xvi4ger8 (&acc1, ca[0], rb[1]);
+        __builtin_mma_xvi4ger8 (&acc2, ca[0], rb[2]);
+        __builtin_mma_xvi4ger8 (&acc3, ca[0], rb[3]);
+        __builtin_mma_xvi4ger8 (&acc4, ca[1], rb[0]);
+        __builtin_mma_xvi4ger8 (&acc5, ca[1], rb[1]);
+        __builtin_mma_xvi4ger8 (&acc6, ca[1], rb[2]);
+        __builtin_mma_xvi4ger8 (&acc7, ca[1], rb[3]);
+
+        I4_INCREMENT
+    }
+    else
+    {
+        // Empty product: start from zeroed accumulators and read nothing
+        // from A or B.
+        __builtin_mma_xxsetaccz (&acc0);
+        __builtin_mma_xxsetaccz (&acc1);
+        __builtin_mma_xxsetaccz (&acc2);
+        __builtin_mma_xxsetaccz (&acc3);
+        __builtin_mma_xxsetaccz (&acc4);
+        __builtin_mma_xxsetaccz (&acc5);
+        __builtin_mma_xxsetaccz (&acc6);
+        __builtin_mma_xxsetaccz (&acc7);
+    }
+
+    // k loop (unrolled by 4)
+    for (uint64_t k = 0; k < k_iter; k++)
+    {
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+        I4_AB_PRODUCT
+    }
+
+    // edge loop
+    for (uint64_t k = 0; k < k_left; k++)
+    {
+        I4_AB_PRODUCT
+    }
+
+    // Write the tile back. Each accumulator covers a 4x4 block of C:
+    // acc0-acc3 are rows 0-3 at column offsets 0/4/8/12, acc4-acc7 are
+    // rows 4-7 at the same column offsets.
+    if (beta_ != 0)
+    {
+        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        // beta == 0: C is not read, so stale contents cannot leak into the
+        // result.
+        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_i8gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i8gemm_power10_mma.c
@@ -0,0 +1,139 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define I8_ACCUMULATE \
+    __builtin_mma_xvi8ger4pp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvi8ger4pp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvi8ger4pp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvi8ger4pp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvi8ger4pp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvi8ger4pp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvi8ger4pp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvi8ger4pp (&acc7, ca[1], rb[3]);
+
+#define I8_INCREMENT \
+    A0+=32; \
+    B0+=64;
+
+#define I8_AB_PRODUCT \
+    LOAD_VECTORS \
+    I8_INCREMENT \
+    I8_ACCUMULATE
+
+// 8x16 gemm microkernel: 8-bit integer inputs, 32-bit integer output, built
+// on the POWER10 MMA builtins.
+//
+// Updates one 8x16 tile of C as
+//
+//     C := alpha * (A * B) + beta * C
+//
+// Parameters:
+//   k0     Number of update steps. Each step consumes 32 elements of a and
+//          64 elements of b (see I8_INCREMENT). A value <= 0 is treated as
+//          an empty product, so C is only scaled by beta.
+//   alpha  Scalar multiplying the A*B product.
+//   a      Packed micro-panel of A.
+//   b      Packed micro-panel of B.
+//   beta   Scalar multiplying the previous contents of C. If it is zero, C
+//          is overwritten without being read.
+//   c      Tile of C. Every row is accessed as four 4-element vectors, so
+//          the 16 elements of a row must be contiguous in memory.
+//   rs_c0  Distance, in elements, between consecutive rows of C.
+//   cs_c0  Unused.
+//   data   Unused.
+//   cntx   Unused.
+void bli_i8gemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        int32_t*       restrict alpha,
+        int8_t*    restrict a,
+        int8_t*    restrict b,
+        int32_t*       restrict beta,
+        int32_t*       restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // One step is peeled out of the loops to initialise the accumulators, so
+    // only k0-1 steps are left for them. The count is clamped at zero so that
+    // k0 <= 0 cannot wrap around when converted to an unsigned value.
+    const uint64_t k_rest = ( k0 > 0 ) ? (uint64_t)( k0 - 1 ) : 0;
+    const uint64_t k_iter = k_rest / 4;
+    const uint64_t k_left = k_rest % 4;
+
+    uint64_t rs_c   = rs_c0;
+
+    int8_t* restrict A0 = a;
+    int8_t* restrict B0 = b;
+    int*    restrict C0 = c;
+
+    // alpha_, beta_, result, rowC and C0 are referenced by name from the
+    // SAVE_ACC / SAVE_ACC_bz macros.
+    int alpha_ = *alpha,
+        beta_  = *beta;
+
+    iv4sf_t result[4];
+    iv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3,
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+
+    if ( k0 > 0 )
+    {
+        // Peeled first step: the non-accumulating form overwrites the
+        // accumulators, so they do not have to be zeroed beforehand.
+        __builtin_mma_xvi8ger4 (&acc0, ca[0], rb[0]);
+        __builtin_mma_xvi8ger4 (&acc1, ca[0], rb[1]);
+        __builtin_mma_xvi8ger4 (&acc2, ca[0], rb[2]);
+        __builtin_mma_xvi8ger4 (&acc3, ca[0], rb[3]);
+        __builtin_mma_xvi8ger4 (&acc4, ca[1], rb[0]);
+        __builtin_mma_xvi8ger4 (&acc5, ca[1], rb[1]);
+        __builtin_mma_xvi8ger4 (&acc6, ca[1], rb[2]);
+        __builtin_mma_xvi8ger4 (&acc7, ca[1], rb[3]);
+
+        I8_INCREMENT
+    }
+    else
+    {
+        // Empty product: start from zeroed accumulators and read nothing
+        // from A or B.
+        __builtin_mma_xxsetaccz (&acc0);
+        __builtin_mma_xxsetaccz (&acc1);
+        __builtin_mma_xxsetaccz (&acc2);
+        __builtin_mma_xxsetaccz (&acc3);
+        __builtin_mma_xxsetaccz (&acc4);
+        __builtin_mma_xxsetaccz (&acc5);
+        __builtin_mma_xxsetaccz (&acc6);
+        __builtin_mma_xxsetaccz (&acc7);
+    }
+
+    // k loop (unrolled by 4)
+    for (uint64_t k = 0; k < k_iter; k++)
+    {
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+        I8_AB_PRODUCT
+    }
+
+    // edge loop
+    for (uint64_t k = 0; k < k_left; k++)
+    {
+        I8_AB_PRODUCT
+    }
+
+    // Write the tile back. Each accumulator covers a 4x4 block of C:
+    // acc0-acc3 are rows 0-3 at column offsets 0/4/8/12, acc4-acc7 are
+    // rows 4-7 at the same column offsets.
+    if (beta_ != 0)
+    {
+        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        // beta == 0: C is not read, so stale contents cannot leak into the
+        // result.
+        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -0,0 +1,141 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define B_ACCUMULATE \
+    __builtin_mma_xvbf16ger2pp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvbf16ger2pp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvbf16ger2pp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvbf16ger2pp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvbf16ger2pp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvbf16ger2pp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvbf16ger2pp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvbf16ger2pp (&acc7, ca[1], rb[3]); 
+
+#define B_INCREMENT \
+    A0+=16; \
+    B0+=32; 
+    
+#define B_AB_PRODUCT \
+    LOAD_VECTORS \
+    B_INCREMENT \
+    B_ACCUMULATE 
+
+
+// 8x16 gemm microkernel: bfloat16 inputs, single-precision output, built on
+// the POWER10 MMA builtins.
+//
+// Updates one 8x16 tile of C as
+//
+//     C := alpha * (A * B) + beta * C
+//
+// Parameters:
+//   k0     Number of update steps. Each step consumes 16 elements of a and
+//          32 elements of b (see B_INCREMENT). A value <= 0 is treated as
+//          an empty product, so C is only scaled by beta.
+//   alpha  Scalar multiplying the A*B product.
+//   a      Packed micro-panel of A.
+//   b      Packed micro-panel of B.
+//   beta   Scalar multiplying the previous contents of C. If it is zero, C
+//          is overwritten without being read.
+//   c      Tile of C. Every row is accessed as four 4-element vectors, so
+//          the 16 elements of a row must be contiguous in memory.
+//   rs_c0  Distance, in elements, between consecutive rows of C.
+//   cs_c0  Unused.
+//   data   Unused.
+//   cntx   Unused.
+void bli_sbgemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        float*     restrict alpha,
+        bfloat16*  restrict a,
+        bfloat16*  restrict b,
+        float*     restrict beta,
+        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // One step is peeled out of the loops to initialise the accumulators, so
+    // only k0-1 steps are left for them. The count is clamped at zero so that
+    // k0 <= 0 cannot wrap around when converted to an unsigned value.
+    const uint64_t k_rest = ( k0 > 0 ) ? (uint64_t)( k0 - 1 ) : 0;
+    const uint64_t k_iter = k_rest / 4;
+    const uint64_t k_left = k_rest % 4;
+
+    uint64_t rs_c   = rs_c0;
+
+    bfloat16* restrict A0 = a;
+    bfloat16* restrict B0 = b;
+    float*    restrict C0 = c;
+
+    // alpha_, beta_, result, rowC and C0 are referenced by name from the
+    // SAVE_ACC / SAVE_ACC_bz macros.
+    float alpha_ = *alpha,
+          beta_  = *beta;
+
+    fv4sf_t result[4];
+    fv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3,
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+
+    if ( k0 > 0 )
+    {
+        // Peeled first step: the non-accumulating form overwrites the
+        // accumulators, so they do not have to be zeroed beforehand.
+        __builtin_mma_xvbf16ger2 (&acc0, ca[0], rb[0]);
+        __builtin_mma_xvbf16ger2 (&acc1, ca[0], rb[1]);
+        __builtin_mma_xvbf16ger2 (&acc2, ca[0], rb[2]);
+        __builtin_mma_xvbf16ger2 (&acc3, ca[0], rb[3]);
+        __builtin_mma_xvbf16ger2 (&acc4, ca[1], rb[0]);
+        __builtin_mma_xvbf16ger2 (&acc5, ca[1], rb[1]);
+        __builtin_mma_xvbf16ger2 (&acc6, ca[1], rb[2]);
+        __builtin_mma_xvbf16ger2 (&acc7, ca[1], rb[3]);
+
+        B_INCREMENT
+    }
+    else
+    {
+        // Empty product: start from zeroed accumulators and read nothing
+        // from A or B.
+        __builtin_mma_xxsetaccz (&acc0);
+        __builtin_mma_xxsetaccz (&acc1);
+        __builtin_mma_xxsetaccz (&acc2);
+        __builtin_mma_xxsetaccz (&acc3);
+        __builtin_mma_xxsetaccz (&acc4);
+        __builtin_mma_xxsetaccz (&acc5);
+        __builtin_mma_xxsetaccz (&acc6);
+        __builtin_mma_xxsetaccz (&acc7);
+    }
+
+    // k loop (unrolled by 4)
+    for (uint64_t k = 0; k < k_iter; k++)
+    {
+        B_AB_PRODUCT
+        B_AB_PRODUCT
+        B_AB_PRODUCT
+        B_AB_PRODUCT
+    }
+
+    // edge loop
+    for (uint64_t k = 0; k < k_left; k++)
+    {
+        B_AB_PRODUCT
+    }
+
+    // Write the tile back. Each accumulator covers a 4x4 block of C:
+    // acc0-acc3 are rows 0-3 at column offsets 0/4/8/12, acc4-acc7 are
+    // rows 4-7 at the same column offsets.
+    if (beta_ != 0.0f)
+    {
+        SAVE_ACC(fv4sf_t, &acc0, rs_c,  0       );
+        SAVE_ACC(fv4sf_t, &acc1, rs_c,  4       );
+        SAVE_ACC(fv4sf_t, &acc2, rs_c,  8       );
+        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12       );
+        SAVE_ACC(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        // beta == 0: C is not read, so NaN/Inf values already stored in C
+        // cannot propagate into the result.
+        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c,  0       );
+        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c,  4       );
+        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c,  8       );
+        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12       );
+        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -0,0 +1,144 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define S_ACCUMULATE \
+        __builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
+        __builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
+        __builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
+        __builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
+        __builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
+        __builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
+        __builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
+        __builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]); 
+
+#define S_INCREMENT \
+        A0+=8; \
+        B0+=16;
+
+#define S_AB_PRODUCT \
+        LOAD_VECTORS \
+        S_INCREMENT \
+        S_ACCUMULATE 
+
+// 8x16 single-precision gemm microkernel built on the POWER10 MMA builtins.
+//
+// Updates one 8x16 tile of C as
+//
+//     C := alpha * (A * B) + beta * C
+//
+// Parameters:
+//   k0     Number of update steps. Each step consumes 8 elements of a and
+//          16 elements of b (see S_INCREMENT). A value <= 0 is treated as
+//          an empty product, so C is only scaled by beta.
+//   alpha  Scalar multiplying the A*B product.
+//   a      Packed micro-panel of A.
+//   b      Packed micro-panel of B.
+//   beta   Scalar multiplying the previous contents of C. If it is zero, C
+//          is overwritten without being read.
+//   c      Tile of C. Every row is accessed as four 4-element vectors, so
+//          the 16 elements of a row must be contiguous in memory.
+//   rs_c0  Distance, in elements, between consecutive rows of C.
+//   cs_c0  Unused.
+//   data   Unused.
+//   cntx   Unused.
+void bli_sgemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        float*     restrict alpha,
+        float*     restrict a,
+        float*     restrict b,
+        float*     restrict beta,
+        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    // One step is peeled out of the loops to initialise the accumulators, so
+    // only k0-1 steps are left for them. The count is clamped at zero so that
+    // k0 <= 0 cannot wrap around when converted to an unsigned value.
+    const uint64_t k_rest = ( k0 > 0 ) ? (uint64_t)( k0 - 1 ) : 0;
+    const uint64_t k_iter = k_rest / 4;
+    const uint64_t k_left = k_rest % 4;
+
+    uint64_t rs_c   = rs_c0;
+
+    // alpha_, beta_, result, rowC and C0 are referenced by name from the
+    // SAVE_ACC / SAVE_ACC_bz macros.
+    fv4sf_t result[4];
+    fv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3,
+                  acc4, acc5, acc6, acc7;
+
+    float* restrict A0 = a;
+    float* restrict B0 = b;
+    float* restrict C0 = c;
+
+    float alpha_ = *alpha,
+          beta_  = *beta;
+
+    // View the current panel positions as 16-byte vectors.
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+
+    if ( k0 > 0 )
+    {
+        // Peeled first step: the non-accumulating form overwrites the
+        // accumulators, so they do not have to be zeroed beforehand.
+        __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
+        __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
+        __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
+        __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
+        __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
+        __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
+        __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
+        __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);
+
+        S_INCREMENT
+    }
+    else
+    {
+        // Empty product: start from zeroed accumulators and read nothing
+        // from A or B.
+        __builtin_mma_xxsetaccz (&acc0);
+        __builtin_mma_xxsetaccz (&acc1);
+        __builtin_mma_xxsetaccz (&acc2);
+        __builtin_mma_xxsetaccz (&acc3);
+        __builtin_mma_xxsetaccz (&acc4);
+        __builtin_mma_xxsetaccz (&acc5);
+        __builtin_mma_xxsetaccz (&acc6);
+        __builtin_mma_xxsetaccz (&acc7);
+    }
+
+    // k loop (unrolled by 4)
+    for (uint64_t k = 0; k < k_iter; k++)
+    {
+        S_AB_PRODUCT
+        S_AB_PRODUCT
+        S_AB_PRODUCT
+        S_AB_PRODUCT
+    }
+
+    // edge loop
+    for (uint64_t k = 0; k < k_left; k++)
+    {
+        S_AB_PRODUCT
+    }
+
+    // Write the tile back. Each accumulator covers a 4x4 block of C:
+    // acc0-acc3 are rows 0-3 at column offsets 0/4/8/12, acc4-acc7 are
+    // rows 4-7 at the same column offsets.
+    if (beta_ != 0.0f)
+    {
+        SAVE_ACC(fv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC(fv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC(fv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        // beta == 0: C is not read, so NaN/Inf values already stored in C
+        // cannot propagate into the result.
+        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c,  0     );
+        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c,  4     );
+        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c,  8     );
+        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12     );
+        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -0,0 +1,141 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vector_int_macros.h"
+
+#define H_ACCUMULATE \
+    __builtin_mma_xvf16ger2pp (&acc0, ca[0], rb[0]); \
+    __builtin_mma_xvf16ger2pp (&acc1, ca[0], rb[1]); \
+    __builtin_mma_xvf16ger2pp (&acc2, ca[0], rb[2]); \
+    __builtin_mma_xvf16ger2pp (&acc3, ca[0], rb[3]); \
+    __builtin_mma_xvf16ger2pp (&acc4, ca[1], rb[0]); \
+    __builtin_mma_xvf16ger2pp (&acc5, ca[1], rb[1]); \
+    __builtin_mma_xvf16ger2pp (&acc6, ca[1], rb[2]); \
+    __builtin_mma_xvf16ger2pp (&acc7, ca[1], rb[3]); 
+
+#define H_INCREMENT \
+    A0+=16; \
+    B0+=32; 
+    
+#define H_AB_PRODUCT \
+    LOAD_VECTORS \
+    H_INCREMENT \
+    H_ACCUMULATE 
+
+
+// 8x16 gemm microkernel: float16 inputs, single-precision output, built on
+// the POWER10 MMA builtins.
+//
+// Updates one 8x16 tile of C as
+//
+//     C := alpha * (A * B) + beta * C
+//
+// Parameters:
+//   k0     Number of update steps. Each step consumes 16 elements of a and
+//          32 elements of b (see H_INCREMENT). A value <= 0 is treated as
+//          an empty product, so C is only scaled by beta.
+//   alpha  Scalar multiplying the A*B product.
+//   a      Packed micro-panel of A.
+//   b      Packed micro-panel of B.
+//   beta   Scalar multiplying the previous contents of C. If it is zero, C
+//          is overwritten without being read.
+//   c      Tile of C. Every row is accessed as four 4-element vectors, so
+//          the 16 elements of a row must be contiguous in memory.
+//   rs_c0  Distance, in elements, between consecutive rows of C.
+//   cs_c0  Unused.
+//   data   Unused.
+//   cntx   Unused.
+void bli_shgemm_power10_mma_8x16
+    (
+        dim_t               k0,
+        float*     restrict alpha,
+        float16*  restrict a,
+        float16*  restrict b,
+        float*     restrict beta,
+        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
+        auxinfo_t* restrict data,
+        cntx_t*    restrict cntx
+    )
+{
+    // One step is peeled out of the loops to initialise the accumulators, so
+    // only k0-1 steps are left for them. The count is clamped at zero so that
+    // k0 <= 0 cannot wrap around when converted to an unsigned value.
+    const uint64_t k_rest = ( k0 > 0 ) ? (uint64_t)( k0 - 1 ) : 0;
+    const uint64_t k_iter = k_rest / 4;
+    const uint64_t k_left = k_rest % 4;
+
+    uint64_t rs_c   = rs_c0;
+
+    float16* restrict A0 = a;
+    float16* restrict B0 = b;
+    float*   restrict C0 = c;
+
+    // alpha_, beta_, result, rowC and C0 are referenced by name from the
+    // SAVE_ACC / SAVE_ACC_bz macros.
+    float alpha_ = *alpha,
+          beta_  = *beta;
+
+    fv4sf_t result[4];
+    fv4sf_t *rowC;
+
+    // accumulators that will hold the matrix product
+    __vector_quad acc0, acc1, acc2, acc3,
+                  acc4, acc5, acc6, acc7;
+
+    vec_t *ca = (vec_t *) A0;
+    vec_t *rb = (vec_t *) B0;
+
+    if ( k0 > 0 )
+    {
+        // Peeled first step: the non-accumulating form overwrites the
+        // accumulators, so they do not have to be zeroed beforehand.
+        __builtin_mma_xvf16ger2 (&acc0, ca[0], rb[0]);
+        __builtin_mma_xvf16ger2 (&acc1, ca[0], rb[1]);
+        __builtin_mma_xvf16ger2 (&acc2, ca[0], rb[2]);
+        __builtin_mma_xvf16ger2 (&acc3, ca[0], rb[3]);
+        __builtin_mma_xvf16ger2 (&acc4, ca[1], rb[0]);
+        __builtin_mma_xvf16ger2 (&acc5, ca[1], rb[1]);
+        __builtin_mma_xvf16ger2 (&acc6, ca[1], rb[2]);
+        __builtin_mma_xvf16ger2 (&acc7, ca[1], rb[3]);
+
+        H_INCREMENT
+    }
+    else
+    {
+        // Empty product: start from zeroed accumulators and read nothing
+        // from A or B.
+        __builtin_mma_xxsetaccz (&acc0);
+        __builtin_mma_xxsetaccz (&acc1);
+        __builtin_mma_xxsetaccz (&acc2);
+        __builtin_mma_xxsetaccz (&acc3);
+        __builtin_mma_xxsetaccz (&acc4);
+        __builtin_mma_xxsetaccz (&acc5);
+        __builtin_mma_xxsetaccz (&acc6);
+        __builtin_mma_xxsetaccz (&acc7);
+    }
+
+    // k loop (unrolled by 4)
+    for (uint64_t k = 0; k < k_iter; k++)
+    {
+        H_AB_PRODUCT
+        H_AB_PRODUCT
+        H_AB_PRODUCT
+        H_AB_PRODUCT
+    }
+
+    // edge loop
+    for (uint64_t k = 0; k < k_left; k++)
+    {
+        H_AB_PRODUCT
+    }
+
+    // Write the tile back. Each accumulator covers a 4x4 block of C:
+    // acc0-acc3 are rows 0-3 at column offsets 0/4/8/12, acc4-acc7 are
+    // rows 4-7 at the same column offsets.
+    if (beta_ != 0.0f)
+    {
+        SAVE_ACC(fv4sf_t, &acc0, rs_c,  0       );
+        SAVE_ACC(fv4sf_t, &acc1, rs_c,  4       );
+        SAVE_ACC(fv4sf_t, &acc2, rs_c,  8       );
+        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12       );
+        SAVE_ACC(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+    else
+    {
+        // beta == 0: C is not read, so NaN/Inf values already stored in C
+        // cannot propagate into the result.
+        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c,  0       );
+        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c,  4       );
+        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c,  8       );
+        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12       );
+        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c,    4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c,  4+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
+        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
+    }
+}
--- a/kernels/power10/3/vector_int_macros.h
+++ b/kernels/power10/3/vector_int_macros.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Common include/defines across the POWER10 MMA microkernels.
+//
+// The macros below are statement fragments that expand inside a kernel body
+// and refer to that kernel's local variables by name; they are not
+// self-contained expressions.
+
+#ifndef BLIS_POWER10_VECTOR_INT_MACROS_H
+#define BLIS_POWER10_VECTOR_INT_MACROS_H
+
+#include "blis.h"
+
+// Issue a dcbt cache-touch hint for the address formed from x and y.
+// Both operands use the "b" constraint so that neither can be allocated to
+// r0, which the base-register slot of an indexed address reads as zero.
+#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "b" (x), "b" (y) : "memory");
+
+// Point ca/rb at the current positions of the packed A and B panels.
+// Requires A0, B0, ca and rb to be declared in the expanding scope.
+#define LOAD_VECTORS \
+        ca = (vec_t *) A0; \
+        rb = (vec_t *) B0;
+
+// Vector views used by the kernels.
+typedef __vector float fv4sf_t;          // float elements (float-output kernels)
+typedef __vector double dv4sf_t;         // double elements
+typedef __vector int32_t iv4sf_t;        // int32_t elements (integer-output kernels)
+typedef __vector unsigned char vec_t;    // untyped view of the packed A/B panels
+
+// Store one accumulator to C with scaling:  C := alpha_ * acc + beta_ * C.
+//
+//   v_t   vector type matching the element type of C
+//   ACC   pointer to the accumulator to store
+//   rs_c  row stride of C, in elements
+//   j     element offset into C0 of the first row written
+//
+// The accumulator is disassembled into result[0..3]; result[i] is combined
+// with the 4-element vector at C0[i*rs_c + j]. Requires result, rowC, C0,
+// alpha_ and beta_ to be declared in the expanding scope.
+#define SAVE_ACC(v_t, ACC, rs_c, j)                \
+    __builtin_mma_disassemble_acc ( (void *) result, ACC);       \
+    rowC = (v_t *) &C0[j];                        \
+    rowC[0] = alpha_ * result[0] + beta_ * rowC[0];    \
+    rowC = (v_t *) &C0[rs_c+j];                     \
+    rowC[0] = alpha_ * result[1] + beta_ * rowC[0];    \
+    rowC = (v_t *) &C0[2*rs_c+j];                   \
+    rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ;   \
+    rowC = (v_t *) &C0[3*rs_c+j];                   \
+    rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
+
+// Same as SAVE_ACC for the beta == 0 case:  C := alpha_ * acc.
+// C is written but never read, so its previous contents do not matter.
+#define SAVE_ACC_bz(v_t, ACC, rs_c, j)                     \
+    __builtin_mma_disassemble_acc ( (void *) result, ACC);     \
+    rowC = (v_t *) &C0[j];                      \
+    rowC[0] = alpha_ * result[0];                      \
+    rowC = (v_t *) &C0[rs_c+j];                     \
+    rowC[0] = alpha_ * result[1];                      \
+    rowC = (v_t *) &C0[2*rs_c+j];                   \
+    rowC[0] = alpha_ * result[2];                      \
+    rowC = (v_t *) &C0[3*rs_c+j];                   \
+    rowC[0] = alpha_ * result[3];
+
+#endif // BLIS_POWER10_VECTOR_INT_MACROS_H
--- a/kernels/power10/bli_kernels_power10.h
+++ b/kernels/power10/bli_kernels_power10.h
@@ -34,4 +34,5 @@

 // gemm
 GEMM_UKR_PROT( double,   d, gemm_power10_mma_8x8  )
-GEMM_UKR_PROT( float,    s, gemm_power10_mma_8x16 )
+GEMM_UKR_PROT( float,    s, gemm_power10_mma_8x16 )
+
--- a/sandbox/power10/POWER10.md
+++ b/sandbox/power10/POWER10.md
@@ -0,0 +1,71 @@
+### Low Precision POWER10 Kernels
+
+This is a special BLIS Sandbox that allows users to call low precision POWER10 `gemm` kernels. 
+
+#### Introduction
+
+This document describes how the low precision POWER10 `gemm` kernels are implemented. The document will also demonstrate how to call the `gemm` kernels. 
+
+**Important: This sandbox does not have the full functionality of BLIS. It can only perform single-threaded GEMM without transposition. At this time, fully functioning POWER10 hardware has not been released. Once hardware has been released, the kernels will be further optimized in areas such as prefetching and cache blocksizes.**
+
+#### Implementation
+
+The kernels are implemented in `generic_gemm.c`. They are instantiated with macro templates. The main template is called `GENERIC_GEMM`. This template is used to create the 5-loop `gemm` function.
+
+The API points are created in `gemm_api.c`. In this file, the API points are wrappers for the functions that are created by the templates in `generic_gemm.c`.
+
+#### Kernels
+
+The following low precision datatypes have POWER10 `gemm` kernels: `IEEE float16, bfloat16, int16, int8, int4`. 
+
+#### Low Precision Types
+
+| BLIS type  | BLIS char | Type definition                        | Used to represent...                 |
+|:-----------|:----------|:---------------------------------------|:-------------------------------------|
+| `float16`    | `h`    | `typedef union { uint16_t v; struct { uint16_t m:10; uint16_t e:5; uint16_t s:1; } bits; }` | IEEE half-precision real numbers        |
+| `bfloat16`   | `b`    | `typedef union { uint16_t v; struct { uint16_t m:7; uint16_t e:8; uint16_t s:1; } bits; }` | Google's half-precision real numbers    |
+| `int16`    | `i16`     | `int16_t`    | 16 bit integers |
+| `int8`     | `i8`       | `int8_t`  | 8 bit integers |
+| `int4`     | `i4`       | `typedef union{ uint8_t v; struct { uint8_t nib1:4; uint8_t nib2:4; } bits; }` | 4 bit integers |
+
+#### Low Precision API
+
+The API that is used for the low precision POWER10 `gemm` kernels is similar to the existing [BLIS basic typed API](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md). The main difference between the two is that in the existing BLIS typed API, there is only one type for the input and output matrices. In the low precision API, however, the input and output matrices have separate types.
+
+Thus the new `gemm` call looks like the following:
+
+```
+void bli_??gemm
+     (
+       trans_t transa,
+       trans_t transb,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       ctype_out*  alpha,
+       ctype_in*   a, inc_t rsa, inc_t csa,
+       ctype_in*   b, inc_t rsb, inc_t csb,
+       ctype_out*  beta,
+       ctype_out*  c, inc_t rsc, inc_t csc
+     );
+```
+
+The first `?` is for the output type. The second `?` is for the input type. 
+
+At this time for IEEE float16 and bfloat16, the only output type is single precision float. For int16, int8, and int4, the only output type is 32 bit int.
+
+
+#### How To Build The Sandbox
+
+Add the following flags when running the configure script to build BLIS correctly.
+
+`CFLAGS="-fPIC -std=c99 -D_ISOC11_SOURCE -D_POSIX_C_SOURCE=200112L" -s power10`
+
+Ensure that you have GCC 10.2 or greater.
+
+
+
+#### References
+
+* [bfloat16 wiki](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format)
+* [IEEE float16 wiki](https://en.wikipedia.org/wiki/Half-precision_floating-point_format)
--- a/sandbox/power10/bli_gemmnat.c
+++ b/sandbox/power10/bli_gemmnat.c
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// This file is needed for the BLIS build system.
+
+#include "blis.h"
+
+/*
+   GENFRONT( opname, cname, imeth ) defines PASTEMAC(opname,imeth), the
+   object-based entry point of an operation.
+
+   The generated function:
+     - calls bli_init_once();
+     - replaces a NULL cntx with the context returned by bli_gks_query_cntx();
+     - builds a function-local rntm_t, copied from the caller's rntm when one
+       is given and initialized from the global settings otherwise, so the
+       front end only ever receives the local copy;
+     - forwards every operand to PASTEMAC(opname,_front), passing NULL as the
+       last argument.
+
+   Note: the cname parameter is not referenced by the macro body.
+*/
+#undef  GENFRONT
+#define GENFRONT( opname, cname, imeth ) \
+\
+void PASTEMAC(opname,imeth) \
+     ( \
+       obj_t*  alpha, \
+       obj_t*  a, \
+       obj_t*  b, \
+       obj_t*  beta, \
+       obj_t*  c, \
+       cntx_t* cntx, \
+       rntm_t* rntm  \
+     ) \
+{ \
+	/* Function-local runtime; this is what the front end receives. */ \
+	rntm_t rntm_copy; \
+\
+	bli_init_once(); \
+\
+	/* No context supplied: use the one returned by the gks. */ \
+	if ( cntx == NULL ) \
+	{ \
+		cntx = bli_gks_query_cntx(); \
+	} \
+\
+	/* Seed the local runtime from the caller's runtime when there is one, */ \
+	/* otherwise from the global settings, then continue with the copy.    */ \
+	if ( rntm != NULL ) \
+	{ \
+		rntm_copy = *rntm; \
+	} \
+	else \
+	{ \
+		bli_rntm_init_from_global( &rntm_copy ); \
+	} \
+	rntm = &rntm_copy; \
+\
+	/* Hand all operands to the operation's front end. */ \
+	PASTEMAC(opname,_front) \
+	( \
+	  alpha, a, b, beta, c, cntx, rntm, NULL \
+	); \
+}
+
+GENFRONT( gemm, gemm, nat )
--- a/sandbox/power10/bli_sandbox.h
+++ b/sandbox/power10/bli_sandbox.h
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of copyright holder(s) nor the names
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_SANDBOX_H
+#define BLIS_SANDBOX_H
+
+#include "blis.h"
+#include "gemm_api.h"
+
+// NOTE: This header is the only header required to be present in the sandbox
+// implementation directory.
+
+// This header is used to create the typedefs needed for low precision
+
+// int4 type: one byte viewed either as a raw value (v) or as two 4-bit
+// bit-fields (bits.nib1, bits.nib2), i.e. a pair of int4 values per byte.
+// NOTE(review): which nibble lands in the low-order half of v depends on the
+// compiler's bit-field allocation order -- confirm for the target ABI.
+typedef union
+{
+    uint8_t v;
+    struct
+    {
+        uint8_t nib1:4;
+        uint8_t nib2:4;
+    } bits;
+} nibbles;
+
+// bfloat16: 16 bits viewed either as a raw value (v) or as three bit-fields
+// of 7, 8 and 1 bits (bits.m, bits.e, bits.s) -- presumably mantissa,
+// exponent and sign, which matches the bfloat16 field widths.
+// NOTE(review): where each field sits inside v depends on the compiler's
+// bit-field allocation order -- confirm for the target ABI.
+typedef union
+{
+    uint16_t v;
+    struct
+    {
+        uint16_t m:7;
+        uint16_t e:8;
+        uint16_t s:1;
+    } bits;
+} bfloat16;
+
+// IEEE float16: 16 bits viewed either as a raw value (v) or as three
+// bit-fields of 10, 5 and 1 bits (bits.m, bits.e, bits.s) -- presumably
+// mantissa, exponent and sign, which matches the half-precision field widths.
+// NOTE(review): where each field sits inside v depends on the compiler's
+// bit-field allocation order -- confirm for the target ABI.
+typedef union
+{
+    uint16_t v;
+    struct
+    {
+        uint16_t m:10;
+        uint16_t e:5;
+        uint16_t s:1;
+    } bits;
+} float16;
+
+// NOTE(review): presumably a page size in bytes; it is not referenced
+// anywhere in this header -- confirm the meaning at its use sites.
+#define P10_PG_SIZE 4096
+
+// One microkernel declaration per supported input type. GEMM_UKR_PROT2 is
+// defined outside this header; judging by the arguments it takes
+// ( input type, output type, type prefix, kernel name ) -- TODO confirm.
+GEMM_UKR_PROT2( bfloat16,   float,  sb, gemm_power10_mma_8x16 )
+GEMM_UKR_PROT2(  float16,   float,  sh, gemm_power10_mma_8x16 )
+GEMM_UKR_PROT2(  int16_t, int32_t, i16, gemm_power10_mma_8x16 )
+GEMM_UKR_PROT2(   int8_t, int32_t,  i8, gemm_power10_mma_8x16 )
+GEMM_UKR_PROT2(  nibbles, int32_t,  i4, gemm_power10_mma_8x16 )
+
+/*
+   RandomMatrixMacro( ch, ctype, rand_func ) defines the function whose
+   signature is produced by RM_PROT( ch, ctype ). The body relies on that
+   signature declaring ap, m, n, rs_a and cs_a.
+
+   Every element (i, j) of the m x n matrix, located at
+   ap + i*rs_a + j*cs_a, is overwritten with rand_func() cast to ctype.
+   Elements are visited row by row, so rand_func() is called m*n times in
+   that order.
+*/
+#define RandomMatrixMacro(ch, ctype, rand_func) \
+    RM_PROT(ch, ctype) \
+    { \
+        for ( int i_row = 0; i_row < m; ++i_row ) \
+        { \
+            for ( int j_col = 0; j_col < n; ++j_col ) \
+            { \
+                *(ap + j_col*cs_a + i_row*rs_a) = (ctype) rand_func(); \
+            } \
+        } \
+    }
+
+/*
+   RandomMatrixBounded( ch, ctype, rand_func ) defines the function whose
+   signature is produced by RM_B_PROT( ch, ctype ). The body relies on that
+   signature declaring ap, m, n, rs_a, cs_a, lower and upper.
+
+   Every element (i, j) of the m x n matrix, located at
+   ap + i*rs_a + j*cs_a, is overwritten with
+
+       rand_func() % (upper - lower + 1) + lower
+
+   converted to ctype. With a rand_func that returns a non-negative integer
+   (as rand() does) the stored values lie in [lower, upper].
+
+   The whole expression is parenthesized before the cast on purpose: a cast
+   binds tighter than %, so casting rand_func() first would truncate it to
+   ctype (possibly to a negative value) before the modulo and could produce
+   values outside the requested range.
+*/
+#define RandomMatrixBounded(ch, ctype, rand_func) \
+    RM_B_PROT(ch, ctype) \
+    { \
+    for ( int i=0; i<m; i++ ) \
+        for ( int j=0; j<n; j++ ) \
+            *(ap + j*cs_a + i*rs_a) = \
+                (ctype) ( rand_func() % (upper - lower + 1) + lower ); \
+    }
+
+// Prototypes of the low precision gemm API functions. The arguments are
+// ( input type, output type, type prefix ); the prefix selects the function
+// name through GEMM_FUNC_NAME in gemm_api.h, i.e. bli_shgemm, bli_sbgemm,
+// bli_i16gemm, bli_i8gemm and bli_i4gemm.
+GEMM_FUNC_PROT(  float16,   float,  sh);
+GEMM_FUNC_PROT( bfloat16,   float,  sb);
+GEMM_FUNC_PROT(  int16_t, int32_t, i16);
+GEMM_FUNC_PROT(   int8_t, int32_t,  i8);
+GEMM_FUNC_PROT(  nibbles, int32_t,  i4);
+
+#endif
--- a/sandbox/power10/gemm_api.c
+++ b/sandbox/power10/gemm_api.c
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// This file contains the API points for the low precision POWER10 GEMM kernels
+
+#include "generic_gemm.h"
+#include "gemm_api.h"
+
+/*
+   GEMM_FUNC defines the typed API function GEMM_FUNC_NAME(ch), i.e.
+   bli_<ch>gemm, for one input/output type pair.
+
+   Macro parameters:
+     ch                        type prefix used to build the function name
+                               and to select GEMM_PASTEMAC(ch)
+     DTYPE_IN                  element type of a and b
+     DTYPE_OUT                 element type of alpha, beta and c
+     A_ALIGNMENT, B_ALIGNMENT  forwarded next to the a and b operands
+     MR, NR, MC, KC, NC        blocking parameters, forwarded unchanged
+
+   Generated function:
+     transa, transb   must both be BLIS_NO_TRANSPOSE; transposition is not
+                      implemented
+     m, n, k          problem dimensions
+     alpha, beta      scalars of the output type
+     a, b, c          matrices with row/column strides (rs?, cs?)
+
+   If a transpose is requested, the function prints a message and returns
+   without calling the kernel driver, so c is left untouched instead of
+   receiving the result of the non-transposed product.
+*/
+#define GEMM_FUNC(ch, DTYPE_IN, DTYPE_OUT, A_ALIGNMENT, B_ALIGNMENT, MR, NR, MC, KC, NC) \
+\
+void GEMM_FUNC_NAME(ch) \
+    ( \
+        trans_t transa, \
+        trans_t transb, \
+        dim_t   m, \
+        dim_t   n, \
+        dim_t   k, \
+        DTYPE_OUT*  alpha, \
+        DTYPE_IN*   a, inc_t rsa, inc_t csa, \
+        DTYPE_IN*   b, inc_t rsb, inc_t csb, \
+        DTYPE_OUT*  beta, \
+        DTYPE_OUT*  c, inc_t rsc, inc_t csc \
+    ) \
+{ \
+\
+    /* Unsupported request: report it and do not compute anything. */ \
+    if (transa != BLIS_NO_TRANSPOSE || transb != BLIS_NO_TRANSPOSE) { \
+        printf("Transpose functionality not implemented yet.\n"); \
+        return; \
+    } \
+\
+    GEMM_PASTEMAC(ch) \
+    ( \
+        MR, NR, MC, KC, NC, \
+        m, n, k, \
+        a, rsa, csa, A_ALIGNMENT, \
+        b, rsb, csb, B_ALIGNMENT, \
+        c, rsc, csc, \
+        alpha, beta \
+    ); \
+}
+
+// Each line expands to a complete function definition, so no trailing
+// semicolon is needed.
+//          ch       dt_in   dt_out a_al b_al MR   NR     MC     KC     NC
+GEMM_FUNC(  sb,   bfloat16,   float,   0,  0,  8,  16,  1664,  1026,  4096)
+GEMM_FUNC(  sh,    float16,   float,   0,  0,  8,  16,  1664,  1026,  4096)
+GEMM_FUNC( i16,    int16_t, int32_t,   0,  0,  8,  16,  1664,  1026,  4096)
+GEMM_FUNC(  i8,     int8_t, int32_t,   0,  0,  8,  16,  1664,  1026,  4096)
+GEMM_FUNC(  i4,    nibbles, int32_t,   0,  0,  8,  16,  1664,  1026,  4096)
--- a/sandbox/power10/gemm_api.h
+++ b/sandbox/power10/gemm_api.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Prototypes and template for the low precision POWER10 GEMM API
+
+// Include guard: this header is pulled in by bli_sandbox.h as well as by
+// gemm_api.c.
+#ifndef GEMM_API_H
+#define GEMM_API_H
+
+// Name of the API function for type prefix ch: bli_<ch>gemm. The extra level
+// of indirection lets ch be macro-expanded before it is pasted.
+#define GEMM_FUNC_NAME_(ch)    bli_ ## ch ## gemm
+#define GEMM_FUNC_NAME(ch)     GEMM_FUNC_NAME_(ch)
+
+// Expands to the prototype of bli_<ch>gemm for input type DTYPE_IN (a, b)
+// and output type DTYPE_OUT (alpha, beta, c). The expansion carries no
+// terminating semicolon; the user of the macro supplies it.
+#define GEMM_FUNC_PROT(DTYPE_IN, DTYPE_OUT, ch) \
+    void GEMM_FUNC_NAME(ch) \
+        ( \
+            trans_t transa, \
+            trans_t transb, \
+            dim_t   m, \
+            dim_t   n, \
+            dim_t   k, \
+            DTYPE_OUT*  alpha, \
+            DTYPE_IN*  a, inc_t rsa, inc_t csa, \
+            DTYPE_IN*  b, inc_t rsb, inc_t csb, \
+            DTYPE_OUT*  beta, \
+            DTYPE_OUT*  c, inc_t rsc, inc_t csc \
+        )
+
+#endif
--- a/sandbox/power10/gemm_pack.c
+++ b/sandbox/power10/gemm_pack.c
@@ -0,0 +1,889 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Templates for different packing routine
+
+#include "gemm_pack.h"
+
+/*
+
+    Details on bit16_dt vector data structure
+
+    Vector X = [ X[0,0] X[0,1] X[1,0] X[1,1] X[2,0] X[2,1] X[3,0] X[3,1] ]
+    Vector Y = [ Y[0,0] Y[0,1] Y[1,0] Y[1,1] Y[2,0] Y[2,1] Y[3,0] Y[3,1] ]
+
+    These bit16_dt vectors represent a 4x2 matrix. Hence, in matrix form it 
+    looks like the following:
+
+    X = [ X[0,0] X[0,1] 
+          X[1,0] X[1,1]
+          X[2,0] X[2,1]
+          X[3,0] X[3,1] ]
+
+    The outer product instruction: xvbf16ger2 (bfloat16 outer product)
+
+    Syntax: 
+
+        xvbf16ger2 ACCUMULATOR A, VECTOR X, VECTOR Y
+
+    Semantics:
+
+        A = X * Y^T
+
+    The generic packing routine would load 8 elements from the same column.
+    This causes an issue since the instruction expects the vector to be a
+    4x2 matrix where the data is packed in contiguous order. Thus, we must make 
+    a packing routine that will interleave the matrix data. Making it so 
+    that when we load the 8 contiguous elements from A, it will represent
+    a 4x2 section of the matrix.
+
+*/
+
+/*
+   Statement helpers for BIT16_PACK_A. They expand inside the packing
+   routine and use its locals: ap, adest, i, p_idx, k, rs_a, cs_a.
+   Each one emits the two-element slot of row (i+ir) and advances adest by 2.
+*/
+
+/* Row (i+ir), columns p_idx and p_idx+1. */
+#define k_even_apack_16(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (p_idx  )*cs_a ]; \
+            adest[1] = ap[ (i+ir)*rs_a + (p_idx+1)*cs_a ]; \
+            adest += 2;
+
+/* Row (i+ir), last column (k-1), followed by 2 zero bytes. */
+#define k_odd_apack_16(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (k-1)*cs_a ]; \
+            memset(adest + 1, 0, 2); \
+            adest += 2;
+
+/* Zero 4 bytes at dest_matrix and advance it by 2 elements. */
+#define pad_macro_16(dest_matrix) \
+            memset((dest_matrix), 0, 4); \
+            (dest_matrix) += 2;
+
+/*
+   BIT16_PACK_A( ch, DTYPE_IN ) defines PACK_FUNC_NAME(ch, A), which packs
+   the m x k matrix ap into apack for the 16-bit kernels.
+
+   Parameters of the generated function:
+     MR            number of rows per packed panel
+     m, k          dimensions of the source matrix
+     ap            source matrix, element (r, c) at ap[ r*rs_a + c*cs_a ]
+     rs_a, cs_a    row and column stride of ap
+     apack         destination buffer, written sequentially
+
+   Layout produced:
+     - rows are processed in panels of MR rows; a short final panel is
+       zero-padded up to MR rows;
+     - within a panel the columns are consumed two at a time, and for every
+       row the elements of both columns are stored next to each other;
+     - if k is odd, the last column is paired with a zero.
+
+   The hand-unrolled path writes exactly 8 rows, so it is only taken when
+   MR == 8. Any other MR goes through the loop-based path, which produces
+   the same layout for an arbitrary panel height.
+*/
+#define BIT16_PACK_A(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, A) \
+    ( \
+        dim_t MR, \
+        int m, int k, \
+        DTYPE_IN* ap, int rs_a, int cs_a, \
+        DTYPE_IN* apack \
+    ) \
+{ \
+    int k_odd = k%2; \
+    int p_idx; \
+\
+    DTYPE_IN* adest = apack; \
+    for (int i=0; i<m; i+=MR) \
+    { \
+        int ib = bli_min(MR, m-i); \
+        if (ib == MR && MR == 8) /* Full size panel, unrolled for 8 rows */ \
+        { \
+            p_idx = 0; \
+            for (int p=0; p<(k/2); p++) \
+            { \
+                k_even_apack_16(0); \
+                k_even_apack_16(1); \
+                k_even_apack_16(2); \
+                k_even_apack_16(3); \
+                k_even_apack_16(4); \
+                k_even_apack_16(5); \
+                k_even_apack_16(6); \
+                k_even_apack_16(7); \
+                p_idx += 2; \
+            } \
+\
+            /* In the case that k is odd, we must pad with 0s */ \
+            if(k_odd) \
+            { \
+                k_odd_apack_16(0); \
+                k_odd_apack_16(1); \
+                k_odd_apack_16(2); \
+                k_odd_apack_16(3); \
+                k_odd_apack_16(4); \
+                k_odd_apack_16(5); \
+                k_odd_apack_16(6); \
+                k_odd_apack_16(7); \
+            } \
+        } \
+\
+        else /* Generic path: pack ib rows, zero-pad the remaining MR-ib */ \
+        { \
+            p_idx = 0; \
+            for (int p=0; p<(k/2); p++) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_even_apack_16(ir); \
+                } \
+                for (int ir=ib; ir<MR; ir++) \
+                { \
+                    pad_macro_16(adest); \
+                } \
+                p_idx += 2; \
+            } \
+\
+            if(k_odd) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_odd_apack_16(ir); \
+                } \
+                for (int ir=ib; ir<MR; ir++) \
+                { \
+                    pad_macro_16(adest); \
+                } \
+            } \
+        } \
+    } \
+}
+
+
+/*
+   Statement helpers for BIT16_PACK_B. They expand inside the packing
+   routine and use its locals: bp, bdest, j, p_idx, k, rs_b, cs_b.
+   Each one emits the two-element slot of column (j+jr) and advances bdest
+   by 2.
+*/
+
+/* Column (j+jr), rows p_idx and p_idx+1. */
+#define k_even_bpack_16(jr) \
+            bdest[0] = bp[ (p_idx  )*rs_b + (j+jr)*cs_b ]; \
+            bdest[1] = bp[ (p_idx+1)*rs_b + (j+jr)*cs_b ]; \
+            bdest += 2;
+
+/* Column (j+jr), last row (k-1), followed by 2 zero bytes. */
+#define k_odd_bpack_16(jr) \
+            bdest[0] = bp[ (k-1)*rs_b + (j+jr)*cs_b ]; \
+            memset(bdest + 1, 0, 2); \
+            bdest += 2;
+
+/*
+   BIT16_PACK_B( ch, DTYPE_IN ) defines PACK_FUNC_NAME(ch, B), which packs
+   the k x n matrix bp into bpack for the 16-bit kernels.
+
+   Parameters of the generated function:
+     NR            number of columns per packed panel
+     k, n          dimensions of the source matrix
+     bp            source matrix, element (r, c) at bp[ r*rs_b + c*cs_b ]
+     rs_b, cs_b    row and column stride of bp
+     bpack         destination buffer, written sequentially
+
+   Layout produced:
+     - columns are processed in panels of NR columns; a short final panel
+       is zero-padded up to NR columns;
+     - within a panel the rows are consumed two at a time, and for every
+       column the elements of both rows are stored next to each other;
+     - if k is odd, the last row is paired with a zero.
+
+   The hand-unrolled path writes exactly 16 columns, so it is only taken
+   when NR == 16. Any other NR goes through the loop-based path, which
+   produces the same layout for an arbitrary panel width.
+*/
+#define BIT16_PACK_B(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, B) \
+    ( \
+        dim_t NR, \
+        int k, int n, \
+        DTYPE_IN* bp, int rs_b, int cs_b, \
+        DTYPE_IN* bpack \
+    ) \
+{ \
+\
+    int k_odd = k%2; \
+    int p_idx; \
+\
+    DTYPE_IN* bdest = bpack; \
+\
+    for( int j=0; j<n; j += NR ) \
+    { \
+        int jb = bli_min(NR, n-j); \
+\
+        if ( jb == NR && NR == 16 ) /* Full width panel, unrolled for 16 columns */ \
+        { \
+            p_idx = 0; \
+            for ( int p=0; p<(k/2); p++ ) \
+            { \
+                k_even_bpack_16(0); \
+                k_even_bpack_16(1); \
+                k_even_bpack_16(2); \
+                k_even_bpack_16(3); \
+                k_even_bpack_16(4); \
+                k_even_bpack_16(5); \
+                k_even_bpack_16(6); \
+                k_even_bpack_16(7); \
+                k_even_bpack_16(8); \
+                k_even_bpack_16(9); \
+                k_even_bpack_16(10); \
+                k_even_bpack_16(11); \
+                k_even_bpack_16(12); \
+                k_even_bpack_16(13); \
+                k_even_bpack_16(14); \
+                k_even_bpack_16(15); \
+                p_idx += 2; \
+            } \
+\
+            /* In the case that k is odd, we must pad with 0s */ \
+            if(k_odd) \
+            { \
+                k_odd_bpack_16(0); \
+                k_odd_bpack_16(1); \
+                k_odd_bpack_16(2); \
+                k_odd_bpack_16(3); \
+                k_odd_bpack_16(4); \
+                k_odd_bpack_16(5); \
+                k_odd_bpack_16(6); \
+                k_odd_bpack_16(7); \
+                k_odd_bpack_16(8); \
+                k_odd_bpack_16(9); \
+                k_odd_bpack_16(10); \
+                k_odd_bpack_16(11); \
+                k_odd_bpack_16(12); \
+                k_odd_bpack_16(13); \
+                k_odd_bpack_16(14); \
+                k_odd_bpack_16(15); \
+            } \
+        } \
+\
+        else /* Generic path: pack jb columns, zero-pad the remaining NR-jb */ \
+        { \
+            p_idx = 0; \
+            for ( int p=0; p<(k/2); p++ ) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_even_bpack_16(jr); \
+                } \
+                for ( int jr=jb; jr<NR; jr++ ) \
+                { \
+                    pad_macro_16(bdest); \
+                } \
+                p_idx += 2; \
+            } \
+\
+            if(k_odd) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_odd_bpack_16(jr); \
+                } \
+                for ( int jr=jb; jr<NR; jr++ ) \
+                { \
+                    pad_macro_16(bdest); \
+                } \
+            } \
+        } \
+    } \
+}
+
+
+
+/* 8 bit packing routines */
+
+/*
+   Statement helpers for BIT8_PACK_A. They expand inside the packing
+   routine and use its locals: ap, adest, i, p_idx, k, rs_a, cs_a.
+   Each one emits the four-element slot of row (i+ir) and advances adest
+   by 4.
+*/
+
+/* Row (i+ir), columns p_idx .. p_idx+3. */
+#define k_even_apack_8(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (p_idx  )*cs_a ]; \
+            adest[1] = ap[ (i+ir)*rs_a + (p_idx+1)*cs_a ]; \
+            adest[2] = ap[ (i+ir)*rs_a + (p_idx+2)*cs_a ]; \
+            adest[3] = ap[ (i+ir)*rs_a + (p_idx+3)*cs_a ]; \
+            adest += 4;
+
+/* Row (i+ir), last three columns, followed by 1 zero byte. */
+#define k_left3_apack_8(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (k-3)*cs_a ]; \
+            adest[1] = ap[ (i+ir)*rs_a + (k-2)*cs_a ]; \
+            adest[2] = ap[ (i+ir)*rs_a + (k-1)*cs_a ]; \
+            memset(adest + 3, 0, 1); \
+            adest += 4;
+
+/* Row (i+ir), last two columns, followed by 2 zero bytes. */
+#define k_left2_apack_8(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (k-2)*cs_a ]; \
+            adest[1] = ap[ (i+ir)*rs_a + (k-1)*cs_a ]; \
+            memset(adest + 2, 0, 2); \
+            adest += 4;
+
+/* Row (i+ir), last column, followed by 3 zero bytes. */
+#define k_left1_apack_8(ir) \
+            adest[0] = ap[ (i+ir)*rs_a + (k-1)*cs_a ]; \
+            memset(adest + 1, 0, 3); \
+            adest += 4;
+
+/* Zero 4 bytes at dest_matrix and advance it by 4 elements. */
+#define pad_macro_8(dest_matrix) \
+            memset((dest_matrix), 0, 4); \
+            (dest_matrix) += 4;
+
+
+/*
+   BIT8_PACK_A( ch, DTYPE_IN ) defines PACK_FUNC_NAME(ch, A), which packs
+   the m x k matrix ap into apack for the 8-bit kernels.
+
+   Parameters of the generated function:
+     MR            number of rows per packed panel
+     m, k          dimensions of the source matrix
+     ap            source matrix, element (r, c) at ap[ r*rs_a + c*cs_a ]
+     rs_a, cs_a    row and column stride of ap
+     apack         destination buffer, written sequentially
+
+   Layout produced:
+     - rows are processed in panels of MR rows; a short final panel is
+       zero-padded up to MR rows;
+     - within a panel the columns are consumed four at a time, and for
+       every row the elements of the four columns are stored contiguously;
+     - if k is not a multiple of 4, the remaining 1, 2 or 3 columns are
+       completed with zeros to a group of four.
+
+   The hand-unrolled path writes exactly 8 rows, so it is only taken when
+   MR == 8. Any other MR goes through the loop-based path, which produces
+   the same layout for an arbitrary panel height.
+*/
+#define BIT8_PACK_A(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, A) \
+    ( \
+        dim_t MR, \
+        int m, int k, \
+        DTYPE_IN* ap, int rs_a, int cs_a, \
+        DTYPE_IN* apack \
+    ) \
+{ \
+    int k_left = k%4; \
+    int k_iter = k/4; \
+    int p_idx; \
+\
+    DTYPE_IN* adest = apack; \
+\
+    /* Each panel must be packed in this format */ \
+    for (int i=0; i<m; i+=MR) \
+    { \
+        int ib = bli_min(MR, m-i); \
+\
+        if (ib == MR && MR == 8) /* Full size panel, unrolled for 8 rows */ \
+        { \
+            p_idx = 0; \
+            for (int p=0; p<k_iter; p++) \
+            { \
+                k_even_apack_8(0); \
+                k_even_apack_8(1); \
+                k_even_apack_8(2); \
+                k_even_apack_8(3); \
+                k_even_apack_8(4); \
+                k_even_apack_8(5); \
+                k_even_apack_8(6); \
+                k_even_apack_8(7); \
+                p_idx += 4; \
+            } \
+\
+            /* If k is not a multiple of 4, pad the last group with 0s */ \
+            if(k_left==3) \
+            { \
+                k_left3_apack_8(0); \
+                k_left3_apack_8(1); \
+                k_left3_apack_8(2); \
+                k_left3_apack_8(3); \
+                k_left3_apack_8(4); \
+                k_left3_apack_8(5); \
+                k_left3_apack_8(6); \
+                k_left3_apack_8(7); \
+            } \
+            else if(k_left==2) \
+            { \
+                k_left2_apack_8(0); \
+                k_left2_apack_8(1); \
+                k_left2_apack_8(2); \
+                k_left2_apack_8(3); \
+                k_left2_apack_8(4); \
+                k_left2_apack_8(5); \
+                k_left2_apack_8(6); \
+                k_left2_apack_8(7); \
+            } \
+            else if(k_left==1) \
+            { \
+                k_left1_apack_8(0); \
+                k_left1_apack_8(1); \
+                k_left1_apack_8(2); \
+                k_left1_apack_8(3); \
+                k_left1_apack_8(4); \
+                k_left1_apack_8(5); \
+                k_left1_apack_8(6); \
+                k_left1_apack_8(7); \
+            } \
+        } \
+\
+        else /* Generic path: pack ib rows, zero-pad the remaining MR-ib */ \
+        { \
+            p_idx = 0; \
+            for (int p=0; p<k_iter; p++) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_even_apack_8(ir); \
+                } \
+                for (int ir=ib; ir<MR; ir++) \
+                { \
+                    pad_macro_8(adest); \
+                } \
+                p_idx += 4; \
+            } \
+\
+            if(k_left==3) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_left3_apack_8(ir); \
+                } \
+            } \
+            else if(k_left==2) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_left2_apack_8(ir); \
+                } \
+            } \
+            else if(k_left==1) \
+            { \
+                for (int ir=0; ir<ib; ir++) \
+                { \
+                    k_left1_apack_8(ir); \
+                } \
+            } \
+            /* Missing rows of the remainder group are all zeros */ \
+            if(k_left!=0) \
+            { \
+                for (int ir=ib; ir<MR; ir++) { \
+                    pad_macro_8(adest); \
+                } \
+            } \
+        } \
+    } \
+}
+
+
+/*
+   Statement helpers for BIT8_PACK_B. They expand inside the packing
+   routine and use its locals: bp, bdest, j, p_idx, k, rs_b, cs_b.
+   Each one emits the four-element slot of column (j+jr) and advances bdest
+   by 4.
+*/
+
+/* Column (j+jr), rows p_idx .. p_idx+3. */
+#define k_even_bpack_8(jr) \
+            bdest[0] = bp[ (p_idx  )*rs_b + (j+jr)*cs_b ]; \
+            bdest[1] = bp[ (p_idx+1)*rs_b + (j+jr)*cs_b ]; \
+            bdest[2] = bp[ (p_idx+2)*rs_b + (j+jr)*cs_b ]; \
+            bdest[3] = bp[ (p_idx+3)*rs_b + (j+jr)*cs_b ]; \
+            bdest += 4;
+
+/* Column (j+jr), last three rows, followed by 1 zero byte. */
+#define k_left3_bpack_8(jr) \
+            bdest[0] = bp[ (k-3)*rs_b + (j+jr)*cs_b ]; \
+            bdest[1] = bp[ (k-2)*rs_b + (j+jr)*cs_b ]; \
+            bdest[2] = bp[ (k-1)*rs_b + (j+jr)*cs_b ]; \
+            memset(bdest + 3, 0, 1); \
+            bdest += 4;
+
+/* Column (j+jr), last two rows, followed by 2 zero bytes. */
+#define k_left2_bpack_8(jr) \
+            bdest[0] = bp[ (k-2)*rs_b + (j+jr)*cs_b ]; \
+            bdest[1] = bp[ (k-1)*rs_b + (j+jr)*cs_b ]; \
+            memset(bdest + 2, 0, 2); \
+            bdest += 4;
+
+/* Column (j+jr), last row, followed by 3 zero bytes. */
+#define k_left1_bpack_8(jr) \
+            bdest[0] = bp[ (k-1)*rs_b + (j+jr)*cs_b ]; \
+            memset(bdest + 1, 0, 3); \
+            bdest += 4;
+
+
+/*
+   BIT8_PACK_B( ch, DTYPE_IN ) defines PACK_FUNC_NAME(ch, B), which packs
+   the k x n matrix bp into bpack for the 8-bit kernels.
+
+   Parameters of the generated function:
+     NR            number of columns per packed panel
+     k, n          dimensions of the source matrix
+     bp            source matrix, element (r, c) at bp[ r*rs_b + c*cs_b ]
+     rs_b, cs_b    row and column stride of bp
+     bpack         destination buffer, written sequentially
+
+   Layout produced:
+     - columns are processed in panels of NR columns; a short final panel
+       is zero-padded up to NR columns;
+     - within a panel the rows are consumed four at a time, and for every
+       column the elements of the four rows are stored contiguously;
+     - if k is not a multiple of 4, the remaining 1, 2 or 3 rows are
+       completed with zeros to a group of four.
+
+   The hand-unrolled path writes exactly 16 columns, so it is only taken
+   when NR == 16. Any other NR goes through the loop-based path, which
+   produces the same layout for an arbitrary panel width.
+*/
+#define BIT8_PACK_B(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, B) \
+    ( \
+        dim_t NR, \
+        int k, int n, \
+        DTYPE_IN* bp, int rs_b, int cs_b, \
+        DTYPE_IN* bpack \
+    ) \
+{ \
+    int k_left = k%4; \
+    int k_iter = k/4; \
+    int p_idx; \
+\
+    DTYPE_IN* bdest = bpack; \
+\
+    for( int j=0; j<n; j += NR ) \
+    { \
+        int jb = bli_min(NR, n-j); \
+\
+        if ( jb == NR && NR == 16 ) /* Full width panel, unrolled for 16 columns */ \
+        { \
+            p_idx = 0; \
+            for ( int p=0; p<k_iter; p++ ) \
+            { \
+                k_even_bpack_8(0); \
+                k_even_bpack_8(1); \
+                k_even_bpack_8(2); \
+                k_even_bpack_8(3); \
+                k_even_bpack_8(4); \
+                k_even_bpack_8(5); \
+                k_even_bpack_8(6); \
+                k_even_bpack_8(7); \
+                k_even_bpack_8(8); \
+                k_even_bpack_8(9); \
+                k_even_bpack_8(10); \
+                k_even_bpack_8(11); \
+                k_even_bpack_8(12); \
+                k_even_bpack_8(13); \
+                k_even_bpack_8(14); \
+                k_even_bpack_8(15); \
+                p_idx += 4; \
+            } \
+\
+            /* If k is not a multiple of 4, pad the last group with 0s */ \
+            if(k_left==3) \
+            { \
+                k_left3_bpack_8(0); \
+                k_left3_bpack_8(1); \
+                k_left3_bpack_8(2); \
+                k_left3_bpack_8(3); \
+                k_left3_bpack_8(4); \
+                k_left3_bpack_8(5); \
+                k_left3_bpack_8(6); \
+                k_left3_bpack_8(7); \
+                k_left3_bpack_8(8); \
+                k_left3_bpack_8(9); \
+                k_left3_bpack_8(10); \
+                k_left3_bpack_8(11); \
+                k_left3_bpack_8(12); \
+                k_left3_bpack_8(13); \
+                k_left3_bpack_8(14); \
+                k_left3_bpack_8(15); \
+            } \
+            else if(k_left==2) \
+            { \
+                k_left2_bpack_8(0); \
+                k_left2_bpack_8(1); \
+                k_left2_bpack_8(2); \
+                k_left2_bpack_8(3); \
+                k_left2_bpack_8(4); \
+                k_left2_bpack_8(5); \
+                k_left2_bpack_8(6); \
+                k_left2_bpack_8(7); \
+                k_left2_bpack_8(8); \
+                k_left2_bpack_8(9); \
+                k_left2_bpack_8(10); \
+                k_left2_bpack_8(11); \
+                k_left2_bpack_8(12); \
+                k_left2_bpack_8(13); \
+                k_left2_bpack_8(14); \
+                k_left2_bpack_8(15); \
+            } \
+            else if(k_left==1) \
+            { \
+                k_left1_bpack_8(0); \
+                k_left1_bpack_8(1); \
+                k_left1_bpack_8(2); \
+                k_left1_bpack_8(3); \
+                k_left1_bpack_8(4); \
+                k_left1_bpack_8(5); \
+                k_left1_bpack_8(6); \
+                k_left1_bpack_8(7); \
+                k_left1_bpack_8(8); \
+                k_left1_bpack_8(9); \
+                k_left1_bpack_8(10); \
+                k_left1_bpack_8(11); \
+                k_left1_bpack_8(12); \
+                k_left1_bpack_8(13); \
+                k_left1_bpack_8(14); \
+                k_left1_bpack_8(15); \
+            } \
+        } \
+\
+        else /* Generic path: pack jb columns, zero-pad the remaining NR-jb */ \
+        { \
+            p_idx = 0; \
+            for ( int p=0; p<k_iter; p++ ) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_even_bpack_8(jr); \
+                } \
+                for ( int jr=jb; jr<NR; jr++ ) \
+                { \
+                    pad_macro_8(bdest); \
+                } \
+                p_idx += 4; \
+            } \
+\
+            if(k_left==3) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_left3_bpack_8(jr); \
+                } \
+            } \
+            else if(k_left==2) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_left2_bpack_8(jr); \
+                } \
+            } \
+            else if(k_left==1) \
+            { \
+                for ( int jr=0; jr<jb; jr++ ) \
+                { \
+                    k_left1_bpack_8(jr); \
+                } \
+            } \
+            /* Missing columns of the remainder group are all zeros */ \
+            if (k_left!=0) \
+            { \
+                for ( int jr=jb; jr<NR; jr++ ) { \
+                    pad_macro_8(bdest); \
+                } \
+            } \
+        } \
+    } \
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/*                            Packing Routines                                */
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+
+    Memory is byte-addressed. This results in two options when dealing with 
+    int4. Either store 1 int4 value in a byte, or store 2 int4 values in 1 
+    byte. The former is wasteful in storage, but it makes for a simpler
+    packing routine. However, we want to not waste any storage if possible. 
+    Therefore I went with the latter when designing my int4 kernel. 
+
+    The int4 outerproduct instruction expects a 4x8 matrix in row major order 
+    to be loaded into the vector. In order to achieve this 4x8 row major 
+    matrix, we pack as many 4x8 panels from the src matrix into the pack matrix.
+
+    To illustrate how my packing routine works:
+
+    x0  x1  x2  x3  x4  x5  x6  x7
+    x8  x9  x10 x11 x12 x13 x14 x15
+    x16 x17 x18 x19 x20 x21 x22 x23
+    x24 x25 x26 x27 x28 x29 x30 x31
+
+    Assume we have a 4x8 matrix that is stored in column major order. Also,
+    since we are dealing with int4 values, the values are stored as pairs
+    within a union struct, i.e. (x0, x8) are stored together in the same struct.
+    
+    Therefore in order to get the desired 4x8 row major matrix, we must go 
+    through the first row of structs and grab the first int4 value and insert
+    it into the appropriate spot in the pack matrix. This means that after 
+    packing, (x0, x1) will be stored together in the same struct.
+
+    This process then repeats until the entire src matrix is packed in these
+    4x8 row major matrix panels. 
+
+    To handle edge cases, the packing routine will fill in zeros where it is
+    appropriate. 
+    
+*/
+
+#include "i4_macros.h"
+
+// Template for the int4 packing routine of matrix A.
+//
+// Expands to PACK_FUNC_NAME(ch, A), which walks the m x k matrix `ap` in
+// panels of MR int4 rows and writes them contiguously to `apack`. Inside a
+// panel, k is consumed 8 int4 values at a time; each row of such a k-group
+// takes 4 consecutive DTYPE_IN structs (two int4 values per struct), and a
+// trailing partial group (k%8 != 0) is zero-padded to the same size.
+//
+// Notes for maintainers:
+//  - The source holds two int4 rows per struct (nib1/nib2), so the struct-row
+//    index `i` advances by MR/2 per panel.
+//  - The helper macros from i4_macros.h are not hygienic: col_m_order_* read
+//    `p_idx`, the *_kleft* variants read `k`, and apad_col_kleft* read `i`
+//    from this scope by name. They advance `adest` themselves (the edge*
+//    macros are presumed to do the same -- confirm in i4_macros.h).
+//  - The unrolled full-panel path touches struct rows i..i+3, i.e. it is
+//    written for MR == 8.
+//  - zero_out_dest() clears 4 structs but leaves the pointer where it is, so
+//    each zero-padded row must advance `adest` explicitly. Without that, the
+//    padding rows of a short panel take up no space in the packed buffer and
+//    are overwritten by the next k-group.
+#define BIT4_PACK_A(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, A) \
+    ( \
+        dim_t MR, \
+        int m, int k, \
+        DTYPE_IN* ap, int rs_a, int cs_a, \
+        DTYPE_IN* apack \
+    ) \
+{ \
+    int p_idx, k_left, k_iter; \
+    DTYPE_IN* adest = apack; \
+\
+    k_left = k%8; /* int4 values in the trailing partial k-group */ \
+    k_iter = k/8; /* number of complete 8-wide k-groups */ \
+\
+    int i = 0; /* i is used for byte addressing */ \
+    for(int int4_i=0; int4_i<m; int4_i+=MR) { /* pack panels */ \
+\
+        int ib = bli_min(MR, m-int4_i); \
+        p_idx = 0; \
+\
+        if (ib == MR) { /* full size */ \
+            for (int p=0; p<k_iter; p++) { \
+                col_m_order_1(adest, ap, (i+0), rs_a, cs_a); \
+                col_m_order_2(adest, ap, (i+0), rs_a, cs_a); \
+                col_m_order_1(adest, ap, (i+1), rs_a, cs_a); \
+                col_m_order_2(adest, ap, (i+1), rs_a, cs_a); \
+                col_m_order_1(adest, ap, (i+2), rs_a, cs_a); \
+                col_m_order_2(adest, ap, (i+2), rs_a, cs_a); \
+                col_m_order_1(adest, ap, (i+3), rs_a, cs_a); \
+                col_m_order_2(adest, ap, (i+3), rs_a, cs_a); \
+                p_idx += 8; \
+            } \
+\
+            /* handle edge cases if there are any */ \
+            if(k_left == 7) { \
+                apad_col_kleft7(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 6) { \
+                apad_col_kleft6(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 5) { \
+                apad_col_kleft5(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 4) { \
+                apad_col_kleft4(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 3) { \
+                apad_col_kleft3(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 2) { \
+                apad_col_kleft2(adest, ap, rs_a, cs_a); \
+            } \
+            else if(k_left == 1) { \
+                apad_col_kleft1(adest, ap, rs_a, cs_a); \
+            } \
+        } \
+\
+        else { /* not full size */ \
+            for (int p=0; p<k_iter; p++) { \
+                /* even int4 rows live in nib1, odd rows in nib2 */ \
+                for (int ir=0; ir<ib; ir++) { \
+                    if (ir%2==0) { \
+                        col_m_order_1(adest, ap, (i+ir/2), rs_a, cs_a); \
+                    } \
+                    else { \
+                        col_m_order_2(adest, ap, (i+ir/2), rs_a, cs_a); \
+                    } \
+                } \
+                /* pad the panel up to MR rows; step past each zeroed row */ \
+                for (int ir=ib; ir<MR; ir++) { \
+                    zero_out_dest(adest); \
+                    adest += 4; \
+                } \
+                p_idx += 8; \
+            } \
+\
+            /* handle edge cases if there are any */ \
+            if(k_left == 7) { \
+                edge7(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 6) { \
+                edge6(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 5) { \
+                edge5(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 4) { \
+                edge4(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 3) { \
+                edge3(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 2) { \
+                edge2(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+            else if(k_left == 1) { \
+                edge1(adest, ap, i, ib, rs_a, cs_a); \
+            } \
+\
+            /* fill in zeros when an edge case occurs */ \
+            if(k_left!=0) \
+            { \
+                for (int ir=ib; ir<MR; ir++) { \
+                    zero_out_dest(adest); \
+                    adest += 4; \
+                } \
+            } \
+        } \
+        i += (MR/2); \
+    } \
+}
+
+
+// Template for the int4 packing routine of matrix B.
+//
+// Expands to PACK_FUNC_NAME(ch, B), which walks the k x n matrix `bp` in
+// panels of NR int4 columns and writes them contiguously to `bpack`. It is
+// the mirror image of the A routine: the shared helper macros are called
+// with (cs_b, rs_b) swapped, so their "row" stride steps through the columns
+// of B. Inside a panel, k is consumed 8 int4 values at a time; each column of
+// such a k-group takes 4 consecutive DTYPE_IN structs, and a trailing partial
+// group (k%8 != 0) is zero-padded to the same size.
+//
+// Notes for maintainers:
+//  - The source holds two int4 columns per struct (nib1/nib2), so the
+//    struct-column index `j` advances by NR/2 per panel.
+//  - The helper macros from i4_macros.h are not hygienic: col_m_order_* read
+//    `p_idx` and the *_kleft* variants read `k` from this scope by name.
+//    They advance `bdest` themselves (the bpad_col_kleft* and edge* macros
+//    are presumed to do the same -- confirm in i4_macros.h).
+//  - The unrolled full-panel path touches struct columns j..j+7, i.e. it is
+//    written for NR == 16.
+//  - zero_out_dest() clears 4 structs but leaves the pointer where it is, so
+//    each zero-padded column must advance `bdest` explicitly. Without that,
+//    the padding columns of a short panel take up no space in the packed
+//    buffer and are overwritten by the next k-group.
+#define BIT4_PACK_B(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, B) \
+    ( \
+        dim_t NR, \
+        int k, int n, \
+        DTYPE_IN* bp, int rs_b, int cs_b, \
+        DTYPE_IN* bpack \
+    ) \
+{ \
+\
+    int p_idx, k_left, k_iter; \
+    DTYPE_IN* bdest = bpack; \
+\
+    k_left = k%8; /* int4 values in the trailing partial k-group */ \
+    k_iter = k/8; /* number of complete 8-wide k-groups */ \
+\
+    int j = 0; /* j is used for byte addressing */ \
+\
+    for(int int4_j=0; int4_j<n; int4_j+=NR) { /* pack panels */ \
+        int jb = bli_min(NR, n-int4_j); \
+\
+        p_idx = 0; \
+        if (jb == NR) { /* full size */ \
+            for (int p=0; p<k_iter; p++) { \
+                col_m_order_1(bdest, bp, (j+0), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+0), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+1), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+1), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+2), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+2), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+3), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+3), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+4), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+4), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+5), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+5), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+6), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+6), cs_b, rs_b); \
+                col_m_order_1(bdest, bp, (j+7), cs_b, rs_b); \
+                col_m_order_2(bdest, bp, (j+7), cs_b, rs_b); \
+                p_idx += 8; \
+            } \
+\
+            /* handle edge cases if there are any */ \
+            if(k_left == 7) { \
+                bpad_col_kleft7(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 6) { \
+                bpad_col_kleft6(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 5) { \
+                bpad_col_kleft5(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 4) { \
+                bpad_col_kleft4(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 3) { \
+                bpad_col_kleft3(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 2) { \
+                bpad_col_kleft2(bdest, bp, cs_b, rs_b); \
+            } \
+            else if(k_left == 1) { \
+                bpad_col_kleft1(bdest, bp, cs_b, rs_b); \
+            } \
+        } \
+        else { /* not full size */ \
+            for (int p=0; p<k_iter; p++) { \
+                /* even int4 columns live in nib1, odd columns in nib2 */ \
+                for (int jr=0; jr<jb; jr++) { \
+                    if (jr%2==0) { \
+                        col_m_order_1(bdest, bp, (j+jr/2), cs_b, rs_b); \
+                    } \
+                    else { \
+                        col_m_order_2(bdest, bp, (j+jr/2), cs_b, rs_b); \
+                    } \
+                } \
+                /* pad the panel up to NR columns; step past each zeroed one */ \
+                for (int jr=jb; jr<NR; jr++) { \
+                    zero_out_dest(bdest); \
+                    bdest += 4; \
+                } \
+                p_idx += 8; \
+            } \
+\
+            /* handle edge cases if there are any */ \
+            if(k_left == 7) { \
+                edge7(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 6) { \
+                edge6(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 5) { \
+                edge5(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 4) { \
+                edge4(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 3) { \
+                edge3(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 2) { \
+                edge2(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+            else if(k_left == 1) { \
+                edge1(bdest, bp, j, jb, cs_b, rs_b); \
+            } \
+\
+            /* fill in zeros when an edge case occurs */ \
+            if(k_left!=0) \
+            { \
+                for (int jr=jb; jr<NR; jr++) { \
+                    zero_out_dest(bdest); \
+                    bdest += 4; \
+                } \
+            } \
+        } \
+        j += (NR/2); \
+    } \
+}
+
+
+
+// Convenience wrappers: each one expands the A template and the B template
+// of one element width for the datatype tag `ch` / element type DTYPE_IN.
+#define BIT16_PACK_ROUTINES(ch, DTYPE_IN) \
+    BIT16_PACK_A(ch, DTYPE_IN); \
+    BIT16_PACK_B(ch, DTYPE_IN);
+
+#define BIT8_PACK_ROUTINES(ch, DTYPE_IN) \
+    BIT8_PACK_A(ch, DTYPE_IN); \
+    BIT8_PACK_B(ch, DTYPE_IN);
+
+#define BIT4_PACK_ROUTINES(ch, DTYPE_IN) \
+    BIT4_PACK_A(ch, DTYPE_IN); \
+    BIT4_PACK_B(ch, DTYPE_IN);
+
+// Instantiate the packing routines <ch>_packA / <ch>_packB for every
+// datatype declared in gemm_pack.h.
+// 16-bit element types.
+BIT16_PACK_ROUTINES(sb, bfloat16);
+BIT16_PACK_ROUTINES(i16, int16_t);
+BIT16_PACK_ROUTINES(sh, float16);
+
+// 8-bit element type.
+BIT8_PACK_ROUTINES(i8, int8_t);
+
+// 4-bit elements, stored two per `nibbles` struct.
+BIT4_PACK_ROUTINES(i4, nibbles);
--- a/sandbox/power10/gemm_pack.h
+++ b/sandbox/power10/gemm_pack.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Templates for packing routines prototypes
+
+#include "bli_sandbox.h"
+
+// Builds the routine name <ch>_pack<mat> (e.g. i8_packA). The second level
+// of indirection lets `ch` and `mat` be macro-expanded before being pasted.
+#define PACK_FUNC_NAME_(ch, mat)  ch ## _pack ## mat
+#define PACK_FUNC_NAME(ch, mat)   PACK_FUNC_NAME_(ch, mat)
+
+// Declares the A and B packing routines for the datatype tagged `ch`.
+// Source matrix and packed buffer both use elements of type DTYPE_IN; the
+// routines take the register block size (MR or NR), the dimensions of the
+// block to pack, the source pointer with its row/column strides, and the
+// destination buffer.
+#define PACK_MACRO_PROTO(ch, DTYPE_IN) \
+\
+void PACK_FUNC_NAME(ch, A)( dim_t MR, \
+                            int m, int k, \
+                            DTYPE_IN* ap, int rs_a, int cs_a, \
+                            DTYPE_IN* apack ); \
+\
+void PACK_FUNC_NAME(ch, B)( dim_t NR, \
+                            int k, int n, \
+                            DTYPE_IN* bp, int rs_b, int cs_b, \
+                            DTYPE_IN* bpack );
+
+// One pair of prototypes per supported low-precision datatype.
+PACK_MACRO_PROTO(sb, bfloat16)
+PACK_MACRO_PROTO(sh, float16)
+PACK_MACRO_PROTO(i16, int16_t)
+PACK_MACRO_PROTO(i8, int8_t)
+PACK_MACRO_PROTO(i4, nibbles)
--- a/sandbox/power10/generic_gemm.c
+++ b/sandbox/power10/generic_gemm.c
@@ -0,0 +1,154 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Using the GENERIC_GEMM template, create GEMM functions for each datatype
+
+#include "generic_gemm.h"
+#include "gemm_pack.h"
+
+// Template for the 5-loop gemm driver of one low-precision datatype.
+//
+// Expands to GEMM_PASTEMAC(ch). The driver blocks n by NC, k by KC and m by
+// MC, packs the current block of B and of A into scratch buffers, and then
+// calls the micro-kernel UK_FUNC on every MR x NR micro-tile of C. The
+// micro-kernel is presumed to compute c := beta*c + alpha*a*b on a full
+// micro-tile -- confirm against the kernel sources.
+//
+// Template parameters:
+//   ch         datatype tag; selects <ch>_packA / <ch>_packB and the name.
+//   DTYPE_IN   element type of A, B and the packed buffers.
+//   DTYPE_OUT  element type of C, alpha and beta.
+//   NEW_PB     expression in terms of the local `pb` (k extent of the current
+//              panel) giving the k count handed to the micro-kernel.
+//   MULT       packed micro-panel stride factor: consecutive A (B)
+//              micro-panels are new_pb*MULT*MR (NR) DTYPE_IN elements apart.
+//   UK_FUNC    micro-kernel to call.
+//
+// Function parameters:
+//   MR, NR          micro-tile dimensions; KC, NC, MC cache block sizes.
+//   m, n, k         problem dimensions.
+//   A, B, C         operands with their row/column strides in elements.
+//   A_align/B_align byte offsets applied to the P10_PG_SIZE-aligned scratch
+//                   buffers before they are used for packing.
+//   alpha, beta     scalars, passed by pointer.
+//
+// Edge micro-tiles are computed into a full-size temporary tile (laid out
+// like C: column-major when rs_c == 1, row-major otherwise) and then merged
+// into the valid part of C.
+//
+// beta is applied only while the first k-panel is accumulated; later panels
+// add into C with a scale of 1. Applying beta on every panel would rescale C
+// once per panel whenever k > KC.
+#define GENERIC_GEMM(ch, DTYPE_IN, DTYPE_OUT, NEW_PB, MULT, UK_FUNC) \
+\
+void GEMM_PASTEMAC(ch) \
+    ( \
+        dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \
+        int m, int n, int k, \
+        DTYPE_IN* restrict A, int rs_a, int cs_a, int A_align, \
+        DTYPE_IN* restrict B, int rs_b, int cs_b, int B_align, \
+        DTYPE_OUT* restrict C, int rs_c, int cs_c, \
+        DTYPE_OUT* alpha, DTYPE_OUT* beta \
+    ) \
+{ \
+    DTYPE_OUT zero = 0.0; \
+    DTYPE_OUT one  = 1.0; \
+    \
+    DTYPE_IN * restrict btilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, B_align + KC * NC * sizeof( DTYPE_IN ) ); \
+    DTYPE_IN * restrict atilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, A_align + MC * KC * sizeof( DTYPE_IN ) ); \
+    \
+    /* Packing into a NULL buffer would be undefined behavior; fail loudly. */ \
+    if ( btilde_sys == NULL || atilde_sys == NULL ) \
+        abort(); \
+    \
+    DTYPE_IN * restrict btilde_usr = ( DTYPE_IN *)((char *)btilde_sys + B_align); \
+    DTYPE_IN * restrict atilde_usr = ( DTYPE_IN *)((char *)atilde_sys + A_align); \
+    \
+    /* Pointer increments between consecutive cache blocks. */ \
+    const int rstep_c = MC*rs_c; \
+    const int cstep_c = NC*cs_c; \
+    \
+    const int rstep_a = MC*rs_a; \
+    const int cstep_a = KC*cs_a; \
+    \
+    const int rstep_b = KC*rs_b; \
+    const int cstep_b = NC*cs_b; \
+    \
+    /* Pointer increments between consecutive micro-tiles of C. */ \
+    const int rstep_mt_c = MR*rs_c; \
+    const int cstep_mt_c = NR*cs_c; \
+    \
+    DTYPE_OUT * restrict cblock = C; \
+    DTYPE_IN  * restrict bblock = B; \
+    \
+    /* Scratch micro-tile for edge cases, stored with the same orientation as C. */ \
+    DTYPE_OUT tmp_cmicrotile[MR*NR];  \
+    int   rs_ct = ( rs_c == 1 ? 1 : NR ); \
+    int   cs_ct = ( rs_c == 1 ? MR : 1 ); \
+    \
+    for ( int jc=0; jc<n; jc+=NC ) \
+    { \
+        int jb = bli_min( NC, n-jc ); \
+        DTYPE_IN * restrict apanel = A; \
+        DTYPE_IN * restrict bpanel = bblock; \
+        \
+        for ( int pc=0; pc<k; pc+=KC ) \
+        { \
+            int pb = bli_min( KC, k-pc ); \
+            \
+            /* Scale C by the caller's beta once; afterwards accumulate. */ \
+            DTYPE_OUT * beta_pc     = ( pc == 0 ? beta : &one ); \
+            DTYPE_OUT   beta_pc_val = *beta_pc; \
+            \
+            ch ## _packB \
+            (NR, pb, jb, bpanel, rs_b, cs_b, btilde_usr); \
+            \
+            int new_pb = NEW_PB; \
+            const int a_ps = new_pb * (MULT * MR); \
+            const int b_ps = new_pb * (MULT * NR); \
+            \
+            DTYPE_OUT * restrict cpanel = cblock; \
+            DTYPE_IN  * restrict ablock = apanel; \
+            \
+            for ( int ic=0; ic<m; ic+=MC ) \
+            { \
+                int ib = bli_min( MC, m-ic ); \
+                \
+                ch ## _packA \
+                ( MR, ib, pb, ablock, rs_a, cs_a, atilde_usr ); \
+                \
+                DTYPE_OUT * restrict cmicrotile_col = cpanel; \
+                DTYPE_IN  * restrict bmicropanel = btilde_usr; \
+                \
+                for ( int jr=0; jr<jb; jr+=NR ) \
+                { \
+                    int jrb = bli_min( NR, jb-jr ); \
+                    DTYPE_OUT * restrict cmicrotile = cmicrotile_col; \
+                    DTYPE_IN  * restrict amicropanel = atilde_usr; \
+                    \
+                    for ( int ir=0; ir<ib; ir+=MR ) \
+                    {    \
+                        int irb = bli_min( MR, ib-ir ); \
+                        \
+                        if (jrb == NR && irb == MR) \
+                            UK_FUNC (new_pb, alpha, amicropanel, bmicropanel, beta_pc, cmicrotile, rs_c, cs_c, NULL, NULL); \
+                        else \
+                        { \
+                            /* Edge tile: compute alpha*a*b into the scratch tile, */ \
+                            /* then merge only the valid irb x jrb part into C.   */ \
+                            UK_FUNC (new_pb, alpha, amicropanel, bmicropanel, &zero, tmp_cmicrotile, rs_ct, cs_ct, NULL, NULL); \
+                            \
+                            for (int j=0; j<jrb;j++) \
+                                for (int i=0; i<irb;i++)  \
+                                    cmicrotile[i*rs_c + j*cs_c] = \
+                                        beta_pc_val * cmicrotile[i*rs_c + j*cs_c] + \
+                                        tmp_cmicrotile[i*rs_ct + j*cs_ct]; \
+                        } \
+                        amicropanel += a_ps; \
+                        cmicrotile += rstep_mt_c; \
+                    } \
+                    bmicropanel += b_ps; \
+                    cmicrotile_col += cstep_mt_c; \
+                } \
+                ablock += rstep_a; \
+                cpanel += rstep_c; \
+            } \
+            apanel += cstep_a; \
+            bpanel += rstep_b; \
+        } \
+        cblock += cstep_c; \
+        bblock += cstep_b; \
+    } \
+    free(btilde_sys); \
+    free(atilde_sys); \
+} 
+
+// Instantiate the gemm driver for every supported low-precision datatype.
+//
+// NEW_PB is ceil(pb / g), where g is the number of k values grouped together
+// by the packing routines (2 for 16-bit, 4 for 8-bit, 8 for 4-bit inputs).
+// The comparison must be parenthesized: `+` binds tighter than `>`, so
+// `pb/4 + pb%4>0` would evaluate to 0 or 1 instead of the rounded-up count.
+//
+// MULT is the number of DTYPE_IN elements one row (column) of a packed
+// micro-panel occupies per group. For int4 a group of 8 values is stored in
+// 4 `nibbles` structs (two values per struct), hence 4 rather than 8.
+//
+// The integer output type is spelled int32_t to match the prototypes in
+// generic_gemm.h.
+GENERIC_GEMM( sb, bfloat16,   float,     (pb/2 + pb%2), 2,  bli_sbgemm_power10_mma_8x16);
+GENERIC_GEMM(i16,  int16_t, int32_t,     (pb/2 + pb%2), 2, bli_i16gemm_power10_mma_8x16);
+GENERIC_GEMM( sh,  float16,   float,     (pb/2 + pb%2), 2,  bli_shgemm_power10_mma_8x16); 
+GENERIC_GEMM( i8,   int8_t, int32_t, (pb/4 + (pb%4>0)), 4,  bli_i8gemm_power10_mma_8x16);
+GENERIC_GEMM( i4,  nibbles, int32_t, (pb/8 + (pb%8>0)), 4,  bli_i4gemm_power10_mma_8x16);
--- a/sandbox/power10/generic_gemm.h
+++ b/sandbox/power10/generic_gemm.h
@@ -0,0 +1,58 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Prototypes and template for the 5-loop gemm algorithm
+
+#include "bli_sandbox.h"
+
+// Builds the driver name bli_<ch>gemm_ (e.g. bli_i8gemm_). The second level
+// of indirection lets `ch` be macro-expanded before it is pasted.
+#define GEMM_PASTEMAC_(ch)  bli_ ## ch ## gemm_
+#define GEMM_PASTEMAC(ch)   GEMM_PASTEMAC_(ch)
+
+// Declarator of the gemm driver for the datatype tagged `ch`, without the
+// terminating semicolon: block sizes first, then the problem dimensions, the
+// A and B operands (pointer, strides, alignment offset), the C operand, and
+// finally the alpha/beta scalars by pointer.
+#define GENERIC_GEMM_PROTO(ch, DTYPE_IN, DTYPE_OUT) \
+void GEMM_PASTEMAC(ch)( \
+    dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \
+    int m, int n, int k, \
+    DTYPE_IN* restrict A, int rs_a, int cs_a, int A_align, \
+    DTYPE_IN* restrict B, int rs_b, int cs_b, int B_align, \
+    DTYPE_OUT* restrict C, int rs_c, int cs_c, \
+    DTYPE_OUT* alpha, DTYPE_OUT* beta \
+)
+
+// Floating-point inputs accumulate into float, integer inputs into int32_t.
+GENERIC_GEMM_PROTO( sb, bfloat16,   float);
+GENERIC_GEMM_PROTO( sh,  float16,   float);
+GENERIC_GEMM_PROTO(i16,  int16_t, int32_t);
+GENERIC_GEMM_PROTO( i8,   int8_t, int32_t);
+GENERIC_GEMM_PROTO( i4,  nibbles, int32_t);
+
--- a/sandbox/power10/i4_macros.h
+++ b/sandbox/power10/i4_macros.h
@@ -0,0 +1,545 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// These macros are used for int4 packing
+
+// Zero out one nibbles struct (both int4 values) and step past it.
+// Expands to two statements; use it only inside a braced block.
+#define zero_out_full(dest) \
+    (dest)->v = 0; \
+    (dest)++;
+
+// zero out 4 nibbles struct
+// (clears 4 bytes starting at dest; unlike zero_out_full, this macro does
+// NOT advance dest)
+#define zero_out_dest(dest) \
+    memset(dest, 0, 4);
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+//////////////////////////// Col Major Order Macros ////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+
+    The following macros handle the case when there is a full size panel 
+    (ib/jb == MR/NR) and no edge case (k%8 == 0).
+
+*/
+
+// Packs 8 consecutive k values of one int4 line into 4 destination structs.
+// The values are taken from the FIRST nibble (nib1) of the source structs at
+// offsets rs_mul*rs + (p_idx+0..7)*cs and stored pairwise (nib1, nib2).
+// `p_idx` is read from the calling scope; dest is advanced by 4.
+// Arguments are parenthesized so that expression arguments index correctly.
+#define col_m_order_1(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+0)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+1)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+2)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+3)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+4)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+5)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+6)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+7)*(cs)].bits.nib1; \
+    (dest)++;
+
+// Packs 8 consecutive k values of one int4 line into 4 destination structs.
+// The values are taken from the SECOND nibble (nib2) of the source structs
+// at offsets rs_mul*rs + (p_idx+0..7)*cs and stored pairwise (nib1, nib2).
+// `p_idx` is read from the calling scope; dest is advanced by 4.
+// Arguments are parenthesized so that expression arguments index correctly.
+#define col_m_order_2(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+0)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+1)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+2)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+3)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+4)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+5)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+6)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+7)*(cs)].bits.nib2; \
+    (dest)++;
+
+/*
+
+    The following macros handle the case when there is a full size panel 
+    (ib/jb == MR/NR) and there is an edge case (k%8 != 0).
+
+*/
+
+// col_m_order_{1,2}_kleftN: pack the last N k values (source columns
+// k-N .. k-1) of one int4 line and zero-pad the result to 8 int4 values,
+// i.e. to 4 destination structs. The _1 variants read the first nibble
+// (nib1) of each source struct, the _2 variants the second (nib2).
+// `k` is read from the calling scope; dest is advanced by 4 in every case.
+// Arguments are parenthesized so that expression arguments index correctly.
+
+#define col_m_order_1_kleft7(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-7)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++;
+
+#define col_m_order_2_kleft7(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-7)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++;
+
+#define col_m_order_1_kleft6(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)++; \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft6(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)++; \
+    zero_out_full(dest);
+
+#define col_m_order_1_kleft5(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft5(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest);
+
+#define col_m_order_1_kleft4(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft4(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_1_kleft3(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft3(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)++; \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_1_kleft2(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft2(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_1_kleft1(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+#define col_m_order_2_kleft1(dest, matrix, rs_mul, rs, cs) \
+    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
+    (dest)->bits.nib2 = 0; \
+    (dest)++; \
+    zero_out_full(dest); \
+    zero_out_full(dest); \
+    zero_out_full(dest);
+
+/*
+
+
+    The following macros are used when we have a full panel (ib == MR) 
+    and we need to handle an edge case (k%8 != 0).
+
+    The MR loop is unrolled resulting in the stream of macros.
+
+*/
+
+// apad_col_kleftN: k-edge handling for a full panel of A. Each macro is the
+// unrolled loop over the 8 int4 rows of the panel: source struct rows
+// i .. i+3, first nibble (the _1 helper) followed by second nibble (the _2
+// helper) of each. `i` is read from the calling scope.
+
+#define apad_col_kleft7(dest, matrix, rs, cs) \
+    col_m_order_1_kleft7(dest, matrix, (i+0), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (i+0), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (i+3), rs, cs);
+
+#define apad_col_kleft6(dest, matrix, rs, cs) \
+    col_m_order_1_kleft6(dest, matrix, (i+0), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (i+0), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (i+3), rs, cs);
+
+#define apad_col_kleft5(dest, matrix, rs, cs) \
+    col_m_order_1_kleft5(dest, matrix, (i+0), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (i+0), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (i+3), rs, cs);
+
+#define apad_col_kleft4(dest, matrix, rs, cs) \
+    col_m_order_1_kleft4(dest, matrix, (i+0), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (i+0), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (i+3), rs, cs);
+
+#define apad_col_kleft3(dest, matrix, rs, cs) \
+    col_m_order_1_kleft3(dest, matrix, (i+0), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (i+0), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (i+3), rs, cs);
+
+#define apad_col_kleft2(dest, matrix, rs, cs) \
+    col_m_order_1_kleft2(dest, matrix, (i  ), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (i  ), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (i+3), rs, cs); 
+
+#define apad_col_kleft1(dest, matrix, rs, cs) \
+    col_m_order_1_kleft1(dest, matrix, (i  ), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (i  ), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (i+1), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (i+1), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (i+2), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (i+2), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (i+3), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (i+3), rs, cs); 
+
+/*
+
+    The following macros are used when we have a full panel (jb == NR)
+    and need to handle a k edge case (k%8 != 0).
+
+    The NR loop is unrolled by hand, which is why each macro below is a
+    stream of col_m_order_* invocations.
+
+*/
+
+#define bpad_col_kleft7(dest, matrix, rs, cs) \
+    col_m_order_1_kleft7(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft7(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft7(dest, matrix, (j+7), rs, cs); 
+
+#define bpad_col_kleft6(dest, matrix, rs, cs) \
+    col_m_order_1_kleft6(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft6(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft6(dest, matrix, (j+7), rs, cs); 
+
+#define bpad_col_kleft5(dest, matrix, rs, cs) \
+    col_m_order_1_kleft5(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft5(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft5(dest, matrix, (j+7), rs, cs); 
+
+#define bpad_col_kleft4(dest, matrix, rs, cs) \
+    col_m_order_1_kleft4(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft4(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft4(dest, matrix, (j+7), rs, cs); 
+
+#define bpad_col_kleft3(dest, matrix, rs, cs) \
+    col_m_order_1_kleft3(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft3(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft3(dest, matrix, (j+7), rs, cs);
+
+#define bpad_col_kleft2(dest, matrix, rs, cs) \
+    col_m_order_1_kleft2(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft2(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft2(dest, matrix, (j+7), rs, cs);
+
+#define bpad_col_kleft1(dest, matrix, rs, cs) \
+    col_m_order_1_kleft1(dest, matrix, (j  ), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j  ), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+1), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+1), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+2), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+2), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+3), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+3), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+4), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+4), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+5), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+5), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+6), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+6), rs, cs); \
+    col_m_order_1_kleft1(dest, matrix, (j+7), rs, cs); \
+    col_m_order_2_kleft1(dest, matrix, (j+7), rs, cs);
+
+
+/*
+
+    The following macros handle partial panels (ib != MR or jb != NR) that
+    also hit a k edge case (k%8 != 0).
+
+*/
+
+/*
+    edge(edgefun, ...): generic partial-panel packing loop.
+
+    edgefun  suffix selecting the packing macros, e.g. kleft7 selects
+             col_m_order_1_kleft7 / col_m_order_2_kleft7.
+    dest, matrix, rs, cs
+             forwarded unchanged to the selected macro.
+    panel    base row multiplier; iteration ir passes (panel + ir/2).
+    left     number of loop iterations to run.
+
+    Even iterations expand the col_m_order_1_ variant and odd iterations the
+    col_m_order_2_ variant.
+
+    The macro name is built with a single `##`.  Pasting the result onto the
+    following `(` as well does not form a valid preprocessing token (GCC
+    rejects it), so the argument list simply follows the pasted name.
+*/
+#define edge(edgefun, dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_ ## edgefun (dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_ ## edgefun (dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge7(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft7(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft7(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge6(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft6(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft6(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge5(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft5(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft5(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge4(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft4(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft4(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge3(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft3(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft3(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge2(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft2(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft2(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+
+#define edge1(dest, matrix, panel, left, rs, cs) \
+    for (int ir=0; ir<left; ir++) { \
+        if (ir%2==0) { \
+            col_m_order_1_kleft1(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+        else { \
+            col_m_order_2_kleft1(dest, matrix, (panel+ir/2), rs, cs); \
+        } \
+    }
+