Add low-precision POWER10 gemm kernels (#467)

Details:
- This commit adds a new BLIS sandbox that (1) provides implementations 
  based on low-precision gemm kernels, and (2) extends the BLIS typed 
  API for those new implementations. Currently, these new kernels can 
  only be used for the POWER10 microarchitecture; however, they may 
  provide a template for developing similar kernels for other 
  microarchitectures (even those beyond POWER), as changes would likely 
  be limited to select places in the microkernel and possibly the 
  packing routines. The new low-precision operations that are now 
  supported include: shgemm, sbgemm, i16gemm, i8gemm, i4gemm. For more 
  information, refer to the POWER10.md document that is included in 
  'sandbox/power10'.
This commit is contained in:
Nicholai Tukanov
2021-03-05 13:53:43 -06:00
committed by GitHub
parent b8dcc5bc75
commit 670bc7b60f
24 changed files with 3363 additions and 371 deletions

View File

@@ -0,0 +1,192 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* Combine the four 16-byte A vectors (2 doubles each) into the two vector
   pairs required by the 64-bit MMA outer-product instructions.  Note the
   pair operands are passed high:low (ca[1], ca[0]). */
#define D_ASSEMBLE_VEC_PAIR \
    __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
    __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]);

/* Accumulate the outer product of the current A column with the four B row
   vectors: colA_1 feeds acc0-acc3 (stored at row offset 0 by SAVE_ACC),
   colA_2 feeds acc4-acc7 (stored at row offset 4*rs_c). */
#define D_ACCUMULATE \
    __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
    __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
    __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
    __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
    __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
    __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
    __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
    __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]);

/* Advance the packed A and B buffers by one k step (8 doubles each). */
#define D_INCREMENT \
    A0+=8; \
    B0+=8;

/* One k iteration.  The pointers are advanced before the accumulate, but
   ca/rb were captured by LOAD_VECTORS first, so the accumulate still reads
   the pre-increment data. */
#define D_AB_PRODUCT \
    LOAD_VECTORS \
    D_ASSEMBLE_VEC_PAIR \
    D_INCREMENT \
    D_ACCUMULATE
/* 8x8 double-precision gemm microkernel for POWER10, built on the MMA
   (Matrix-Multiply Assist) outer-product instructions (xvf64ger family).
   Computes C := beta*C + alpha*A*B for an 8x8 block of C from packed A and
   B panels.  Assumes k0 >= 1: one k iteration is peeled off into the
   initial xvf64ger calls, so (k0-1) is used below -- k0 == 0 would wrap
   the unsigned arithmetic. */
void bli_dgemm_power10_mma_8x8
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;
    uint64_t rs_c = rs_c0;

    double* restrict A0 = a;
    double* restrict B0 = b;
    double* restrict C0 = c;

    double alpha_ = *alpha,
           beta_  = *beta;

    // Scratch views used by SAVE_ACC/SAVE_ACC_bz when scaling and storing.
    dv4sf_t result[4];
    dv4sf_t *rowC;

    /* 8 accumulator registers that will be used to store the result.

       Each accumulator register is mapped to 4 vector registers.
       Illustration:

           acc0 = [ vs0
                    vs1
                    vs2
                    vs3 ]

       These registers are used to store the result of an outer product
       instruction (general outer product instruction syntax: xv???ger??). */
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    /* 2 vector pairs are necessary for a double precision outer product
       instruction. */
    __vector_pair colA_1,
                  colA_2;

    /* Prefetch the first four rows of C (two cache lines each) so that it
       stays in cache for the final store phase. */
    PREFETCH1 (C0, 0);
    PREFETCH1 (C0 + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
    PREFETCH1 (C0, 128);
    PREFETCH1 (C0 + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Each accumulator represents a matrix of size
       4 x ( 16 / (datatype size in bytes) )   (vector register size = 16B)

       Thus in the case of double, the accumulate registers represent a 4x2
       matrix. However, a vector register can hold at most 2 doubles. Thus, if
       we performed an outer product using 2 vector register, we can only get a
       2x2 matrix. Therefore, we must create a vector register pair in order
       to get the desired 4x2 matrix.
    */
    D_ASSEMBLE_VEC_PAIR

    /* Peeled first k iteration: xvf64ger OVERWRITES the accumulators, so no
       explicit zero-initialization is needed. */
    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);

    /* Move A and B pointers */
    D_INCREMENT

    // k loop (unrolled by 4)
    for (int k = 0; k<k_iter; k++)
    {
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (int k = 0; k<k_left; k++)
    {
        D_AB_PRODUCT
    }

    // handle beta cases; the _bz variants write C without reading it, so a
    // beta of zero never touches (possibly uninitialized) C.
    if (beta_ != 0.0)
    {
        // NOTE(review): each dv4sf_t row of an accumulator holds only 2
        // doubles, yet the column offsets below step by 4 (the previous
        // kernel stepped by 2: 0,2,4,6).  Verify SAVE_ACC's indexing of C0
        // for the double-precision case -- TODO confirm.
        SAVE_ACC(dv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC(dv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC(dv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC(dv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC(dv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(dv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(dv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(dv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(dv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC_bz(dv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC_bz(dv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC_bz(dv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC_bz(dv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(dv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(dv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(dv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -1,359 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// 2 doubles per 16-byte VSX vector (the "v4sf" name is kept for symmetry
// with the float variant, even though this vector holds 2 doubles).
typedef double dv4sf_t __attribute__ ((vector_size (16)));
// Generic 16-byte vector of raw bytes used as the MMA operand view type.
typedef unsigned char vec_t __attribute__ ((vector_size (16)));

/* Disassemble the accumulator ACC into the `result` array of 4 vectors and
   store it into rows 0..3 of C at column offset j:
       C = alpha*result + beta*C   (general beta case, reads C) */
#define dgemm_SAVE_ACC_(ACC, rs_c, j) \
    __builtin_mma_disassemble_acc (result, ACC); \
    rowC = (dv4sf_t *) &C0[j]; \
    rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
    rowC = (dv4sf_t *) &C0[rs_c+j]; \
    rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
    rowC = (dv4sf_t *) &C0[2*rs_c+j]; \
    rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
    rowC = (dv4sf_t *) &C0[3*rs_c+j]; \
    rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;

/* beta == 0 variant: C is written without ever being read. */
#define dgemm_SAVE_ACC_bz(ACC, rs_c, j) \
    __builtin_mma_disassemble_acc (result, ACC); \
    rowC = (dv4sf_t *) &C0[j]; \
    rowC[0] = alpha_ * result[0]; \
    rowC = (dv4sf_t *) &C0[rs_c+j]; \
    rowC[0] = alpha_ * result[1]; \
    rowC = (dv4sf_t *) &C0[2*rs_c+j]; \
    rowC[0] = alpha_ * result[2]; \
    rowC = (dv4sf_t *) &C0[3*rs_c+j]; \
    rowC[0] = alpha_ * result[3];

// Data-cache-block-touch prefetch of address x + offset y.
#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");

/* Re-point the operand vector views at the (already advanced) pack buffers. */
#define LOAD_VECTORS \
    ca = (vec_t *) A0; \
    rb = (vec_t *) B0;

/* Build the two vector pairs needed by the 64-bit outer-product ops
   (operands passed high:low). */
#define D_ASSEMBLE_VEC_PAIR \
    __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
    __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]);

/* Accumulate outer products: colA_1 feeds acc0-acc3, colA_2 feeds acc4-acc7. */
#define D_ACCUMULATE \
    __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
    __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
    __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
    __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
    __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
    __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
    __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
    __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]);

/* Advance the packed A and B buffers by one k step (8 doubles each). */
#define D_INCREMENT \
    A0+=8; \
    B0+=8;

/* One k iteration: refresh views, assemble pairs, advance, accumulate. */
#define D_AB_PRODUCT \
    LOAD_VECTORS \
    D_ASSEMBLE_VEC_PAIR \
    D_INCREMENT \
    D_ACCUMULATE
/* 8x8 double-precision gemm microkernel for POWER10 using the MMA
   outer-product instructions (xvf64ger/xvf64gerpp).
   Computes C := beta*C + alpha*A*B for an 8x8 block of C.  Assumes
   k0 >= 1 (one k iteration is peeled off into the initial xvf64ger
   calls; k0 == 0 would wrap the unsigned subtractions). */
void bli_dgemm_power10_mma_8x8
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;
    uint64_t rs_c = rs_c0;

    double* restrict A0 = a;
    double* restrict B0 = b;
    double* restrict C0 = c;

    double alpha_ = *alpha,
           beta_  = *beta;

    // Scratch views used by dgemm_SAVE_ACC_* when scaling and storing.
    dv4sf_t result[4];
    dv4sf_t *rowC;

    /* 8 accumulator registers that will be used to store the result.

       Each accumulator register is mapped to 4 vector registers.
       Illustration:

           acc0 = [ vs0
                    vs1
                    vs2
                    vs3 ]

       These registers are used to store the result of an outer product
       instruction (general outer product instruction syntax: xv???ger??). */
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    /* 2 vector pairs are necessary for a double precision outer product
       instruction. */
    __vector_pair colA_1,
                  colA_2;

    /* Prefetch C so that it stays in cache */
    PREFETCH1 (C0, 0);
    PREFETCH1 (C0 + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
    PREFETCH1 (C0, 128);
    PREFETCH1 (C0 + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Each accumulator represents a matrix of size
       4 x ( 16 / (datatype size in bytes) )   (vector register size = 16B)
       (note: the original comment had the ratio inverted)

       Thus in the case of double, the accumulate registers represent a 4x2
       matrix. However, a vector register can hold at most 2 doubles. Thus, if
       we performed an outer product using 2 vector register, we can only get a
       2x2 matrix. Therefore, we must create a vector register pair in order
       to get the desired 4x2 matrix.
    */
    D_ASSEMBLE_VEC_PAIR

    /* Peeled first k iteration: xvf64ger overwrites the accumulators, so no
       explicit zeroing is required. */
    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);

    /* Move A and B pointers */
    D_INCREMENT

    // k loop (unrolled by 4)
    for (int k = 0; k<k_iter; k++)
    {
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (int k = 0; k<k_left; k++)
    {
        D_AB_PRODUCT
    }

    // handle beta cases; the _bz variants never read C.  Column offsets
    // step by 2 because each dv4sf_t row holds 2 doubles (cols 0-1, 2-3,
    // 4-5, 6-7); acc4-acc7 cover rows 4-7 via the 4*rs_c term.
    if (beta_ != 0.0)
    {
        dgemm_SAVE_ACC_(&acc0, rs_c, 0 );
        dgemm_SAVE_ACC_(&acc1, rs_c, 2 );
        dgemm_SAVE_ACC_(&acc2, rs_c, 4 );
        dgemm_SAVE_ACC_(&acc3, rs_c, 6 );
        dgemm_SAVE_ACC_(&acc4, rs_c, 4*rs_c);
        dgemm_SAVE_ACC_(&acc5, rs_c, 2+4*rs_c);
        dgemm_SAVE_ACC_(&acc6, rs_c, 4+4*rs_c);
        dgemm_SAVE_ACC_(&acc7, rs_c, 6+4*rs_c);
    }
    else
    {
        dgemm_SAVE_ACC_bz(&acc0, rs_c, 0 );
        dgemm_SAVE_ACC_bz(&acc1, rs_c, 2 );
        dgemm_SAVE_ACC_bz(&acc2, rs_c, 4 );
        dgemm_SAVE_ACC_bz(&acc3, rs_c, 6 );
        dgemm_SAVE_ACC_bz(&acc4, rs_c, 4*rs_c);
        dgemm_SAVE_ACC_bz(&acc5, rs_c, 2+4*rs_c);
        dgemm_SAVE_ACC_bz(&acc6, rs_c, 4+4*rs_c);
        dgemm_SAVE_ACC_bz(&acc7, rs_c, 6+4*rs_c);
    }
}
// 4 floats per 16-byte VSX vector.
typedef float fv4sf_t __attribute__ ((vector_size (16)));

/* Disassemble accumulator ACC into `result` and store rows 0..3 of it into
   C at column offset j:  C = alpha*result + beta*C  (reads C). */
#define sgemm_SAVE_ACC_(ACC, rs_c, j) \
    __builtin_mma_disassemble_acc (result, ACC); \
    rowC = (fv4sf_t *) &C0[j]; \
    rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
    rowC = (fv4sf_t *) &C0[rs_c+j]; \
    rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
    rowC = (fv4sf_t *) &C0[2*rs_c+j]; \
    rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
    rowC = (fv4sf_t *) &C0[3*rs_c+j]; \
    rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;

/* beta == 0 variant: C is written without being read. */
#define sgemm_SAVE_ACC_bz(ACC, rs_c, j) \
    __builtin_mma_disassemble_acc (result, ACC); \
    rowC = (fv4sf_t *) &C0[j]; \
    rowC[0] = alpha_ * result[0]; \
    rowC = (fv4sf_t *) &C0[rs_c+j]; \
    rowC[0] = alpha_ * result[1]; \
    rowC = (fv4sf_t *) &C0[2*rs_c+j]; \
    rowC[0] = alpha_ * result[2]; \
    rowC = (fv4sf_t *) &C0[3*rs_c+j]; \
    rowC[0] = alpha_ * result[3];

/* 32-bit float outer-product accumulation: ca[0]/ca[1] cover the 8 rows,
   rb[0..3] the 16 columns of the 8x16 microtile. */
#define S_ACCUMULATE \
    __builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
    __builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
    __builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
    __builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
    __builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
    __builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
    __builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
    __builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]);

/* Advance the pack buffers by one k step (8 floats of A, 16 of B). */
#define S_INCREMENT \
    A0+=8; \
    B0+=16;

/* One k iteration: refresh operand views, advance pointers, accumulate
   (ca/rb still reference the pre-increment addresses). */
#define S_AB_PRODUCT \
    LOAD_VECTORS \
    S_INCREMENT \
    S_ACCUMULATE
/* 8x16 single-precision gemm microkernel for POWER10 using the MMA 32-bit
   outer-product instructions (xvf32ger/xvf32gerpp).
   Computes C := beta*C + alpha*A*B for an 8x16 block of C.  Assumes
   k0 >= 1: one k iteration is peeled off into the initial xvf32ger calls,
   so (k0-1) is used below. */
void bli_sgemm_power10_mma_8x16
     (
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;
    uint64_t rs_c = rs_c0;

    // Scratch views used by sgemm_SAVE_ACC_* when scaling and storing C.
    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    float* restrict A0 = a;
    float* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Peeled first k iteration: xvf32ger overwrites the accumulators, so no
       explicit zeroing is required. */
    __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
    __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
    __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
    __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
    __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
    __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
    __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
    __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);

    S_INCREMENT

    // k loop (unrolled by 4)
    for (int k = 0; k<k_iter; k++)
    {
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (int k = 0; k<k_left; k++)
    {
        S_AB_PRODUCT
    }

    // handle beta cases; the _bz variants never read C.  Column offsets
    // step by 4 because each fv4sf_t row holds 4 floats (cols 0-3, 4-7,
    // 8-11, 12-15); acc4-acc7 cover rows 4-7 via the 4*rs_c term.
    if (beta_ != 0.0)
    {
        sgemm_SAVE_ACC_(&acc0, rs_c, 0 );
        sgemm_SAVE_ACC_(&acc1, rs_c, 4 );
        sgemm_SAVE_ACC_(&acc2, rs_c, 8 );
        sgemm_SAVE_ACC_(&acc3, rs_c, 12 );
        sgemm_SAVE_ACC_(&acc4, rs_c, 4*rs_c);
        sgemm_SAVE_ACC_(&acc5, rs_c, 4+4*rs_c);
        sgemm_SAVE_ACC_(&acc6, rs_c, 8+4*rs_c);
        sgemm_SAVE_ACC_(&acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        sgemm_SAVE_ACC_bz( &acc0, rs_c, 0 );
        sgemm_SAVE_ACC_bz( &acc1, rs_c, 4 );
        sgemm_SAVE_ACC_bz( &acc2, rs_c, 8 );
        sgemm_SAVE_ACC_bz( &acc3, rs_c, 12 );
        sgemm_SAVE_ACC_bz( &acc4, rs_c, 4*rs_c);
        sgemm_SAVE_ACC_bz( &acc5, rs_c, 4+4*rs_c);
        sgemm_SAVE_ACC_bz( &acc6, rs_c, 8+4*rs_c);
        sgemm_SAVE_ACC_bz( &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* int16 outer-product accumulation: xvi16ger2pp multiplies pairs of 16-bit
   elements and adds the 32-bit dot products into the accumulator.
   ca[0]/ca[1] cover the 8 rows, rb[0..3] the 16 columns of the microtile. */
#define I16_ACCUMULATE \
    __builtin_mma_xvi16ger2pp (&acc0, ca[0], rb[0]); \
    __builtin_mma_xvi16ger2pp (&acc1, ca[0], rb[1]); \
    __builtin_mma_xvi16ger2pp (&acc2, ca[0], rb[2]); \
    __builtin_mma_xvi16ger2pp (&acc3, ca[0], rb[3]); \
    __builtin_mma_xvi16ger2pp (&acc4, ca[1], rb[0]); \
    __builtin_mma_xvi16ger2pp (&acc5, ca[1], rb[1]); \
    __builtin_mma_xvi16ger2pp (&acc6, ca[1], rb[2]); \
    __builtin_mma_xvi16ger2pp (&acc7, ca[1], rb[3]);

/* Advance the pack buffers by one (rank-2) k step: 16 shorts of A
   (8 rows x 2 k-values) and 32 shorts of B (16 cols x 2 k-values). */
#define I16_INCREMENT \
    A0+=16; \
    B0+=32;

/* One k step: refresh operand views, advance pointers, accumulate
   (ca/rb still reference the pre-increment addresses). */
#define I16_AB_PRODUCT \
    LOAD_VECTORS \
    I16_INCREMENT \
    I16_ACCUMULATE
/*
 * 8x16 int16 gemm microkernel for POWER10 using the MMA integer
 * outer-product instructions (xvi16ger2/xvi16ger2pp).
 *
 * Computes C := beta*C + alpha*A*B for an 8x16 block, with 16-bit inputs
 * accumulated into 32-bit integers.
 *
 * Assumes k0 >= 1: one k iteration is peeled off into the initial
 * non-accumulating xvi16ger2 calls, so (k0-1) is used below (k0 == 0
 * would wrap the unsigned arithmetic).  rs_c0 is the row stride of C;
 * cs_c0 is unused here -- presumably unit column stride is assumed by
 * SAVE_ACC (TODO confirm against vector_int_macros.h).
 */
void bli_i16gemm_power10_mma_8x16
     (
       dim_t               k0,
       int32_t*   restrict alpha,
       short*     restrict a,
       short*     restrict b,
       int32_t*   restrict beta,
       int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // One k iteration is peeled off below, hence (k0-1).
    uint64_t k_iter = (k0-1) / 4;   // iterations of the 4x-unrolled loop
    uint64_t k_left = (k0-1) % 4;   // leftover k iterations
    uint64_t rs_c   = rs_c0;

    short*   restrict A0 = a;
    short*   restrict B0 = b;
    int32_t* restrict C0 = c;       // int32_t to match the interface type

    int32_t alpha_ = *alpha,
            beta_  = *beta;

    // Scratch views used by SAVE_ACC/SAVE_ACC_bz when scaling and storing.
    iv4sf_t result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Peeled first iteration: xvi16ger2 OVERWRITES the accumulators, so no
    // explicit zero-initialization is needed.
    __builtin_mma_xvi16ger2 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi16ger2 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi16ger2 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi16ger2 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi16ger2 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi16ger2 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi16ger2 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi16ger2 (&acc7, ca[1], rb[3]);

    I16_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that the previous `int k` incurred.
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I16_AB_PRODUCT
        I16_AB_PRODUCT
        I16_AB_PRODUCT
        I16_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (uint64_t k = 0; k < k_left; k++)
    {
        I16_AB_PRODUCT
    }

    // handle beta cases: the _bz variants write C without reading it, so a
    // beta of zero never touches (possibly uninitialized) C.  Integer
    // compare (beta_ is int32_t; no float literal needed).
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* int16 outer-product accumulation, saturating variant: xvi16ger2spp
   accumulates the 32-bit dot products with signed saturation instead of
   wraparound.  ca[0]/ca[1] cover the 8 rows, rb[0..3] the 16 columns. */
#define I16S_ACCUMULATE \
    __builtin_mma_xvi16ger2spp (&acc0, ca[0], rb[0]); \
    __builtin_mma_xvi16ger2spp (&acc1, ca[0], rb[1]); \
    __builtin_mma_xvi16ger2spp (&acc2, ca[0], rb[2]); \
    __builtin_mma_xvi16ger2spp (&acc3, ca[0], rb[3]); \
    __builtin_mma_xvi16ger2spp (&acc4, ca[1], rb[0]); \
    __builtin_mma_xvi16ger2spp (&acc5, ca[1], rb[1]); \
    __builtin_mma_xvi16ger2spp (&acc6, ca[1], rb[2]); \
    __builtin_mma_xvi16ger2spp (&acc7, ca[1], rb[3]);

/* Advance the pack buffers by one (rank-2) k step: 16 shorts of A
   (8 rows x 2 k-values) and 32 shorts of B (16 cols x 2 k-values). */
#define I16S_INCREMENT \
    A0+=16; \
    B0+=32;

/* One k step: refresh operand views, advance pointers, accumulate. */
#define I16S_AB_PRODUCT \
    LOAD_VECTORS \
    I16S_INCREMENT \
    I16S_ACCUMULATE
/*
 * 8x16 int16 gemm microkernel for POWER10, saturating variant, using the
 * MMA instructions xvi16ger2s/xvi16ger2spp (32-bit accumulation with
 * signed saturation rather than wraparound).
 *
 * Computes C := beta*C + alpha*A*B for an 8x16 block.
 *
 * Assumes k0 >= 1: one k iteration is peeled off into the initial
 * xvi16ger2s calls, so (k0-1) is used below (k0 == 0 would wrap the
 * unsigned arithmetic).  rs_c0 is the row stride of C; cs_c0 is unused
 * here -- presumably unit column stride is assumed by SAVE_ACC (TODO
 * confirm against vector_int_macros.h).
 */
void bli_i16sgemm_power10_mma_8x16
     (
       dim_t               k0,
       int32_t*   restrict alpha,
       short*     restrict a,
       short*     restrict b,
       int32_t*   restrict beta,
       int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // One k iteration is peeled off below, hence (k0-1).
    uint64_t k_iter = (k0-1) / 4;   // iterations of the 4x-unrolled loop
    uint64_t k_left = (k0-1) % 4;   // leftover k iterations
    uint64_t rs_c   = rs_c0;

    short*   restrict A0 = a;
    short*   restrict B0 = b;
    int32_t* restrict C0 = c;       // int32_t to match the interface type

    int32_t alpha_ = *alpha,
            beta_  = *beta;

    // Scratch views used by SAVE_ACC/SAVE_ACC_bz when scaling and storing.
    iv4sf_t result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Peeled first iteration: xvi16ger2s OVERWRITES the accumulators, so
    // no explicit zero-initialization is needed.
    __builtin_mma_xvi16ger2s (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi16ger2s (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi16ger2s (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi16ger2s (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi16ger2s (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi16ger2s (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi16ger2s (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi16ger2s (&acc7, ca[1], rb[3]);

    I16S_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that the previous `int k` incurred.
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (uint64_t k = 0; k < k_left; k++)
    {
        I16S_AB_PRODUCT
    }

    // handle beta cases: the _bz variants write C without reading it, so a
    // beta of zero never touches (possibly uninitialized) C.  Integer
    // compare (beta_ is int32_t; no float literal needed).
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* int4 outer-product accumulation: xvi4ger8pp multiplies groups of eight
   4-bit elements and adds the 32-bit dot products into the accumulator.
   ca[0]/ca[1] cover the 8 rows, rb[0..3] the 16 columns. */
#define I4_ACCUMULATE \
    __builtin_mma_xvi4ger8pp (&acc0, ca[0], rb[0]); \
    __builtin_mma_xvi4ger8pp (&acc1, ca[0], rb[1]); \
    __builtin_mma_xvi4ger8pp (&acc2, ca[0], rb[2]); \
    __builtin_mma_xvi4ger8pp (&acc3, ca[0], rb[3]); \
    __builtin_mma_xvi4ger8pp (&acc4, ca[1], rb[0]); \
    __builtin_mma_xvi4ger8pp (&acc5, ca[1], rb[1]); \
    __builtin_mma_xvi4ger8pp (&acc6, ca[1], rb[2]); \
    __builtin_mma_xvi4ger8pp (&acc7, ca[1], rb[3]);

/* Advance the pack buffers by one (rank-8) k step: 32 `nibbles` elements
   of A and 64 of B.  NOTE(review): step size depends on how the project
   packs the `nibbles` type -- confirm against the power10 packing code. */
#define I4_INCREMENT \
    A0+=32; \
    B0+=64;

/* One k step: refresh operand views, advance pointers, accumulate. */
#define I4_AB_PRODUCT \
    LOAD_VECTORS \
    I4_INCREMENT \
    I4_ACCUMULATE
/*
 * 8x16 int4 gemm microkernel for POWER10 using the MMA instructions
 * xvi4ger8/xvi4ger8pp (4-bit inputs accumulated into 32-bit integers).
 *
 * Computes C := beta*C + alpha*A*B for an 8x16 block.  `nibbles` is the
 * sandbox's packed 4-bit element type (declared elsewhere in the project).
 *
 * Assumes k0 >= 1: one k iteration is peeled off into the initial
 * xvi4ger8 calls, so (k0-1) is used below (k0 == 0 would wrap the
 * unsigned arithmetic).  rs_c0 is the row stride of C; cs_c0 is unused
 * here -- presumably unit column stride is assumed by SAVE_ACC (TODO
 * confirm against vector_int_macros.h).
 */
void bli_i4gemm_power10_mma_8x16
     (
       dim_t               k0,
       int32_t*   restrict alpha,
       nibbles*   restrict a,
       nibbles*   restrict b,
       int32_t*   restrict beta,
       int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // One k iteration is peeled off below, hence (k0-1).
    uint64_t k_iter = (k0-1) / 4;   // iterations of the 4x-unrolled loop
    uint64_t k_left = (k0-1) % 4;   // leftover k iterations
    uint64_t rs_c   = rs_c0;

    nibbles* restrict A0 = a;
    nibbles* restrict B0 = b;
    int32_t* restrict C0 = c;       // int32_t to match the interface type

    int32_t alpha_ = *alpha,
            beta_  = *beta;

    // Scratch views used by SAVE_ACC/SAVE_ACC_bz when scaling and storing.
    iv4sf_t result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Peeled first iteration: xvi4ger8 OVERWRITES the accumulators, so no
    // explicit zero-initialization is needed.
    __builtin_mma_xvi4ger8 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi4ger8 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi4ger8 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi4ger8 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi4ger8 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi4ger8 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi4ger8 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi4ger8 (&acc7, ca[1], rb[3]);

    I4_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that the previous `int k` incurred.
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I4_AB_PRODUCT
        I4_AB_PRODUCT
        I4_AB_PRODUCT
        I4_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (uint64_t k = 0; k < k_left; k++)
    {
        I4_AB_PRODUCT
    }

    // handle beta cases: the _bz variants write C without reading it, so a
    // beta of zero never touches (possibly uninitialized) C.  Integer
    // compare (beta_ is int32_t; no float literal needed).
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,139 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* int8 outer-product accumulation: xvi8ger4pp multiplies groups of four
   8-bit elements and adds the 32-bit dot products into the accumulator.
   ca[0]/ca[1] cover the 8 rows, rb[0..3] the 16 columns. */
#define I8_ACCUMULATE \
    __builtin_mma_xvi8ger4pp (&acc0, ca[0], rb[0]); \
    __builtin_mma_xvi8ger4pp (&acc1, ca[0], rb[1]); \
    __builtin_mma_xvi8ger4pp (&acc2, ca[0], rb[2]); \
    __builtin_mma_xvi8ger4pp (&acc3, ca[0], rb[3]); \
    __builtin_mma_xvi8ger4pp (&acc4, ca[1], rb[0]); \
    __builtin_mma_xvi8ger4pp (&acc5, ca[1], rb[1]); \
    __builtin_mma_xvi8ger4pp (&acc6, ca[1], rb[2]); \
    __builtin_mma_xvi8ger4pp (&acc7, ca[1], rb[3]);

/* Advance the pack buffers by one (rank-4) k step: 32 int8 of A
   (8 rows x 4 k-values) and 64 int8 of B (16 cols x 4 k-values). */
#define I8_INCREMENT \
    A0+=32; \
    B0+=64;

/* One k step: refresh operand views, advance pointers, accumulate. */
#define I8_AB_PRODUCT \
    LOAD_VECTORS \
    I8_INCREMENT \
    I8_ACCUMULATE
/*
 * 8x16 int8 gemm microkernel for POWER10 using the MMA instructions
 * xvi8ger4/xvi8ger4pp (8-bit inputs accumulated into 32-bit integers).
 *
 * Computes C := beta*C + alpha*A*B for an 8x16 block.
 *
 * Assumes k0 >= 1: one k iteration is peeled off into the initial
 * xvi8ger4 calls, so (k0-1) is used below (k0 == 0 would wrap the
 * unsigned arithmetic).  rs_c0 is the row stride of C; cs_c0 is unused
 * here -- presumably unit column stride is assumed by SAVE_ACC (TODO
 * confirm against vector_int_macros.h).
 */
void bli_i8gemm_power10_mma_8x16
     (
       dim_t               k0,
       int32_t*   restrict alpha,
       int8_t*    restrict a,
       int8_t*    restrict b,
       int32_t*   restrict beta,
       int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // One k iteration is peeled off below, hence (k0-1).
    uint64_t k_iter = (k0-1) / 4;   // iterations of the 4x-unrolled loop
    uint64_t k_left = (k0-1) % 4;   // leftover k iterations
    uint64_t rs_c   = rs_c0;

    int8_t*  restrict A0 = a;
    int8_t*  restrict B0 = b;
    int32_t* restrict C0 = c;       // int32_t to match the interface type

    int32_t alpha_ = *alpha,
            beta_  = *beta;

    // Scratch views used by SAVE_ACC/SAVE_ACC_bz when scaling and storing.
    iv4sf_t result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Peeled first iteration: xvi8ger4 OVERWRITES the accumulators, so no
    // explicit zero-initialization is needed.
    __builtin_mma_xvi8ger4 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi8ger4 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi8ger4 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi8ger4 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi8ger4 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi8ger4 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi8ger4 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi8ger4 (&acc7, ca[1], rb[3]);

    I8_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that the previous `int k` incurred.
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I8_AB_PRODUCT
        I8_AB_PRODUCT
        I8_AB_PRODUCT
        I8_AB_PRODUCT
    }
    // edge loop for the remaining (k0-1) % 4 iterations
    for (uint64_t k = 0; k < k_left; k++)
    {
        I8_AB_PRODUCT
    }

    // handle beta cases: the _bz variants write C without reading it, so a
    // beta of zero never touches (possibly uninitialized) C.  Integer
    // compare (beta_ is int32_t; no float literal needed).
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c, 0 );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c, 4 );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c, 8 );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12 );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"

// Accumulate one rank-2 update: each xvbf16ger2pp instruction multiplies
// pairs of bfloat16 values per lane and adds the fp32 dot products into a
// 4x4 accumulator tile.  acc0-acc3 cover C rows 0-3; acc4-acc7 rows 4-7.
#define B_ACCUMULATE \
__builtin_mma_xvbf16ger2pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvbf16ger2pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvbf16ger2pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvbf16ger2pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvbf16ger2pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvbf16ger2pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvbf16ger2pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvbf16ger2pp (&acc7, ca[1], rb[3]);

// Advance the packed-panel pointers past one rank-2 step:
// 8 rows x 2 k-values = 16 bfloat16 elements of A; 16 cols x 2 = 32 of B.
#define B_INCREMENT \
A0+=16; \
B0+=32;

// One full k-step: refresh the vector views, bump the pointers, accumulate.
#define B_AB_PRODUCT \
LOAD_VECTORS \
B_INCREMENT \
B_ACCUMULATE

/*
    8x16 bfloat16 -> float gemm microkernel for POWER10 MMA.

    Computes C := beta*C + alpha*(A*B) on an 8x16 tile of C, where A and B
    are packed panels of bfloat16 values and C accumulates fp32 results.

    k0      - depth of the packed panels (in rank-2 k groups)
    alpha   - fp32 scalar applied to the A*B product
    beta    - fp32 scalar applied to the existing C tile (0 => overwrite C)
    c       - fp32 C tile; rs_c0 is the row stride of C
    cs_c0   - unused; presumably assumes unit column stride -- TODO confirm
    data, cntx - BLIS bookkeeping, unused by this kernel
*/
void bli_sbgemm_power10_mma_8x16
    (
        dim_t k0,
        float* restrict alpha,
        bfloat16* restrict a,
        bfloat16* restrict b,
        float* restrict beta,
        float* restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t* restrict cntx
    )
{
    // One rank-2 step is peeled off to prime the accumulators below,
    // hence the k0-1.  (Assumes k0 >= 1 -- TODO confirm callers.)
    uint64_t k_iter = (k0-1)/4;
    uint64_t k_left = (k0-1)%4;

    uint64_t rs_c = rs_c0;

    bfloat16* restrict A0 = a;
    bfloat16* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_= *alpha,
          beta_ = *beta;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Prime the accumulators with the first rank-2 product.  The non-"pp"
    // form overwrites the accumulators, so no explicit zeroing is needed.
    __builtin_mma_xvbf16ger2 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvbf16ger2 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvbf16ger2 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvbf16ger2 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvbf16ger2 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvbf16ger2 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvbf16ger2 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvbf16ger2 (&acc7, ca[1], rb[3]);
    B_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that a plain int would incur.
    for (uint64_t k = 0; k<k_iter; k++)
    {
        B_AB_PRODUCT
        B_AB_PRODUCT
        B_AB_PRODUCT
        B_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k<k_left; k++)
    {
        B_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0)
    {
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,144 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"

// Accumulate one rank-1 fp32 update: each xvf32gerpp instruction performs
// a 4x4 outer product and adds it into an accumulator tile.  acc0-acc3
// cover C rows 0-3 (cols 0-15); acc4-acc7 cover C rows 4-7.
#define S_ACCUMULATE \
__builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]);

// Advance the packed-panel pointers past one k iteration:
// 8 fp32 elements of A (one per row), 16 of B (one per column).
#define S_INCREMENT \
A0+=8; \
B0+=16;

// One full k-step: refresh the vector views, bump the pointers, accumulate.
#define S_AB_PRODUCT \
LOAD_VECTORS \
S_INCREMENT \
S_ACCUMULATE

/*
    8x16 fp32 gemm microkernel for POWER10 MMA.

    Computes C := beta*C + alpha*(A*B) on an 8x16 tile of C, where A and B
    are packed fp32 panels.

    k0      - depth of the packed panels
    alpha   - fp32 scalar applied to the A*B product
    beta    - fp32 scalar applied to the existing C tile (0 => overwrite C)
    c       - fp32 C tile; rs_c0 is the row stride of C
    cs_c0   - unused; presumably assumes unit column stride -- TODO confirm
    data, cntx - BLIS bookkeeping, unused by this kernel
*/
void bli_sgemm_power10_mma_8x16
    (
        dim_t k0,
        float* restrict alpha,
        float* restrict a,
        float* restrict b,
        float* restrict beta,
        float* restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t* restrict cntx
    )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled
    // out to prime the accumulators.  Assumes k0 >= 1 -- TODO confirm.)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c = rs_c0;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    float* restrict A0 = a;
    float* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Compute accumulate outer products and override accumulators with
       result.  The non-"pp" form overwrites, so no explicit zeroing is
       needed. */
    __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
    __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
    __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
    __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
    __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
    __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
    __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
    __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);
    S_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that a plain int would incur.
    for (uint64_t k = 0; k<k_iter; k++)
    {
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k<k_left; k++)
    {
        S_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0)
    {
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"

// Accumulate one rank-2 update: each xvf16ger2pp instruction multiplies
// pairs of fp16 values per lane and adds the fp32 dot products into a
// 4x4 accumulator tile.  acc0-acc3 cover C rows 0-3; acc4-acc7 rows 4-7.
#define H_ACCUMULATE \
__builtin_mma_xvf16ger2pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvf16ger2pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvf16ger2pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvf16ger2pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvf16ger2pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvf16ger2pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvf16ger2pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvf16ger2pp (&acc7, ca[1], rb[3]);

// Advance the packed-panel pointers past one rank-2 step:
// 8 rows x 2 k-values = 16 fp16 elements of A; 16 cols x 2 = 32 of B.
#define H_INCREMENT \
A0+=16; \
B0+=32;

// One full k-step: refresh the vector views, bump the pointers, accumulate.
#define H_AB_PRODUCT \
LOAD_VECTORS \
H_INCREMENT \
H_ACCUMULATE

/*
    8x16 fp16 -> float gemm microkernel for POWER10 MMA.

    Computes C := beta*C + alpha*(A*B) on an 8x16 tile of C, where A and B
    are packed panels of IEEE half-precision values and C accumulates fp32
    results.

    k0      - depth of the packed panels (in rank-2 k groups)
    alpha   - fp32 scalar applied to the A*B product
    beta    - fp32 scalar applied to the existing C tile (0 => overwrite C)
    c       - fp32 C tile; rs_c0 is the row stride of C
    cs_c0   - unused; presumably assumes unit column stride -- TODO confirm
    data, cntx - BLIS bookkeeping, unused by this kernel
*/
void bli_shgemm_power10_mma_8x16
    (
        dim_t k0,
        float* restrict alpha,
        float16* restrict a,
        float16* restrict b,
        float* restrict beta,
        float* restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t* restrict cntx
    )
{
    // One rank-2 step is peeled off to prime the accumulators below,
    // hence the k0-1.  (Assumes k0 >= 1 -- TODO confirm callers.)
    uint64_t k_iter = (k0-1)/4;
    uint64_t k_left = (k0-1)%4;

    uint64_t rs_c = rs_c0;

    float16* restrict A0 = a;
    float16* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_= *alpha,
          beta_ = *beta;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // Prime the accumulators with the first rank-2 product.  The non-"pp"
    // form overwrites the accumulators, so no explicit zeroing is needed.
    __builtin_mma_xvf16ger2 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvf16ger2 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvf16ger2 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvf16ger2 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvf16ger2 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvf16ger2 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvf16ger2 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvf16ger2 (&acc7, ca[1], rb[3]);
    H_INCREMENT

    // k loop (unrolled by 4).  uint64_t index avoids the signed/unsigned
    // comparison against k_iter that a plain int would incur.
    for (uint64_t k = 0; k<k_iter; k++)
    {
        H_AB_PRODUCT
        H_AB_PRODUCT
        H_AB_PRODUCT
        H_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k<k_left; k++)
    {
        H_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0)
    {
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0        );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4        );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8        );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12       );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c   );
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c );
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Common include/defines across microkernels
#include "blis.h"

// Data-cache-block-touch prefetch hint: dcbt touches the cache line at
// address x+y.  NOTE(review): the macro expansion already ends in ';', so
// call sites should not add their own semicolon -- confirm usage.
#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");

// Refresh the 128-bit vector views of the packed A/B panels.  Expects
// variables named ca, rb (vec_t*) and A0, B0 (panel pointers) to exist
// in the caller's scope.
#define LOAD_VECTORS \
ca = (vec_t *) A0; \
rb = (vec_t *) B0;

// 128-bit VSX vector element types used by the microkernels.
typedef __vector float fv4sf_t;           // 4 x fp32
typedef __vector double dv4sf_t;          // 2 x fp64, despite the "4sf" name
typedef __vector int32_t iv4sf_t;         // 4 x int32
typedef __vector unsigned char vec_t;     // raw 16-byte vector for MMA loads

// Write one 4x4 accumulator tile to C with scaling:
//   C[row] = alpha_ * acc_row + beta_ * C[row]   for 4 consecutive rows.
// v_t selects the element type (fv4sf_t / iv4sf_t); ACC is a
// __vector_quad*; j is the element offset of the tile's first column.
// Expects result, rowC, C0, alpha_ and beta_ in the caller's scope.
// NOTE(review): multi-statement macro not wrapped in do/while(0) -- safe
// only when invoked as a plain statement, as the kernels do.
#define SAVE_ACC(v_t, ACC, rs_c, j) \
__builtin_mma_disassemble_acc ( (void *) result, ACC); \
rowC = (v_t *) &C0[j]; \
rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
rowC = (v_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
rowC = (v_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
rowC = (v_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;

// Same as SAVE_ACC but for the beta == 0 case: overwrite C without
// reading it (avoids scaling possibly-uninitialized C by zero).
#define SAVE_ACC_bz(v_t, ACC, rs_c, j) \
__builtin_mma_disassemble_acc ( (void *) result, ACC); \
rowC = (v_t *) &C0[j]; \
rowC[0] = alpha_ * result[0]; \
rowC = (v_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1]; \
rowC = (v_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2]; \
rowC = (v_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3];

View File

@@ -34,4 +34,5 @@
// gemm
GEMM_UKR_PROT( double, d, gemm_power10_mma_8x8 )
GEMM_UKR_PROT( float, s, gemm_power10_mma_8x16 )
GEMM_UKR_PROT( float, s, gemm_power10_mma_8x16 )