Add low-precision POWER10 gemm kernels (#467)

Details:
- This commit adds a new BLIS sandbox that (1) provides implementations 
  based on low-precision gemm kernels, and (2) extends the BLIS typed 
  API for those new implementations. Currently, these new kernels can 
  only be used for the POWER10 microarchitecture; however, they may 
  provide a template for developing similar kernels for other 
  microarchitectures (even those beyond POWER), as changes would likely 
  be limited to select places in the microkernel and possibly the 
  packing routines. The new low-precision operations that are now 
  supported include: shgemm, sbgemm, i16gemm, i8gemm, i4gemm. For more 
  information, refer to the POWER10.md document that is included in 
  'sandbox/power10'.
This commit is contained in:
Nicholai Tukanov
2021-03-05 13:53:43 -06:00
committed by GitHub
parent b8dcc5bc75
commit 670bc7b60f
24 changed files with 3363 additions and 371 deletions

View File

@@ -122,9 +122,9 @@ void bli_cntx_init_power10( cntx_t* cntx )
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 576, 576, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1408, 1408, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8192, 8184, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 832, 320, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1026, 960, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
// Update the context with the current architecture's register and cache

View File

@@ -66,8 +66,12 @@ CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=power10 -mtune=power10
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=power10 -mtune=power10
else
$(info $(CC_VENDOR))
$(error gcc is required for this configuration.)
$(error gcc, clang is required for this configuration.)
endif
endif
# Flags specific to reference kernels.
@@ -77,4 +81,3 @@ CRVECFLAGS := $(CKVECFLAGS)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -36,16 +36,18 @@
// Define template prototypes for level-3 micro-kernels.
//
#define GEMM_UKR_PROT( ctype, ch, opname ) \
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype_out* restrict alpha, \
ctype_in* restrict a, \
ctype_in* restrict b, \
ctype_out* restrict beta, \
ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);

View File

@@ -0,0 +1,192 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* Combine the four vectors ca[0..3] into the two vector pairs colA_1 and
   colA_2. Within each pair the operands are passed in swapped order
   (ca[1], ca[0]) / (ca[3], ca[2]). Expects `ca`, `colA_1` and `colA_2`
   to be declared where the macro is expanded. */
#define D_ASSEMBLE_VEC_PAIR \
__builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
__builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]);
/* Accumulate one update into all eight accumulators: acc0..acc3 combine
   colA_1 with rb[0..3], acc4..acc7 combine colA_2 with rb[0..3]. */
#define D_ACCUMULATE \
__builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
__builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
__builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
__builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
__builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
__builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
__builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
__builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]);
/* Advance the A and B cursors by the 8 elements consumed per update. */
#define D_INCREMENT \
A0+=8; \
B0+=8;
/* One full update step. LOAD_VECTORS is not defined in this file
   (presumably it comes from vector_int_macros.h and re-points `ca`/`rb`
   at A0/B0 -- confirm). The cursors are advanced before D_ACCUMULATE,
   which only uses the already captured `ca`/`rb`/pair values. */
#define D_AB_PRODUCT \
LOAD_VECTORS \
D_ASSEMBLE_VEC_PAIR \
D_INCREMENT \
D_ACCUMULATE
/*
   Double-precision gemm microkernel for POWER10 built on the MMA builtins.

   Accumulates k0 updates of the panels `a` and `b` into eight accumulators
   and then hands each accumulator to SAVE_ACC (beta != 0) or SAVE_ACC_bz
   (beta == 0) to update the 8x8 tile at `c`.

   k0           - number of updates. The first update is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two scalars; both are read once on entry.
   a, b         - input panels; each update consumes 8 doubles from each.
   c, rs_c0     - output tile and its row stride, in elements.
   cs_c0, data, cntx - not referenced directly in this function.

   SAVE_ACC, SAVE_ACC_bz, PREFETCH1, LOAD_VECTORS, dv4sf_t and vec_t are not
   defined in this file (presumably they come from vector_int_macros.h).
*/
void bli_dgemm_power10_mma_8x8
    (
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    double* restrict A0 = a;
    double* restrict B0 = b;
    double* restrict C0 = c;

    double alpha_ = *alpha,
           beta_  = *beta;

    // Scratch used by the SAVE_ACC macros (they refer to these names
    // directly -- presumably; the macros are defined outside this file).
    dv4sf_t  result[4];
    dv4sf_t *rowC;

    /* 8 accumulator registers that will be used to store the result.

       Each accumulator register is mapped to 4 vector registers.
       Illustration:

            acc0 = [  vs0
                      vs1
                      vs2
                      vs3  ]

       These registers are used to store the result of an outer product
       instruction (general outer product instruction syntax: xv???ger??). */
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    /* 2 vector pairs are necessary for a double precision outer product
       instruction. */
    __vector_pair colA_1,
                  colA_2;

    /* Prefetch C so that it stays in cache */
    PREFETCH1 (C0, 0);
    PREFETCH1 (C0 + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
    PREFETCH1 (C0, 128);
    PREFETCH1 (C0 + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Each accumulator represents a matrix of size
       4 x ( 16 / (datatype size in bytes) )     (vector register size = 16B)

       Thus in the case of double, the accumulate registers represent a 4x2
       matrix. However, a vector register can hold at most 2 doubles. Thus, if
       we performed an outer product using 2 vector register, we can only get a
       2x2 matrix. Therefore, we must create a vector register pair in order
       to get the desired 4x2 matrix.
    */
    D_ASSEMBLE_VEC_PAIR

    /* Compute accumulate outer products and override accumulators with result */
    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);

    /* Move A and B pointers */
    D_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        D_AB_PRODUCT
    }

    // handle beta cases
    //
    // Every accumulator is a 4x2 tile of doubles, so neighbouring tiles in
    // the same row band start 2 columns apart (0, 2, 4, 6). acc4..acc7 cover
    // the lower four rows, hence the additional 4*rs_c. Offsets of 4/8/12
    // belong to the 4x4 single-precision layout and would store outside the
    // 8-column tile here.
    if (beta_ != 0.0)
    {
        SAVE_ACC(dv4sf_t, &acc0, rs_c, 0         );
        SAVE_ACC(dv4sf_t, &acc1, rs_c, 2         );
        SAVE_ACC(dv4sf_t, &acc2, rs_c, 4         );
        SAVE_ACC(dv4sf_t, &acc3, rs_c, 6         );
        SAVE_ACC(dv4sf_t, &acc4, rs_c,   4*rs_c  );
        SAVE_ACC(dv4sf_t, &acc5, rs_c, 2+4*rs_c  );
        SAVE_ACC(dv4sf_t, &acc6, rs_c, 4+4*rs_c  );
        SAVE_ACC(dv4sf_t, &acc7, rs_c, 6+4*rs_c  );
    }
    else
    {
        SAVE_ACC_bz(dv4sf_t, &acc0, rs_c, 0         );
        SAVE_ACC_bz(dv4sf_t, &acc1, rs_c, 2         );
        SAVE_ACC_bz(dv4sf_t, &acc2, rs_c, 4         );
        SAVE_ACC_bz(dv4sf_t, &acc3, rs_c, 6         );
        SAVE_ACC_bz(dv4sf_t, &acc4, rs_c,   4*rs_c  );
        SAVE_ACC_bz(dv4sf_t, &acc5, rs_c, 2+4*rs_c  );
        SAVE_ACC_bz(dv4sf_t, &acc6, rs_c, 4+4*rs_c  );
        SAVE_ACC_bz(dv4sf_t, &acc7, rs_c, 6+4*rs_c  );
    }
}

View File

@@ -1,359 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* 16-byte vector of doubles (two lanes); used to address rows of C. */
typedef double dv4sf_t __attribute__ ((vector_size (16)));
/* 16-byte vector of raw bytes; used to view the A and B panels. */
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
/* Disassemble the accumulator ACC into `result` (four vectors) and, for
   rows i = 0..3, store alpha_ * result[i] + beta_ * C into the vector at
   C0[i*rs_c + j]. Relies on `result`, `rowC`, `C0`, `alpha_` and `beta_`
   being declared where the macro is expanded. */
#define dgemm_SAVE_ACC_(ACC, rs_c, j) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (dv4sf_t *) &C0[j]; \
rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
rowC = (dv4sf_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
rowC = (dv4sf_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
rowC = (dv4sf_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
/* Variant of dgemm_SAVE_ACC_ that never reads C: each row is overwritten
   with alpha_ * result[i]. */
#define dgemm_SAVE_ACC_bz(ACC, rs_c, j) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (dv4sf_t *) &C0[j]; \
rowC[0] = alpha_ * result[0]; \
rowC = (dv4sf_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1]; \
rowC = (dv4sf_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2]; \
rowC = (dv4sf_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3];
/* Emit a `dcbt` instruction with operands x and y. The expansion already
   ends in a semicolon.
   NOTE(review): the "b" constraint sits on the second operand; for
   `dcbt RA,RB` it is presumably RA that must not be r0 -- confirm against
   the Power ISA. */
#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
/* Re-point the vector views `ca` / `rb` at the current A0 / B0 cursors. */
#define LOAD_VECTORS \
ca = (vec_t *) A0; \
rb = (vec_t *) B0;
/* Combine ca[0..3] into the vector pairs colA_1 and colA_2; note the
   swapped operand order inside each pair. */
#define D_ASSEMBLE_VEC_PAIR \
__builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \
__builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]);
/* Accumulate one update into all eight accumulators: acc0..acc3 combine
   colA_1 with rb[0..3], acc4..acc7 combine colA_2 with rb[0..3]. */
#define D_ACCUMULATE \
__builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \
__builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \
__builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \
__builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \
__builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \
__builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \
__builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \
__builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]);
/* Advance both cursors by the 8 doubles (four 16-byte vectors) consumed
   per update. */
#define D_INCREMENT \
A0+=8; \
B0+=8;
/* One full update step: reload the views, build the pairs, advance the
   cursors, then accumulate using the already captured views. */
#define D_AB_PRODUCT \
LOAD_VECTORS \
D_ASSEMBLE_VEC_PAIR \
D_INCREMENT \
D_ACCUMULATE
/*
   Double-precision gemm microkernel for POWER10 built on the MMA builtins.

   Accumulates k0 updates of the panels `a` and `b` into eight 4x2
   accumulators and then writes alpha * AB (+ beta * C when beta != 0) to
   the 8x8 tile at `c`.

   k0           - number of updates. The first update is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two scalars; both are read once on entry.
   a, b         - input panels; each update consumes 8 doubles from each.
   c, rs_c0     - output tile and its row stride, in elements.
   cs_c0, data, cntx - not referenced in this function.
*/
void bli_dgemm_power10_mma_8x8
    (
        dim_t               k0,
        double*    restrict alpha,
        double*    restrict a,
        double*    restrict b,
        double*    restrict beta,
        double*    restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    double* restrict A0 = a;
    double* restrict B0 = b;
    double* restrict C0 = c;

    double alpha_ = *alpha,
           beta_  = *beta;

    // Scratch referenced by name from the dgemm_SAVE_ACC_* macros.
    dv4sf_t  result[4];
    dv4sf_t *rowC;

    /* 8 accumulator registers that will be used to store the result.

       Each accumulator register is mapped to 4 vector registers.
       Illustration:

            acc0 = [  vs0
                      vs1
                      vs2
                      vs3  ]

       These registers are used to store the result of an outer product
       instruction (general outer product instruction syntax: xv???ger??). */
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    /* 2 vector pairs are necessary for a double precision outer product
       instruction. */
    __vector_pair colA_1,
                  colA_2;

    /* Prefetch C so that it stays in cache */
    PREFETCH1 (C0, 0);
    PREFETCH1 (C0 + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c, 0);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0);
    PREFETCH1 (C0, 128);
    PREFETCH1 (C0 + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c, 128);
    PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128);

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Each accumulator represents a matrix of size
       4 x ( 16 / (datatype size in bytes) )     (vector register size = 16B)

       Thus in the case of double, the accumulate registers represent a 4x2
       matrix. However, a vector register can hold at most 2 doubles. Thus, if
       we performed an outer product using 2 vector register, we can only get a
       2x2 matrix. Therefore, we must create a vector register pair in order
       to get the desired 4x2 matrix.
    */
    D_ASSEMBLE_VEC_PAIR

    /* Compute accumulate outer products and override accumulators with result */
    __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
    __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
    __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
    __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
    __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
    __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
    __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
    __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);

    /* Move A and B pointers */
    D_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
        D_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        D_AB_PRODUCT
    }

    // handle beta cases
    // (tiles are 2 columns wide; acc4..acc7 cover rows 4..7, hence 4*rs_c)
    if (beta_ != 0.0)
    {
        dgemm_SAVE_ACC_(&acc0, rs_c, 0         );
        dgemm_SAVE_ACC_(&acc1, rs_c, 2         );
        dgemm_SAVE_ACC_(&acc2, rs_c, 4         );
        dgemm_SAVE_ACC_(&acc3, rs_c, 6         );
        dgemm_SAVE_ACC_(&acc4, rs_c,   4*rs_c  );
        dgemm_SAVE_ACC_(&acc5, rs_c, 2+4*rs_c  );
        dgemm_SAVE_ACC_(&acc6, rs_c, 4+4*rs_c  );
        dgemm_SAVE_ACC_(&acc7, rs_c, 6+4*rs_c  );
    }
    else
    {
        dgemm_SAVE_ACC_bz(&acc0, rs_c, 0         );
        dgemm_SAVE_ACC_bz(&acc1, rs_c, 2         );
        dgemm_SAVE_ACC_bz(&acc2, rs_c, 4         );
        dgemm_SAVE_ACC_bz(&acc3, rs_c, 6         );
        dgemm_SAVE_ACC_bz(&acc4, rs_c,   4*rs_c  );
        dgemm_SAVE_ACC_bz(&acc5, rs_c, 2+4*rs_c  );
        dgemm_SAVE_ACC_bz(&acc6, rs_c, 4+4*rs_c  );
        dgemm_SAVE_ACC_bz(&acc7, rs_c, 6+4*rs_c  );
    }
}
/* 16-byte vector of floats (four lanes); used to address rows of C. */
typedef float fv4sf_t __attribute__ ((vector_size (16)));
/* Disassemble the accumulator ACC into `result` (four vectors) and, for
   rows i = 0..3, store alpha_ * result[i] + beta_ * C into the vector at
   C0[i*rs_c + j]. Relies on `result`, `rowC`, `C0`, `alpha_` and `beta_`
   being declared where the macro is expanded. */
#define sgemm_SAVE_ACC_(ACC, rs_c, j) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (fv4sf_t *) &C0[j]; \
rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
rowC = (fv4sf_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
rowC = (fv4sf_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
rowC = (fv4sf_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
/* Variant of sgemm_SAVE_ACC_ that never reads C: each row is overwritten
   with alpha_ * result[i]. */
#define sgemm_SAVE_ACC_bz(ACC, rs_c, j) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (fv4sf_t *) &C0[j]; \
rowC[0] = alpha_ * result[0]; \
rowC = (fv4sf_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1]; \
rowC = (fv4sf_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2]; \
rowC = (fv4sf_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3];
/* Accumulate one update into all eight accumulators: acc0..acc3 combine
   ca[0] with rb[0..3], acc4..acc7 combine ca[1] with rb[0..3]. */
#define S_ACCUMULATE \
__builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]);
/* Advance the cursors by what one update consumes: 8 floats of A (two
   16-byte vectors) and 16 floats of B (four 16-byte vectors). */
#define S_INCREMENT \
A0+=8; \
B0+=16;
/* One full update step: reload the views, advance the cursors, then
   accumulate using the already captured views. */
#define S_AB_PRODUCT \
LOAD_VECTORS \
S_INCREMENT \
S_ACCUMULATE
/*
   Single-precision gemm microkernel for POWER10 built on the MMA builtins.

   Accumulates k0 updates of the panels `a` and `b` into eight 4x4
   accumulators and then writes alpha * AB (+ beta * C when beta != 0) to
   the 8x16 tile at `c`.

   k0           - number of updates. The first update is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two scalars; both are read once on entry.
   a, b         - input panels; each update consumes 8 floats of `a` and
                  16 floats of `b`.
   c, rs_c0     - output tile and its row stride, in elements.
   cs_c0, data, cntx - not referenced in this function.
*/
void bli_sgemm_power10_mma_8x16
    (
        dim_t               k0,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    // Scratch referenced by name from the sgemm_SAVE_ACC_* macros.
    fv4sf_t  result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    float* restrict A0 = a;
    float* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    /* Load elements into vector registers */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    /* Compute accumulate outer products and override accumulators with result */
    __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
    __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
    __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
    __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
    __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
    __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
    __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
    __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);

    S_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        S_AB_PRODUCT
    }

    // handle beta cases
    // (tiles are 4 columns wide; acc4..acc7 cover rows 4..7, hence 4*rs_c)
    if (beta_ != 0.0)
    {
        sgemm_SAVE_ACC_(&acc0, rs_c,  0         );
        sgemm_SAVE_ACC_(&acc1, rs_c,  4         );
        sgemm_SAVE_ACC_(&acc2, rs_c,  8         );
        sgemm_SAVE_ACC_(&acc3, rs_c, 12         );
        sgemm_SAVE_ACC_(&acc4, rs_c,    4*rs_c  );
        sgemm_SAVE_ACC_(&acc5, rs_c,  4+4*rs_c  );
        sgemm_SAVE_ACC_(&acc6, rs_c,  8+4*rs_c  );
        sgemm_SAVE_ACC_(&acc7, rs_c, 12+4*rs_c  );
    }
    else
    {
        sgemm_SAVE_ACC_bz(&acc0, rs_c,  0         );
        sgemm_SAVE_ACC_bz(&acc1, rs_c,  4         );
        sgemm_SAVE_ACC_bz(&acc2, rs_c,  8         );
        sgemm_SAVE_ACC_bz(&acc3, rs_c, 12         );
        sgemm_SAVE_ACC_bz(&acc4, rs_c,    4*rs_c  );
        sgemm_SAVE_ACC_bz(&acc5, rs_c,  4+4*rs_c  );
        sgemm_SAVE_ACC_bz(&acc6, rs_c,  8+4*rs_c  );
        sgemm_SAVE_ACC_bz(&acc7, rs_c, 12+4*rs_c  );
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* Accumulate one step into all eight accumulators: acc0..acc3 combine
   ca[0] with rb[0..3], acc4..acc7 combine ca[1] with rb[0..3]. Expects
   `ca`, `rb` and acc0..acc7 to be declared where the macro is expanded. */
#define I16_ACCUMULATE \
__builtin_mma_xvi16ger2pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvi16ger2pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvi16ger2pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvi16ger2pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvi16ger2pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvi16ger2pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvi16ger2pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvi16ger2pp (&acc7, ca[1], rb[3]);
/* Advance the cursors by what one step consumes: 16 elements of A and
   32 elements of B. */
#define I16_INCREMENT \
A0+=16; \
B0+=32;
/* One full step. LOAD_VECTORS is not defined in this file (presumably it
   comes from vector_int_macros.h and re-points `ca`/`rb` at A0/B0 --
   confirm). The cursors are advanced before the accumulate, which only
   uses the already captured `ca`/`rb`. */
#define I16_AB_PRODUCT \
LOAD_VECTORS \
I16_INCREMENT \
I16_ACCUMULATE
/*
   int16-input / int32-output gemm microkernel for POWER10 built on the
   MMA builtins (xvi16ger2 / xvi16ger2pp).

   Accumulates k0 steps of the panels `a` and `b` into eight accumulators
   and then hands each accumulator to SAVE_ACC (beta != 0) or SAVE_ACC_bz
   (beta == 0) to update the tile at `c`.

   k0           - number of steps. The first step is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two integer scalars; read once on entry.
   a, b         - input panels; each step consumes 16 shorts of `a` and
                  32 shorts of `b`.
   c, rs_c0     - output tile and its row stride.
   cs_c0, data, cntx - not referenced directly in this function.

   SAVE_ACC, SAVE_ACC_bz, LOAD_VECTORS, iv4sf_t and vec_t are not defined
   in this file (presumably they come from vector_int_macros.h).
*/
void bli_i16gemm_power10_mma_8x16
    (
        dim_t               k0,
        int32_t*   restrict alpha,
        short*     restrict a,
        short*     restrict b,
        int32_t*   restrict beta,
        int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // 1 is subtracted from k0 because the first step is done before the loops
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    short* restrict A0 = a;
    short* restrict B0 = b;
    int*   restrict C0 = c;

    int alpha_ = *alpha,
        beta_  = *beta;

    // Scratch for the SAVE_ACC macros (presumably referenced by name there).
    iv4sf_t  result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // first step: the non-accumulating builtin initialises the accumulators
    __builtin_mma_xvi16ger2 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi16ger2 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi16ger2 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi16ger2 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi16ger2 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi16ger2 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi16ger2 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi16ger2 (&acc7, ca[1], rb[3]);

    I16_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I16_AB_PRODUCT
        I16_AB_PRODUCT
        I16_AB_PRODUCT
        I16_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        I16_AB_PRODUCT
    }

    // handle beta cases (beta_ is an integer, so compare against integer 0)
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* Accumulate one step into all eight accumulators using the `2spp`
   builtin: acc0..acc3 combine ca[0] with rb[0..3], acc4..acc7 combine
   ca[1] with rb[0..3]. Expects `ca`, `rb` and acc0..acc7 to be declared
   where the macro is expanded.
   NOTE(review): the 's' in the builtin name presumably selects the
   saturating form of the instruction -- confirm against the Power ISA. */
#define I16S_ACCUMULATE \
__builtin_mma_xvi16ger2spp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvi16ger2spp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvi16ger2spp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvi16ger2spp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvi16ger2spp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvi16ger2spp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvi16ger2spp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvi16ger2spp (&acc7, ca[1], rb[3]);
/* Advance the cursors by what one step consumes: 16 elements of A and
   32 elements of B. */
#define I16S_INCREMENT \
A0+=16; \
B0+=32;
/* One full step. LOAD_VECTORS is not defined in this file (presumably it
   comes from vector_int_macros.h and re-points `ca`/`rb` at A0/B0 --
   confirm). The cursors are advanced before the accumulate, which only
   uses the already captured `ca`/`rb`. */
#define I16S_AB_PRODUCT \
LOAD_VECTORS \
I16S_INCREMENT \
I16S_ACCUMULATE
/*
   int16-input / int32-output gemm microkernel for POWER10 built on the
   xvi16ger2s / xvi16ger2spp MMA builtins.
   NOTE(review): the 's' builtins are presumably the saturating forms --
   confirm against the Power ISA.

   Accumulates k0 steps of the panels `a` and `b` into eight accumulators
   and then hands each accumulator to SAVE_ACC (beta != 0) or SAVE_ACC_bz
   (beta == 0) to update the tile at `c`.

   k0           - number of steps. The first step is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two integer scalars; read once on entry.
   a, b         - input panels; each step consumes 16 shorts of `a` and
                  32 shorts of `b`.
   c, rs_c0     - output tile and its row stride.
   cs_c0, data, cntx - not referenced directly in this function.

   SAVE_ACC, SAVE_ACC_bz, LOAD_VECTORS, iv4sf_t and vec_t are not defined
   in this file (presumably they come from vector_int_macros.h).
*/
void bli_i16sgemm_power10_mma_8x16
    (
        dim_t               k0,
        int32_t*   restrict alpha,
        short*     restrict a,
        short*     restrict b,
        int32_t*   restrict beta,
        int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // 1 is subtracted from k0 because the first step is done before the loops
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    short* restrict A0 = a;
    short* restrict B0 = b;
    int*   restrict C0 = c;

    int alpha_ = *alpha,
        beta_  = *beta;

    // Scratch for the SAVE_ACC macros (presumably referenced by name there).
    iv4sf_t  result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // first step: the non-accumulating builtin initialises the accumulators
    __builtin_mma_xvi16ger2s (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi16ger2s (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi16ger2s (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi16ger2s (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi16ger2s (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi16ger2s (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi16ger2s (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi16ger2s (&acc7, ca[1], rb[3]);

    I16S_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
        I16S_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        I16S_AB_PRODUCT
    }

    // handle beta cases (beta_ is an integer, so compare against integer 0)
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
}

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
/* Accumulate one step into all eight accumulators: acc0..acc3 combine
   ca[0] with rb[0..3], acc4..acc7 combine ca[1] with rb[0..3]. Expects
   `ca`, `rb` and acc0..acc7 to be declared where the macro is expanded. */
#define I4_ACCUMULATE \
__builtin_mma_xvi4ger8pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvi4ger8pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvi4ger8pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvi4ger8pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvi4ger8pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvi4ger8pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvi4ger8pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvi4ger8pp (&acc7, ca[1], rb[3]);
/* Advance the cursors by what one step consumes: 32 elements of A and
   64 elements of B (element type as declared at the expansion site). */
#define I4_INCREMENT \
A0+=32; \
B0+=64;
/* One full step. LOAD_VECTORS is not defined in this file (presumably it
   comes from vector_int_macros.h and re-points `ca`/`rb` at A0/B0 --
   confirm). The cursors are advanced before the accumulate, which only
   uses the already captured `ca`/`rb`. */
#define I4_AB_PRODUCT \
LOAD_VECTORS \
I4_INCREMENT \
I4_ACCUMULATE
/*
   int4-input / int32-output gemm microkernel for POWER10 built on the
   MMA builtins (xvi4ger8 / xvi4ger8pp).

   Accumulates k0 steps of the panels `a` and `b` into eight accumulators
   and then hands each accumulator to SAVE_ACC (beta != 0) or SAVE_ACC_bz
   (beta == 0) to update the tile at `c`.

   k0           - number of steps. The first step is peeled out of the
                  loops, so the code below assumes k0 >= 1.
                  NOTE(review): k0 == 0 is not guarded against.
   alpha, beta  - pointers to the two integer scalars; read once on entry.
   a, b         - input panels of `nibbles` elements; each step consumes
                  32 elements of `a` and 64 elements of `b`.
                  NOTE(review): `nibbles` is declared elsewhere; presumably
                  one element packs two 4-bit values -- confirm.
   c, rs_c0     - output tile and its row stride.
   cs_c0, data, cntx - not referenced directly in this function.

   SAVE_ACC, SAVE_ACC_bz, LOAD_VECTORS, iv4sf_t and vec_t are not defined
   in this file (presumably they come from vector_int_macros.h).
*/
void bli_i4gemm_power10_mma_8x16
    (
        dim_t               k0,
        int32_t*   restrict alpha,
        nibbles*   restrict a,
        nibbles*   restrict b,
        int32_t*   restrict beta,
        int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // 1 is subtracted from k0 because the first step is done before the loops
    uint64_t k_iter = (k0-1) / 4;
    uint64_t k_left = (k0-1) % 4;

    uint64_t rs_c   = rs_c0;

    nibbles* restrict A0 = a;
    nibbles* restrict B0 = b;
    int*     restrict C0 = c;

    int alpha_ = *alpha,
        beta_  = *beta;

    // Scratch for the SAVE_ACC macros (presumably referenced by name there).
    iv4sf_t  result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    // first step: the non-accumulating builtin initialises the accumulators
    __builtin_mma_xvi4ger8 (&acc0, ca[0], rb[0]);
    __builtin_mma_xvi4ger8 (&acc1, ca[0], rb[1]);
    __builtin_mma_xvi4ger8 (&acc2, ca[0], rb[2]);
    __builtin_mma_xvi4ger8 (&acc3, ca[0], rb[3]);
    __builtin_mma_xvi4ger8 (&acc4, ca[1], rb[0]);
    __builtin_mma_xvi4ger8 (&acc5, ca[1], rb[1]);
    __builtin_mma_xvi4ger8 (&acc6, ca[1], rb[2]);
    __builtin_mma_xvi4ger8 (&acc7, ca[1], rb[3]);

    I4_INCREMENT

    // k loop (unrolled by 4); the index has the same type as its bound
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I4_AB_PRODUCT
        I4_AB_PRODUCT
        I4_AB_PRODUCT
        I4_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        I4_AB_PRODUCT
    }

    // handle beta cases (beta_ is an integer, so compare against integer 0)
    if (beta_ != 0)
    {
        SAVE_ACC(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
    else
    {
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c,  0         );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c,  4         );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c,  8         );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12         );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c,    4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c,  4+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c  );
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c  );
    }
}

View File

@@ -0,0 +1,139 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
// Accumulating MMA step for the whole 8x16 tile. Each of the eight
// accumulators acc0..acc7 is updated (the "pp" built-in adds to the existing
// accumulator contents) from one vector of A (ca[0] or ca[1]) and one vector
// of B (rb[0]..rb[3]). Expects acc0..acc7, ca and rb to be in scope at the
// point of use.
#define I8_ACCUMULATE \
__builtin_mma_xvi8ger4pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvi8ger4pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvi8ger4pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvi8ger4pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvi8ger4pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvi8ger4pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvi8ger4pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvi8ger4pp (&acc7, ca[1], rb[3]);
// Advances the packed-panel cursors past the data consumed by one step:
// 32 int8_t of A (the two vectors ca[0..1]) and 64 int8_t of B (the four
// vectors rb[0..3]).
#define I8_INCREMENT \
A0+=32; \
B0+=64;
// One full k-step: capture the current A0/B0 positions in ca/rb, advance the
// cursors, then accumulate from the captured vectors. The expansion already
// ends in ';', so the macro is written without a trailing semicolon.
#define I8_AB_PRODUCT \
LOAD_VECTORS \
I8_INCREMENT \
I8_ACCUMULATE
/*
   bli_i8gemm_power10_mma_8x16

   int8 -> int32 gemm microkernel built on the POWER10 MMA built-ins. Updates
   one 8x16 tile of C:

       C := alpha * (A * B) + beta * C

   k0      number of packed k-steps. Every step consumes 32 int8_t values of
           a and 64 int8_t values of b (see I8_INCREMENT). When k0 <= 0 the
           product is treated as zero and neither a nor b is read.
   alpha   scalar applied to the accumulated product.
   a, b    packed micropanels, read as 16-byte vectors.
   beta    scalar applied to the previous contents of C. When it is zero, C is
           overwritten without being read.
   c       output tile; consecutive rows are rs_c0 elements apart.
   rs_c0   row stride of C, in elements.
   cs_c0   unused: each row is written as contiguous 4-element vectors, so a
           unit column stride is assumed.
   data, cntx   unused.
*/
void bli_i8gemm_power10_mma_8x16
    (
        dim_t               k0,
        int32_t*   restrict alpha,
        int8_t*    restrict a,
        int8_t*    restrict b,
        int32_t*   restrict beta,
        int32_t*   restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Part of the common microkernel signature but not needed here.
    (void) cs_c0;
    (void) data;
    (void) cntx;

    // Trip counts for the steps that remain after the peeled first step.
    // They stay zero when there is no work at all (k0 <= 0).
    uint64_t k_iter = 0;
    uint64_t k_left = 0;

    uint64_t rs_c = rs_c0;

    // The names below are referenced by LOAD_VECTORS, I8_* and SAVE_ACC*.
    int8_t* restrict A0 = a;
    int8_t* restrict B0 = b;
    int*    restrict C0 = c;

    int alpha_ = *alpha,
        beta_  = *beta;

    iv4sf_t result[4];
    iv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    if (k0 > 0)
    {
        // 1 is subtracted because the first step is peeled out below.
        k_iter = (uint64_t)(k0-1) / 4;
        k_left = (uint64_t)(k0-1) % 4;

        // Peeled first step: the non-accumulating form overwrites the
        // accumulators, which doubles as their initialization.
        __builtin_mma_xvi8ger4 (&acc0, ca[0], rb[0]);
        __builtin_mma_xvi8ger4 (&acc1, ca[0], rb[1]);
        __builtin_mma_xvi8ger4 (&acc2, ca[0], rb[2]);
        __builtin_mma_xvi8ger4 (&acc3, ca[0], rb[3]);
        __builtin_mma_xvi8ger4 (&acc4, ca[1], rb[0]);
        __builtin_mma_xvi8ger4 (&acc5, ca[1], rb[1]);
        __builtin_mma_xvi8ger4 (&acc6, ca[1], rb[2]);
        __builtin_mma_xvi8ger4 (&acc7, ca[1], rb[3]);
        I8_INCREMENT
    }
    else
    {
        // Nothing to multiply: start from a zero product so that the
        // write-back below yields beta * C (or zero) without touching a or b.
        __builtin_mma_xxsetaccz (&acc0);
        __builtin_mma_xxsetaccz (&acc1);
        __builtin_mma_xxsetaccz (&acc2);
        __builtin_mma_xxsetaccz (&acc3);
        __builtin_mma_xxsetaccz (&acc4);
        __builtin_mma_xxsetaccz (&acc5);
        __builtin_mma_xxsetaccz (&acc6);
        __builtin_mma_xxsetaccz (&acc7);
    }

    // k loop (unrolled by 4)
    for (uint64_t k = 0; k < k_iter; k++)
    {
        I8_AB_PRODUCT
        I8_AB_PRODUCT
        I8_AB_PRODUCT
        I8_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        I8_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0)
    {
        // Rows 0-3 of the tile (acc0..acc3), then rows 4-7 (acc4..acc7);
        // each accumulator covers four consecutive columns.
        SAVE_ACC(iv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC(iv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC(iv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC(iv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        // beta == 0: store alpha * product without reading C.
        SAVE_ACC_bz(iv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC_bz(iv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC_bz(iv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC_bz(iv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC_bz(iv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
// Accumulating MMA step for the whole 8x16 tile using the bfloat16 "pp"
// built-in, which adds to the existing accumulator contents. acc0..acc3 pair
// ca[0] with rb[0]..rb[3]; acc4..acc7 pair ca[1] with rb[0]..rb[3]. Expects
// acc0..acc7, ca and rb to be in scope at the point of use.
#define B_ACCUMULATE \
__builtin_mma_xvbf16ger2pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvbf16ger2pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvbf16ger2pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvbf16ger2pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvbf16ger2pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvbf16ger2pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvbf16ger2pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvbf16ger2pp (&acc7, ca[1], rb[3]);
// Advances the packed-panel cursors past the data consumed by one step:
// 16 elements of A and 32 elements of B (A0/B0 are bfloat16 pointers in the
// kernel that uses these macros).
#define B_INCREMENT \
A0+=16; \
B0+=32;
// One full k-step: capture the current A0/B0 positions in ca/rb, advance the
// cursors, then accumulate from the captured vectors. The expansion already
// ends in ';', so the macro is written without a trailing semicolon.
#define B_AB_PRODUCT \
LOAD_VECTORS \
B_INCREMENT \
B_ACCUMULATE
/*
   bli_sbgemm_power10_mma_8x16

   bfloat16 -> float gemm microkernel built on the POWER10 MMA built-ins.
   Updates one 8x16 tile of C:

       C := alpha * (A * B) + beta * C

   k0      number of packed k-steps. Every step consumes 16 bfloat16 values
           of a and 32 bfloat16 values of b (see B_INCREMENT). When k0 <= 0
           the product is treated as zero and neither a nor b is read.
   alpha   scalar applied to the accumulated product.
   a, b    packed micropanels, read as 16-byte vectors.
   beta    scalar applied to the previous contents of C. When it is zero, C is
           overwritten without being read.
   c       output tile; consecutive rows are rs_c0 elements apart.
   rs_c0   row stride of C, in elements.
   cs_c0   unused: each row is written as contiguous 4-element vectors, so a
           unit column stride is assumed.
   data, cntx   unused.
*/
void bli_sbgemm_power10_mma_8x16
    (
        dim_t               k0,
        float*     restrict alpha,
        bfloat16*  restrict a,
        bfloat16*  restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Part of the common microkernel signature but not needed here.
    (void) cs_c0;
    (void) data;
    (void) cntx;

    // Trip counts for the steps that remain after the peeled first step.
    // They stay zero when there is no work at all (k0 <= 0).
    uint64_t k_iter = 0;
    uint64_t k_left = 0;

    uint64_t rs_c = rs_c0;

    // The names below are referenced by LOAD_VECTORS, B_* and SAVE_ACC*.
    bfloat16* restrict A0 = a;
    bfloat16* restrict B0 = b;
    float*    restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    if (k0 > 0)
    {
        // 1 is subtracted because the first step is peeled out below.
        k_iter = (uint64_t)(k0-1) / 4;
        k_left = (uint64_t)(k0-1) % 4;

        // Peeled first step: the non-accumulating form overwrites the
        // accumulators, which doubles as their initialization.
        __builtin_mma_xvbf16ger2 (&acc0, ca[0], rb[0]);
        __builtin_mma_xvbf16ger2 (&acc1, ca[0], rb[1]);
        __builtin_mma_xvbf16ger2 (&acc2, ca[0], rb[2]);
        __builtin_mma_xvbf16ger2 (&acc3, ca[0], rb[3]);
        __builtin_mma_xvbf16ger2 (&acc4, ca[1], rb[0]);
        __builtin_mma_xvbf16ger2 (&acc5, ca[1], rb[1]);
        __builtin_mma_xvbf16ger2 (&acc6, ca[1], rb[2]);
        __builtin_mma_xvbf16ger2 (&acc7, ca[1], rb[3]);
        B_INCREMENT
    }
    else
    {
        // Nothing to multiply: start from a zero product so that the
        // write-back below yields beta * C (or zero) without touching a or b.
        __builtin_mma_xxsetaccz (&acc0);
        __builtin_mma_xxsetaccz (&acc1);
        __builtin_mma_xxsetaccz (&acc2);
        __builtin_mma_xxsetaccz (&acc3);
        __builtin_mma_xxsetaccz (&acc4);
        __builtin_mma_xxsetaccz (&acc5);
        __builtin_mma_xxsetaccz (&acc6);
        __builtin_mma_xxsetaccz (&acc7);
    }

    // k loop (unrolled by 4)
    for (uint64_t k = 0; k < k_iter; k++)
    {
        B_AB_PRODUCT
        B_AB_PRODUCT
        B_AB_PRODUCT
        B_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        B_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0f)
    {
        // Rows 0-3 of the tile (acc0..acc3), then rows 4-7 (acc4..acc7);
        // each accumulator covers four consecutive columns.
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        // beta == 0: store alpha * product without reading C.
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,144 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
// Accumulating MMA step for the whole 8x16 tile using the single-precision
// "pp" built-in, which adds to the existing accumulator contents. acc0..acc3
// pair ca[0] with rb[0]..rb[3]; acc4..acc7 pair ca[1] with rb[0]..rb[3].
// Expects acc0..acc7, ca and rb to be in scope at the point of use.
#define S_ACCUMULATE \
__builtin_mma_xvf32gerpp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvf32gerpp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvf32gerpp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvf32gerpp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvf32gerpp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvf32gerpp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvf32gerpp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvf32gerpp (&acc7, ca[1], rb[3]);
// Advances the packed-panel cursors past the data consumed by one step:
// 8 elements of A and 16 elements of B (A0/B0 are float pointers in the
// kernel that uses these macros).
#define S_INCREMENT \
A0+=8; \
B0+=16;
// One full k-step: capture the current A0/B0 positions in ca/rb, advance the
// cursors, then accumulate from the captured vectors. The expansion already
// ends in ';', so the macro is written without a trailing semicolon.
#define S_AB_PRODUCT \
LOAD_VECTORS \
S_INCREMENT \
S_ACCUMULATE
/*
   bli_sgemm_power10_mma_8x16

   Single-precision gemm microkernel built on the POWER10 MMA built-ins.
   Updates one 8x16 tile of C:

       C := alpha * (A * B) + beta * C

   k0      number of k-steps. Every step consumes 8 floats of a and 16
           floats of b (see S_INCREMENT). When k0 <= 0 the product is treated
           as zero and neither a nor b is read.
   alpha   scalar applied to the accumulated product.
   a, b    packed micropanels, read as 16-byte vectors.
   beta    scalar applied to the previous contents of C. When it is zero, C is
           overwritten without being read.
   c       output tile; consecutive rows are rs_c0 elements apart.
   rs_c0   row stride of C, in elements.
   cs_c0   unused: each row is written as contiguous 4-element vectors, so a
           unit column stride is assumed.
   data, cntx   unused.
*/
void bli_sgemm_power10_mma_8x16
    (
        dim_t               k0,
        float*     restrict alpha,
        float*     restrict a,
        float*     restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Part of the common microkernel signature but not needed here.
    (void) cs_c0;
    (void) data;
    (void) cntx;

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions. The trip counts
    // stay zero when there is no work at all (k0 <= 0).
    uint64_t k_iter = 0;
    uint64_t k_left = 0;
    uint64_t rs_c   = rs_c0;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    // The names below are referenced by LOAD_VECTORS, S_* and SAVE_ACC*.
    float* restrict A0 = a;
    float* restrict B0 = b;
    float* restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    /* View the packed panels as 16-byte vectors */
    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    if (k0 > 0)
    {
        // 1 is subtracted from k0 because 1 iteration of the k loop is
        // pulled out below.
        k_iter = (uint64_t)(k0-1) / 4;
        k_left = (uint64_t)(k0-1) % 4;

        /* Compute the first outer products and override the accumulators
           with the result, which doubles as their initialization */
        __builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
        __builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
        __builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
        __builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
        __builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
        __builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
        __builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
        __builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);
        S_INCREMENT
    }
    else
    {
        // Nothing to multiply: start from a zero product so that the
        // write-back below yields beta * C (or zero) without touching a or b.
        __builtin_mma_xxsetaccz (&acc0);
        __builtin_mma_xxsetaccz (&acc1);
        __builtin_mma_xxsetaccz (&acc2);
        __builtin_mma_xxsetaccz (&acc3);
        __builtin_mma_xxsetaccz (&acc4);
        __builtin_mma_xxsetaccz (&acc5);
        __builtin_mma_xxsetaccz (&acc6);
        __builtin_mma_xxsetaccz (&acc7);
    }

    // k loop (unrolled by 4)
    for (uint64_t k = 0; k < k_iter; k++)
    {
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
        S_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        S_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0f)
    {
        // Rows 0-3 of the tile (acc0..acc3), then rows 4-7 (acc4..acc7);
        // each accumulator covers four consecutive columns.
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        // beta == 0: store alpha * product without reading C.
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,141 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "vector_int_macros.h"
// Accumulating MMA step for the whole 8x16 tile using the half-precision
// "pp" built-in, which adds to the existing accumulator contents. acc0..acc3
// pair ca[0] with rb[0]..rb[3]; acc4..acc7 pair ca[1] with rb[0]..rb[3].
// Expects acc0..acc7, ca and rb to be in scope at the point of use.
#define H_ACCUMULATE \
__builtin_mma_xvf16ger2pp (&acc0, ca[0], rb[0]); \
__builtin_mma_xvf16ger2pp (&acc1, ca[0], rb[1]); \
__builtin_mma_xvf16ger2pp (&acc2, ca[0], rb[2]); \
__builtin_mma_xvf16ger2pp (&acc3, ca[0], rb[3]); \
__builtin_mma_xvf16ger2pp (&acc4, ca[1], rb[0]); \
__builtin_mma_xvf16ger2pp (&acc5, ca[1], rb[1]); \
__builtin_mma_xvf16ger2pp (&acc6, ca[1], rb[2]); \
__builtin_mma_xvf16ger2pp (&acc7, ca[1], rb[3]);
// Advances the packed-panel cursors past the data consumed by one step:
// 16 elements of A and 32 elements of B (A0/B0 are float16 pointers in the
// kernel that uses these macros).
#define H_INCREMENT \
A0+=16; \
B0+=32;
// One full k-step: capture the current A0/B0 positions in ca/rb, advance the
// cursors, then accumulate from the captured vectors. The expansion already
// ends in ';', so the macro is written without a trailing semicolon.
#define H_AB_PRODUCT \
LOAD_VECTORS \
H_INCREMENT \
H_ACCUMULATE
/*
   bli_shgemm_power10_mma_8x16

   IEEE float16 -> float gemm microkernel built on the POWER10 MMA built-ins.
   Updates one 8x16 tile of C:

       C := alpha * (A * B) + beta * C

   k0      number of packed k-steps. Every step consumes 16 float16 values of
           a and 32 float16 values of b (see H_INCREMENT). When k0 <= 0 the
           product is treated as zero and neither a nor b is read.
   alpha   scalar applied to the accumulated product.
   a, b    packed micropanels, read as 16-byte vectors.
   beta    scalar applied to the previous contents of C. When it is zero, C is
           overwritten without being read.
   c       output tile; consecutive rows are rs_c0 elements apart.
   rs_c0   row stride of C, in elements.
   cs_c0   unused: each row is written as contiguous 4-element vectors, so a
           unit column stride is assumed.
   data, cntx   unused.
*/
void bli_shgemm_power10_mma_8x16
    (
        dim_t               k0,
        float*     restrict alpha,
        float16*   restrict a,
        float16*   restrict b,
        float*     restrict beta,
        float*     restrict c, inc_t rs_c0, inc_t cs_c0,
        auxinfo_t* restrict data,
        cntx_t*    restrict cntx
    )
{
    // Part of the common microkernel signature but not needed here.
    (void) cs_c0;
    (void) data;
    (void) cntx;

    // Trip counts for the steps that remain after the peeled first step.
    // They stay zero when there is no work at all (k0 <= 0).
    uint64_t k_iter = 0;
    uint64_t k_left = 0;

    uint64_t rs_c = rs_c0;

    // The names below are referenced by LOAD_VECTORS, H_* and SAVE_ACC*.
    float16* restrict A0 = a;
    float16* restrict B0 = b;
    float*   restrict C0 = c;

    float alpha_ = *alpha,
          beta_  = *beta;

    fv4sf_t result[4];
    fv4sf_t *rowC;

    // accumulators that will hold the matrix product
    __vector_quad acc0, acc1, acc2, acc3,
                  acc4, acc5, acc6, acc7;

    vec_t *ca = (vec_t *) A0;
    vec_t *rb = (vec_t *) B0;

    if (k0 > 0)
    {
        // 1 is subtracted because the first step is peeled out below.
        k_iter = (uint64_t)(k0-1) / 4;
        k_left = (uint64_t)(k0-1) % 4;

        // Peeled first step: the non-accumulating form overwrites the
        // accumulators, which doubles as their initialization.
        __builtin_mma_xvf16ger2 (&acc0, ca[0], rb[0]);
        __builtin_mma_xvf16ger2 (&acc1, ca[0], rb[1]);
        __builtin_mma_xvf16ger2 (&acc2, ca[0], rb[2]);
        __builtin_mma_xvf16ger2 (&acc3, ca[0], rb[3]);
        __builtin_mma_xvf16ger2 (&acc4, ca[1], rb[0]);
        __builtin_mma_xvf16ger2 (&acc5, ca[1], rb[1]);
        __builtin_mma_xvf16ger2 (&acc6, ca[1], rb[2]);
        __builtin_mma_xvf16ger2 (&acc7, ca[1], rb[3]);
        H_INCREMENT
    }
    else
    {
        // Nothing to multiply: start from a zero product so that the
        // write-back below yields beta * C (or zero) without touching a or b.
        __builtin_mma_xxsetaccz (&acc0);
        __builtin_mma_xxsetaccz (&acc1);
        __builtin_mma_xxsetaccz (&acc2);
        __builtin_mma_xxsetaccz (&acc3);
        __builtin_mma_xxsetaccz (&acc4);
        __builtin_mma_xxsetaccz (&acc5);
        __builtin_mma_xxsetaccz (&acc6);
        __builtin_mma_xxsetaccz (&acc7);
    }

    // k loop (unrolled by 4)
    for (uint64_t k = 0; k < k_iter; k++)
    {
        H_AB_PRODUCT
        H_AB_PRODUCT
        H_AB_PRODUCT
        H_AB_PRODUCT
    }

    // edge loop
    for (uint64_t k = 0; k < k_left; k++)
    {
        H_AB_PRODUCT
    }

    // handle beta cases
    if (beta_ != 0.0f)
    {
        // Rows 0-3 of the tile (acc0..acc3), then rows 4-7 (acc4..acc7);
        // each accumulator covers four consecutive columns.
        SAVE_ACC(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
    else
    {
        // beta == 0: store alpha * product without reading C.
        SAVE_ACC_bz(fv4sf_t, &acc0, rs_c, 0       );
        SAVE_ACC_bz(fv4sf_t, &acc1, rs_c, 4       );
        SAVE_ACC_bz(fv4sf_t, &acc2, rs_c, 8       );
        SAVE_ACC_bz(fv4sf_t, &acc3, rs_c, 12      );
        SAVE_ACC_bz(fv4sf_t, &acc4, rs_c, 4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc5, rs_c, 4+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc6, rs_c, 8+4*rs_c);
        SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
    }
}

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Common include/defines across microkernels
#include "blis.h"
// Emits a "dcbt" instruction with x and y as its two register operands
// (a cache-touch hint for the address they form). The expansion already ends
// in ';'.
#define PREFETCH1(x, y) __asm__ volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
// Re-points the vector views ca/rb at the current positions of the A0/B0
// cursors. Expects ca, rb, A0 and B0 to be declared by the enclosing kernel.
#define LOAD_VECTORS \
ca = (vec_t *) A0; \
rb = (vec_t *) B0;
// Vector types used by the microkernels: fv4sf_t / iv4sf_t are the element
// types handed to SAVE_ACC for float and int32 output; vec_t is the raw byte
// vector used to view the packed A/B panels.
// NOTE(review): dv4sf_t is a vector of double despite the "4sf" in its name;
// it is not referenced in the kernels shown alongside this header.
typedef __vector float fv4sf_t;
typedef __vector double dv4sf_t;
typedef __vector int32_t iv4sf_t;
typedef __vector unsigned char vec_t;
// Writes one accumulator back to C with scaling:
//   row r of the accumulator (r = 0..3) -> C0[r*rs_c + j .. r*rs_c + j + 3]
//   new value = alpha_ * product + beta_ * old value
// v_t  is the vector type used to access C (fv4sf_t or iv4sf_t).
// ACC  is a pointer to the accumulator to unpack.
// rs_c is the row stride of C in elements; j is the element offset of the
//      accumulator's top-left entry within the tile.
// Relies on result[4], rowC, C0, alpha_ and beta_ being declared by the
// enclosing kernel. The expansion is a plain statement sequence ending in
// ';', so it must not be used as the unbraced body of an if/else/loop.
#define SAVE_ACC(v_t, ACC, rs_c, j) \
__builtin_mma_disassemble_acc ( (void *) result, ACC); \
rowC = (v_t *) &C0[j]; \
rowC[0] = alpha_ * result[0] + beta_ * rowC[0]; \
rowC = (v_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1] + beta_ * rowC[0]; \
rowC = (v_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2] + beta_ * rowC[0] ; \
rowC = (v_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3] + beta_ * rowC[0] ;
// "Beta zero" variant of SAVE_ACC: stores alpha_ * product and never reads
// the previous contents of C. Same arguments and same required locals
// (except beta_) as SAVE_ACC.
#define SAVE_ACC_bz(v_t, ACC, rs_c, j) \
__builtin_mma_disassemble_acc ( (void *) result, ACC); \
rowC = (v_t *) &C0[j]; \
rowC[0] = alpha_ * result[0]; \
rowC = (v_t *) &C0[rs_c+j]; \
rowC[0] = alpha_ * result[1]; \
rowC = (v_t *) &C0[2*rs_c+j]; \
rowC[0] = alpha_ * result[2]; \
rowC = (v_t *) &C0[3*rs_c+j]; \
rowC[0] = alpha_ * result[3];

View File

@@ -34,4 +34,5 @@
// gemm
// Prototypes for the native POWER10 gemm microkernels (presumably one per
// invocation: element type, type character, kernel name suffix — confirm
// against the GEMM_UKR_PROT definition).
// NOTE(review): the sgemm prototype is listed twice below; a repeated
// declaration is harmless, but confirm whether the duplicate is intended.
GEMM_UKR_PROT( double, d, gemm_power10_mma_8x8 )
GEMM_UKR_PROT( float, s, gemm_power10_mma_8x16 )
GEMM_UKR_PROT( float, s, gemm_power10_mma_8x16 )

View File

@@ -0,0 +1,71 @@
### Low Precision POWER10 Kernels
This is a special BLIS Sandbox that allows users to call low precision POWER10 `gemm` kernels.
#### Introduction
This document describes how the low precision POWER10 `gemm` kernels are implemented. The document will also demonstrate how to call the `gemm` kernels.
**Important: This sandbox does not have the full functionality of BLIS. It can only perform single-threaded, no-transpose GEMM. At this time, fully functioning POWER10 hardware has not been released. Once hardware has been released, the kernels will be further optimized in areas such as prefetching and cache blocksizes.**
#### Implementation
The kernels are implemented in `generic_gemm.c`. They are instantiated with macro templates. The main template is called `GENERIC_GEMM`. This template is used to create the 5-loop `gemm` function.
The API points are created in `gemm_api.c`. In this file, the API points are wrappers for the functions that are created by the templates in `generic_gemm.c`.
#### Kernels
The following low precision datatypes have POWER10 `gemm` kernels: `IEEE float16, bfloat16, int16, int8, int4`.
#### Low Precision Types
| BLIS type | BLIS char | Type definition | Used to represent... |
|:-----------|:----------|:---------------------------------------|:-------------------------------------|
| `float16` | `h` | `typedef union { uint16_t v; struct { uint16_t m:10; uint16_t e:5; uint16_t s:1; } bits; }` | IEEE half-precision real numbers |
| `bfloat16` | `b` | `typedef union { uint16_t v; struct { uint16_t m:7; uint16_t e:8; uint16_t s:1; } bits; }` | Google's half-precision real numbers |
| `int16` | `i16` | `int16_t` | 16 bit integers |
| `int8` | `i8` | `int8_t` | 8 bit integers |
| `int4` | `i4` | `typedef union{ uint8_t v; struct { uint8_t nib1:4; uint8_t nib2:4; } bits; }` | 4 bit integers |
#### Low Precision API
The API that is used for the low precision POWER10 `gemm` kernels is similar to the existing [BLIS basic typed API](https://github.com/flame/blis/blob/master/docs/BLISTypedAPI.md). The main difference between the two is that in the existing BLIS typed API, there is only one type for the input and output matrices. However, in the low precision API, there is an input type and an output type.
Thus the new `gemm` call looks like the following:
```
void bli_??gemm
(
trans_t transa,
trans_t transb,
dim_t m,
dim_t n,
dim_t k,
ctype_out* alpha,
ctype_in* a, inc_t rsa, inc_t csa,
ctype_in* b, inc_t rsb, inc_t csb,
ctype_out* beta,
ctype_out* c, inc_t rsc, inc_t csc
);
```
The first `?` is for the output type. The second `?` is for the input type.
At this time for IEEE float16 and bfloat16, the only output type is single precision float. For int16, int8, and int4, the only output type is 32 bit int.
#### How To Build The Sandbox
Add the following flags when running the configure script to build BLIS correctly.
`CFLAGS="-fPIC -std=c99 -D_ISOC11_SOURCE -D_POSIX_C_SOURCE=200112L" -s power10`
Ensure that you have GCC 10.2 or greater.
#### References
* [bfloat16 wiki](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format)
* [IEEE float16 wiki](https://en.wikipedia.org/wiki/Half-precision_floating-point_format)

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This file is needed for the BLIS build system.
#include "blis.h"
#undef GENFRONT
#define GENFRONT( opname, cname, imeth ) \
\
void PASTEMAC(opname,imeth) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	/* Function-local runtime; the front end is always handed this copy and
	   never the caller's object. */ \
	rntm_t rntm_copy; \
\
	bli_init_once(); \
\
	/* Fall back to the context registered in the gks when the caller did
	   not supply one. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_cntx(); \
	} \
\
	/* Seed the local runtime either from the caller's runtime or, when none
	   was given, from the global settings. */ \
	if ( rntm != NULL ) rntm_copy = *rntm; \
	else                bli_rntm_init_from_global( &rntm_copy ); \
	rntm = &rntm_copy; \
\
	/* Hand everything over to the operation's front end. */ \
	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
}

GENFRONT( gemm, gemm, nat )

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of copyright holder(s) nor the names
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SANDBOX_H
#define BLIS_SANDBOX_H
#include "blis.h"
#include "gemm_api.h"
// NOTE: This header is the only header required to be present in the sandbox
// implementation directory.
// This header is used to create the typedefs needed for low precision
// int4 type
// One byte holding two 4-bit values. `v` accesses the whole byte, while
// `bits.nib1` / `bits.nib2` access the two 4-bit fields individually.
// NOTE(review): which field maps to the low-order half of `v` depends on the
// compiler's bit-field allocation order — confirm for the target ABI.
typedef union
{
uint8_t v;
struct
{
uint8_t nib1:4;
uint8_t nib2:4;
} bits;
} nibbles;
// bfloat16
// 16-bit storage type. `v` accesses the raw 16 bits; `bits` splits them into
// fields of 7, 8 and 1 bits (m, e, s), whose widths match the mantissa,
// exponent and sign of the bfloat16 format.
// NOTE(review): the mapping of these fields onto the bits of `v` depends on
// the compiler's bit-field allocation order — confirm for the target ABI.
typedef union
{
uint16_t v;
struct
{
uint16_t m:7;
uint16_t e:8;
uint16_t s:1;
} bits;
} bfloat16;
// ieee float16
// 16-bit storage type. `v` accesses the raw 16 bits; `bits` splits them into
// fields of 10, 5 and 1 bits (m, e, s), whose widths match the mantissa,
// exponent and sign of the IEEE half-precision format.
// NOTE(review): the mapping of these fields onto the bits of `v` depends on
// the compiler's bit-field allocation order — confirm for the target ABI.
typedef union
{
uint16_t v;
struct
{
uint16_t m:10;
uint16_t e:5;
uint16_t s:1;
} bits;
} float16;
// Size constant used by the sandbox (presumably the memory page size in
// bytes, judging by the name — confirm at the point of use).
#define P10_PG_SIZE 4096
// Prototypes for the low precision microkernels. Judging by the kernel
// definitions, the arguments are: input element type, output element type,
// type prefix, kernel name suffix (confirm against GEMM_UKR_PROT2).
GEMM_UKR_PROT2( bfloat16, float, sb, gemm_power10_mma_8x16 )
GEMM_UKR_PROT2( float16, float, sh, gemm_power10_mma_8x16 )
GEMM_UKR_PROT2( int16_t, int32_t, i16, gemm_power10_mma_8x16 )
GEMM_UKR_PROT2( int8_t, int32_t, i8, gemm_power10_mma_8x16 )
GEMM_UKR_PROT2( nibbles, int32_t, i4, gemm_power10_mma_8x16 )
/* Generates the body of a matrix-fill routine whose prototype is supplied by
   RM_PROT(ch, ctype). Every element of the m x n matrix addressed through ap
   (row stride rs_a, column stride cs_a) receives a fresh rand_func() value
   cast to ctype. The names m, n, ap, rs_a and cs_a are expected to come from
   that prototype. */
#define RandomMatrixMacro(ch, ctype, rand_func) \
RM_PROT(ch, ctype) \
{ \
    for ( int i=0; i<m; i++ ) \
    { \
        for ( int j=0; j<n; j++ ) \
        { \
            *( ( ap + i*rs_a ) + j*cs_a ) = (ctype) rand_func(); \
        } \
    } \
}
/* Creates a function that initializes a matrix of type ctype with random
   values restricted to the inclusive range [lower, upper].

   The prototype is supplied by RM_B_PROT(ch, ctype); the names m, n, ap,
   rs_a, cs_a, lower and upper are expected to come from it.

   The reduction into the range is done on the value returned by rand_func()
   and only the final result is converted to ctype. Casting first (as a bare
   "(ctype) rand_func() % ..." does, since a cast binds tighter than %) can
   turn the random value negative for narrow signed types and push the result
   below lower.
   NOTE(review): assumes rand_func() returns a non-negative integer (as rand()
   does) — confirm for the generators used with this macro. */
#define RandomMatrixBounded(ch, ctype, rand_func) \
RM_B_PROT(ch, ctype) \
{ \
    for ( int i=0; i<m; i++ ) \
        for ( int j=0; j<n; j++ ) \
            *(ap + j*cs_a + i*rs_a) = \
                (ctype) ( rand_func() % (upper - lower + 1) + lower ); \
}
// Declarations of the user-facing low precision gemm entry points. Each line
// declares void bli_<ch>gemm(...) taking inputs of the first type and
// alpha/beta/C of the second type (see GEMM_FUNC_PROT in gemm_api.h).
GEMM_FUNC_PROT( float16, float, sh);
GEMM_FUNC_PROT( bfloat16, float, sb);
GEMM_FUNC_PROT( int16_t, int32_t, i16);
GEMM_FUNC_PROT( int8_t, int32_t, i8);
GEMM_FUNC_PROT( nibbles, int32_t, i4);
#endif

View File

@@ -0,0 +1,77 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This file contains the API points for the low precision POWER10 GEMM kernels
#include "generic_gemm.h"
#include "gemm_api.h"
/*
   GEMM_FUNC defines the public entry point bli_<ch>gemm for one low
   precision type combination.

   ch           type prefix pasted into the function name (sb, sh, i16, ...).
   DTYPE_IN     element type of the A and B matrices.
   DTYPE_OUT    element type of alpha, beta and C.
   A_ALIGNMENT,
   B_ALIGNMENT  forwarded unchanged to the generated gemm routine after the
                strides of A and B respectively.
   MR, NR       register blocksizes forwarded to the generated gemm routine.
   MC, KC, NC   cache blocksizes forwarded to the generated gemm routine.

   The generated function only supports the no-transpose case. When either
   transa or transb requests anything else, it reports that on stdout and
   returns without touching C, instead of silently computing the
   untransposed product.
*/
#define GEMM_FUNC(ch, DTYPE_IN, DTYPE_OUT, A_ALIGNMENT, B_ALIGNMENT, MR, NR, MC, KC, NC) \
\
void GEMM_FUNC_NAME(ch) \
    ( \
        trans_t    transa, \
        trans_t    transb, \
        dim_t      m, \
        dim_t      n, \
        dim_t      k, \
        DTYPE_OUT* alpha, \
        DTYPE_IN*  a, inc_t rsa, inc_t csa, \
        DTYPE_IN*  b, inc_t rsb, inc_t csb, \
        DTYPE_OUT* beta, \
        DTYPE_OUT* c, inc_t rsc, inc_t csc \
    ) \
{ \
    if (transa != BLIS_NO_TRANSPOSE || transb != BLIS_NO_TRANSPOSE) { \
        printf("Transpose functionality not implemented yet.\n"); \
        return; \
    } \
\
    GEMM_PASTEMAC(ch) \
    ( \
        MR, NR, MC, KC, NC, \
        m, n, k, \
        a, rsa, csa, A_ALIGNMENT, \
        b, rsb, csb, B_ALIGNMENT, \
        c, rsc, csc, \
        alpha, beta \
    ); \
}

//         ch   dt_in     dt_out   A_al B_al MR  NR   MC    KC    NC
GEMM_FUNC(  sb, bfloat16, float,   0,   0,   8,  16,  1664, 1026, 4096)
GEMM_FUNC(  sh, float16,  float,   0,   0,   8,  16,  1664, 1026, 4096)
GEMM_FUNC( i16, int16_t,  int32_t, 0,   0,   8,  16,  1664, 1026, 4096)
GEMM_FUNC(  i8, int8_t,   int32_t, 0,   0,   8,  16,  1664, 1026, 4096)
GEMM_FUNC(  i4, nibbles,  int32_t, 0,   0,   8,  16,  1664, 1026, 4096)

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes and template for the low precision POWER10 GEMM API
// Guard against repeated inclusion: this header is pulled in by
// blis_sandbox.h and is also included directly by gemm_api.c.
#ifndef BLIS_SANDBOX_POWER10_GEMM_API_H
#define BLIS_SANDBOX_POWER10_GEMM_API_H

// GEMM_FUNC_NAME(ch) expands to the public function name bli_<ch>gemm. The
// extra level of indirection lets `ch` be macro-expanded before it is pasted.
#define GEMM_FUNC_NAME_(ch) bli_ ## ch ## gemm
#define GEMM_FUNC_NAME(ch) GEMM_FUNC_NAME_(ch)

// GEMM_FUNC_PROT expands to the prototype (without the trailing ';') of the
// low precision gemm entry point bli_<ch>gemm.
//   DTYPE_IN   element type of the A and B matrices
//   DTYPE_OUT  element type of alpha, beta and the C matrix
//   ch         type prefix pasted into the function name
// Each matrix is described by a pointer plus a row stride and a column
// stride (rs*, cs*).
#define GEMM_FUNC_PROT(DTYPE_IN, DTYPE_OUT, ch) \
    void GEMM_FUNC_NAME(ch) \
        ( \
            trans_t    transa, \
            trans_t    transb, \
            dim_t      m, \
            dim_t      n, \
            dim_t      k, \
            DTYPE_OUT* alpha, \
            DTYPE_IN*  a, inc_t rsa, inc_t csa, \
            DTYPE_IN*  b, inc_t rsb, inc_t csb, \
            DTYPE_OUT* beta, \
            DTYPE_OUT* c, inc_t rsc, inc_t csc \
        )

#endif // BLIS_SANDBOX_POWER10_GEMM_API_H

889
sandbox/power10/gemm_pack.c Normal file
View File

@@ -0,0 +1,889 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Templates for different packing routine
#include "gemm_pack.h"
/*
Details on bit16_dt vector data structure
Vector X = [ X[0,0] X[0,1] X[1,0] X[1,1] X[2,0] X[2,1] X[3,0] X[3,1] ]
Vector Y = [ Y[0,0] Y[0,1] Y[1,0] Y[1,1] Y[2,0] Y[2,1] Y[3,0] Y[3,1] ]
These bit16_dt vectors represent a 4x2 matrix. Hence, in matrix form it
looks like the following:
X = [ X[0,0] X[0,1]
X[1,0] X[1,1]
X[2,0] X[2,1]
X[3,0] X[3,1] ]
The outer product instruction: xvbf16ger2 (bfloat16 outer product)
Syntax:
xvbf16ger2 ACCUMULATOR A, VECTOR X, VECTOR Y
Semantics:
A = X * Y^T
The generic packing routine would load 8 elements from the same column.
This causes an issue since the instruction expects the vector to be a
4x2 matrix where the data is packed in contiguous order. Thus, we must make
a packing routine that will interleave the matrix data. Making it so
that when we load the 8 contiguous elements from A, it will represent
a 4x2 section of the matrix.
*/
/*
   Helper macros for packing A with 16-bit elements. They expand inside
   BIT16_PACK_A and use its locals: adest (write cursor), ap, i (first row of
   the current panel), p_idx (first k index of the current pair), k, rs_a and
   cs_a.

   Each macro is wrapped in do { } while (0) so that it behaves as a single
   statement even under an unbraced if/else/for, and sizes are derived from
   the pointed-to type instead of hard-coded byte counts.
*/

/* Copy row (i+ir) of A at k indices p_idx and p_idx+1 to two consecutive
   destination slots. */
#define k_even_apack_16(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + p_idx*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (p_idx+1)*cs_a ]; \
    } while (0)

/* Odd k: copy the last k index (k-1) of row (i+ir) and zero the second slot
   of the pair. */
#define k_odd_apack_16(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + (k-1)*cs_a ]; \
        memset(adest, 0, sizeof(*adest)); \
        adest++; \
    } while (0)

/* Write one all-zero pair (two elements) at dest_matrix and advance it by
   two elements. dest_matrix must be a modifiable pointer lvalue. */
#define pad_macro_16(dest_matrix) \
    do { \
        memset((dest_matrix), 0, 2 * sizeof(*(dest_matrix))); \
        (dest_matrix) += 2; \
    } while (0)
/*
   BIT16_PACK_A(ch, DTYPE_IN) defines the A packing routine <ch>_packA for a
   16-bit element type.

   The m x k source (strides rs_a / cs_a) is consumed in row panels of MR
   rows. Within a panel, k is walked two indices at a time and, for every
   row, the two values are stored next to each other in apack. When k is odd
   the last index is stored with a zero as its partner.

   A complete panel (ib == MR) always packs exactly 8 rows and needs no row
   padding. A short panel packs its ib rows and zero-fills the remaining
   MR - ib row slots of every pair.
*/
#define BIT16_PACK_A(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, A) \
    ( \
      dim_t MR, \
      int m, int k, \
      DTYPE_IN* ap, int rs_a, int cs_a, \
      DTYPE_IN* apack \
    ) \
{ \
    const int n_pairs  = k / 2; \
    const int has_tail = k % 2; \
    DTYPE_IN* adest    = apack; \
    int       p_idx; \
\
    for ( int i = 0; i < m; i += MR ) \
    { \
        const int   ib     = bli_min(MR, m-i); \
        const int   n_live = ( ib == MR ) ? 8 : ib;      /* rows copied   */ \
        const dim_t n_fill = ( ib == MR ) ? 0 : MR - ib; /* rows zeroed   */ \
\
        p_idx = 0; \
        for ( int pair = 0; pair < n_pairs; ++pair ) \
        { \
            for ( int ir = 0; ir < n_live; ++ir ) { k_even_apack_16(ir); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_16(adest); } \
            p_idx += 2; \
        } \
\
        /* Odd k: the final k index is paired with zeros. */ \
        if ( has_tail ) \
        { \
            for ( int ir = 0; ir < n_live; ++ir ) { k_odd_apack_16(ir); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_16(adest); } \
        } \
    } \
}
/*
   Helper macros for packing B with 16-bit elements. They expand inside
   BIT16_PACK_B and use its locals: bdest (write cursor), bp, j (first column
   of the current panel), p_idx (first k index of the current pair), k, rs_b
   and cs_b.

   The last line of each macro deliberately carries no line-continuation
   backslash: a trailing '\' would splice the next #define into this macro's
   replacement list. Each macro is wrapped in do { } while (0) so that it
   behaves as a single statement.
*/

/* Copy column (j+jr) of B at k indices p_idx and p_idx+1 to two consecutive
   destination slots. */
#define k_even_bpack_16(jr) \
    do { \
        *bdest++ = bp[ p_idx*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (p_idx+1)*rs_b + (j+(jr))*cs_b ]; \
    } while (0)

/* Odd k: copy the last k index (k-1) of column (j+jr) and zero the second
   slot of the pair. */
#define k_odd_bpack_16(jr) \
    do { \
        *bdest++ = bp[ (k-1)*rs_b + (j+(jr))*cs_b ]; \
        memset(bdest, 0, sizeof(*bdest)); \
        bdest++; \
    } while (0)
/*
   BIT16_PACK_B(ch, DTYPE_IN) defines the B packing routine <ch>_packB for a
   16-bit element type.

   The k x n source (strides rs_b / cs_b) is consumed in column panels of NR
   columns. Within a panel, k is walked two indices at a time and, for every
   column, the two values are stored next to each other in bpack. When k is
   odd the last index is stored with a zero as its partner.

   A complete panel (jb == NR) always packs exactly 16 columns and needs no
   padding. A short panel packs its jb columns and zero-fills the remaining
   NR - jb column slots of every pair.
*/
#define BIT16_PACK_B(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, B) \
    ( \
      dim_t NR, \
      int k, int n, \
      DTYPE_IN* bp, int rs_b, int cs_b, \
      DTYPE_IN* bpack \
    ) \
{ \
    const int n_pairs  = k / 2; \
    const int has_tail = k % 2; \
    DTYPE_IN* bdest    = bpack; \
    int       p_idx; \
\
    for ( int j = 0; j < n; j += NR ) \
    { \
        const int   jb     = bli_min(NR, n-j); \
        const int   n_live = ( jb == NR ) ? 16 : jb;     /* columns copied */ \
        const dim_t n_fill = ( jb == NR ) ? 0 : NR - jb; /* columns zeroed */ \
\
        p_idx = 0; \
        for ( int pair = 0; pair < n_pairs; ++pair ) \
        { \
            for ( int jr = 0; jr < n_live; ++jr ) { k_even_bpack_16(jr); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_16(bdest); } \
            p_idx += 2; \
        } \
\
        /* Odd k: the final k index is paired with zeros. */ \
        if ( has_tail ) \
        { \
            for ( int jr = 0; jr < n_live; ++jr ) { k_odd_bpack_16(jr); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_16(bdest); } \
        } \
    } \
};
/* 8 bit packing routines */
/*
   Helper macros for packing A with 8-bit elements. They expand inside
   BIT8_PACK_A and use its locals: adest (write cursor), ap, i (first row of
   the current panel), p_idx (first k index of the current group of four), k,
   rs_a and cs_a. Every macro emits exactly four destination elements.

   Each macro is wrapped in do { } while (0) so that it behaves as a single
   statement even under an unbraced if/else/for.
*/

/* Copy row (i+ir) of A at k indices p_idx .. p_idx+3. */
#define k_even_apack_8(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + p_idx*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (p_idx+1)*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (p_idx+2)*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (p_idx+3)*cs_a ]; \
    } while (0)

/* k%4 == 3: copy the last three k indices, then one zero. */
#define k_left3_apack_8(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + (k-3)*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (k-2)*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (k-1)*cs_a ]; \
        memset(adest, 0, sizeof(*adest)); \
        adest++; \
    } while (0)

/* k%4 == 2: copy the last two k indices, then two zeros. */
#define k_left2_apack_8(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + (k-2)*cs_a ]; \
        *adest++ = ap[ (i+(ir))*rs_a + (k-1)*cs_a ]; \
        memset(adest, 0, 2 * sizeof(*adest)); \
        adest += 2; \
    } while (0)

/* k%4 == 1: copy the last k index, then three zeros. */
#define k_left1_apack_8(ir) \
    do { \
        *adest++ = ap[ (i+(ir))*rs_a + (k-1)*cs_a ]; \
        memset(adest, 0, 3 * sizeof(*adest)); \
        adest += 3; \
    } while (0)

/* Write one all-zero group (four elements) at dest_matrix and advance it by
   four elements. dest_matrix must be a modifiable pointer lvalue. */
#define pad_macro_8(dest_matrix) \
    do { \
        memset((dest_matrix), 0, 4 * sizeof(*(dest_matrix))); \
        (dest_matrix) += 4; \
    } while (0)
/*
   BIT8_PACK_A(ch, DTYPE_IN) defines the A packing routine <ch>_packA for an
   8-bit element type.

   The m x k source (strides rs_a / cs_a) is consumed in row panels of MR
   rows. Within a panel, k is walked four indices at a time and, for every
   row, the four values are stored contiguously in apack. A remainder of
   k%4 indices is stored as one more group per row, zero-filled up to four
   elements.

   A complete panel (ib == MR) always packs exactly 8 rows and needs no row
   padding. A short panel packs its ib rows and zero-fills the remaining
   MR - ib row slots of every group.
*/
#define BIT8_PACK_A(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, A) \
    ( \
      dim_t MR, \
      int m, int k, \
      DTYPE_IN* ap, int rs_a, int cs_a, \
      DTYPE_IN* apack \
    ) \
{ \
    const int k_iter = k / 4; \
    const int k_left = k % 4; \
    DTYPE_IN* adest  = apack; \
    int       p_idx; \
\
    for ( int i = 0; i < m; i += MR ) \
    { \
        const int   ib     = bli_min(MR, m-i); \
        const int   n_live = ( ib == MR ) ? 8 : ib;      /* rows copied */ \
        const dim_t n_fill = ( ib == MR ) ? 0 : MR - ib; /* rows zeroed */ \
\
        /* Whole groups of four k indices. */ \
        p_idx = 0; \
        for ( int grp = 0; grp < k_iter; ++grp ) \
        { \
            for ( int ir = 0; ir < n_live; ++ir ) { k_even_apack_8(ir); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_8(adest); } \
            p_idx += 4; \
        } \
\
        /* Remainder group: k is not a multiple of four. */ \
        switch ( k_left ) \
        { \
            case 3: \
                for ( int ir = 0; ir < n_live; ++ir ) { k_left3_apack_8(ir); } \
                break; \
            case 2: \
                for ( int ir = 0; ir < n_live; ++ir ) { k_left2_apack_8(ir); } \
                break; \
            case 1: \
                for ( int ir = 0; ir < n_live; ++ir ) { k_left1_apack_8(ir); } \
                break; \
            default: \
                break; \
        } \
        if ( k_left != 0 ) \
        { \
            for ( dim_t z = 0; z < n_fill; ++z ) { pad_macro_8(adest); } \
        } \
    } \
}
/*
   Helper macros for packing B with 8-bit elements. They expand inside
   BIT8_PACK_B and use its locals: bdest (write cursor), bp, j (first column
   of the current panel), p_idx (first k index of the current group of four),
   k, rs_b and cs_b. Every macro emits exactly four destination elements.

   Each macro is wrapped in do { } while (0) so that it behaves as a single
   statement even under an unbraced if/else/for.
*/

/* Copy column (j+jr) of B at k indices p_idx .. p_idx+3. */
#define k_even_bpack_8(jr) \
    do { \
        *bdest++ = bp[ p_idx*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (p_idx+1)*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (p_idx+2)*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (p_idx+3)*rs_b + (j+(jr))*cs_b ]; \
    } while (0)

/* k%4 == 3: copy the last three k indices, then one zero. */
#define k_left3_bpack_8(jr) \
    do { \
        *bdest++ = bp[ (k-3)*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (k-2)*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (k-1)*rs_b + (j+(jr))*cs_b ]; \
        memset(bdest, 0, sizeof(*bdest)); \
        bdest++; \
    } while (0)

/* k%4 == 2: copy the last two k indices, then two zeros. */
#define k_left2_bpack_8(jr) \
    do { \
        *bdest++ = bp[ (k-2)*rs_b + (j+(jr))*cs_b ]; \
        *bdest++ = bp[ (k-1)*rs_b + (j+(jr))*cs_b ]; \
        memset(bdest, 0, 2 * sizeof(*bdest)); \
        bdest += 2; \
    } while (0)

/* k%4 == 1: copy the last k index, then three zeros. */
#define k_left1_bpack_8(jr) \
    do { \
        *bdest++ = bp[ (k-1)*rs_b + (j+(jr))*cs_b ]; \
        memset(bdest, 0, 3 * sizeof(*bdest)); \
        bdest += 3; \
    } while (0)
/*
   BIT8_PACK_B(ch, DTYPE_IN) defines the B packing routine <ch>_packB for an
   8-bit element type.

   The k x n source (strides rs_b / cs_b) is consumed in column panels of NR
   columns. Within a panel, k is walked four indices at a time and, for every
   column, the four values are stored contiguously in bpack. A remainder of
   k%4 indices is stored as one more group per column, zero-filled up to four
   elements.

   A complete panel (jb == NR) always packs exactly 16 columns and needs no
   padding. A short panel packs its jb columns and zero-fills the remaining
   NR - jb column slots of every group.
*/
#define BIT8_PACK_B(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, B) \
    ( \
      dim_t NR, \
      int k, int n, \
      DTYPE_IN* bp, int rs_b, int cs_b, \
      DTYPE_IN* bpack \
    ) \
{ \
    const int k_iter = k / 4; \
    const int k_left = k % 4; \
    DTYPE_IN* bdest  = bpack; \
    int       p_idx; \
\
    for ( int j = 0; j < n; j += NR ) \
    { \
        const int   jb     = bli_min(NR, n-j); \
        const int   n_live = ( jb == NR ) ? 16 : jb;     /* columns copied */ \
        const dim_t n_fill = ( jb == NR ) ? 0 : NR - jb; /* columns zeroed */ \
\
        /* Whole groups of four k indices. */ \
        p_idx = 0; \
        for ( int grp = 0; grp < k_iter; ++grp ) \
        { \
            for ( int jr = 0; jr < n_live; ++jr ) { k_even_bpack_8(jr); } \
            for ( dim_t z = 0; z < n_fill; ++z )  { pad_macro_8(bdest); } \
            p_idx += 4; \
        } \
\
        /* Remainder group: k is not a multiple of four. */ \
        switch ( k_left ) \
        { \
            case 3: \
                for ( int jr = 0; jr < n_live; ++jr ) { k_left3_bpack_8(jr); } \
                break; \
            case 2: \
                for ( int jr = 0; jr < n_live; ++jr ) { k_left2_bpack_8(jr); } \
                break; \
            case 1: \
                for ( int jr = 0; jr < n_live; ++jr ) { k_left1_bpack_8(jr); } \
                break; \
            default: \
                break; \
        } \
        if ( k_left != 0 ) \
        { \
            for ( dim_t z = 0; z < n_fill; ++z ) { pad_macro_8(bdest); } \
        } \
    } \
}
////////////////////////////////////////////////////////////////////////////////
/* Packing Routines */
////////////////////////////////////////////////////////////////////////////////
/*
Memory is byte-addressed. This results in two options when dealing with
int4. Either store 1 int4 value in a byte, or store 2 int4 values in 1
byte. The former is wasteful in storage, but it makes for a simpler
packing routine. However, we want to not waste any storage if possible.
Therefore I went with the latter when designing my int4 kernel.
The int4 outerproduct instruction expects a 4x8 matrix in row major order
to be loaded into the vector. In order to achieve this 4x8 row major
matrix, we pack as many 4x8 panels from the src matrix into the pack matrix.
To illustrate how my packing routine works:
x0 x1 x2 x3 x4 x5 x6 x7
x8 x9 x10 x11 x12 x13 x14 x15
x16 x17 x18 x19 x20 x21 x22 x23
x24 x25 x26 x27 x28 x29 x30 x31
Assume we have a 4x8 matrix that is stored in column major order. Also,
since we are dealing with int4 values, the values are stored as pairs
within a union struct, i.e. (x0, x8) are stored together in the same struct.
Therefore in order to get the desired 4x8 row major matrix, we must go
through the first row of structs and grab the first int4 value and insert
it into the appropriate spot in the pack matrix. This means that after
packing, (x0, x1) will be stored together in the same struct.
This process then repeats until the entire src matrix is packed in these
4x8 row major matrix panels.
To handle edge cases, the packing routine will fill in zeros where it is
appropriate.
*/
#include "i4_macros.h"
/*
   BIT4_PACK_A(ch, DTYPE_IN) defines the A packing routine <ch>_packA for the
   int4 type, where each DTYPE_IN element holds two nibbles.

   m and MR count int4 rows; the source is addressed through the byte-row
   index i, which advances by MR/2 per panel because two int4 rows share one
   element. Within a panel, k is walked eight indices at a time and every
   int4 row contributes four destination elements per group (see
   col_m_order_1 / col_m_order_2). A remainder of k%8 indices is handled by
   the apad_col_kleft<N> (complete panel) or edge<N> (short panel) macros.

   A complete panel (ib == MR) packs the 8 int4 rows held in byte rows
   i .. i+3. A short panel packs its ib rows and zero-fills the remaining
   MR - ib row slots of every group. zero_out_dest() only clears four
   elements and leaves the cursor where it is, so the cursor is advanced
   explicitly after each cleared slot; otherwise the padding would occupy no
   space and the next group would overwrite it.
*/
#define BIT4_PACK_A(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, A) \
    ( \
      dim_t MR, \
      int m, int k, \
      DTYPE_IN* ap, int rs_a, int cs_a, \
      DTYPE_IN* apack \
    ) \
{ \
    int p_idx, k_left, k_iter; \
    DTYPE_IN* adest = apack; \
\
    k_left = k%8; \
    k_iter = k/8; \
\
    int i = 0; /* i is used for byte addressing */ \
    for(int int4_i=0; int4_i<m; int4_i+=MR) { /* pack panels */ \
\
        int ib = bli_min(MR, m-int4_i); \
        p_idx = 0; \
\
        if (ib == MR) { /* full size */ \
            for (int p=0; p<k_iter; p++) { \
                col_m_order_1(adest, ap, (i+0), rs_a, cs_a); \
                col_m_order_2(adest, ap, (i+0), rs_a, cs_a); \
                col_m_order_1(adest, ap, (i+1), rs_a, cs_a); \
                col_m_order_2(adest, ap, (i+1), rs_a, cs_a); \
                col_m_order_1(adest, ap, (i+2), rs_a, cs_a); \
                col_m_order_2(adest, ap, (i+2), rs_a, cs_a); \
                col_m_order_1(adest, ap, (i+3), rs_a, cs_a); \
                col_m_order_2(adest, ap, (i+3), rs_a, cs_a); \
                p_idx += 8; \
            } \
\
            /* handle edge cases if there are any */ \
            if(k_left == 7) { \
                apad_col_kleft7(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 6) { \
                apad_col_kleft6(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 5) { \
                apad_col_kleft5(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 4) { \
                apad_col_kleft4(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 3) { \
                apad_col_kleft3(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 2) { \
                apad_col_kleft2(adest, ap, rs_a, cs_a); \
            } \
            else if(k_left == 1) { \
                apad_col_kleft1(adest, ap, rs_a, cs_a); \
            } \
        } \
\
        else { /* not full size */ \
            for (int p=0; p<k_iter; p++) { \
                for (int ir=0; ir<ib; ir++) { \
                    if (ir%2==0) { \
                        col_m_order_1(adest, ap, (i+ir/2), rs_a, cs_a); \
                    } \
                    else { \
                        col_m_order_2(adest, ap, (i+ir/2), rs_a, cs_a); \
                    } \
                } \
                /* zero-fill the missing rows; each row slot is 4 elements */ \
                for (int ir=ib; ir<MR; ir++) { \
                    zero_out_dest(adest); \
                    adest += 4; \
                } \
                p_idx += 8; \
            } \
\
            /* handle edge cases if there are any */ \
            if(k_left == 7) { \
                edge7(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 6) { \
                edge6(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 5) { \
                edge5(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 4) { \
                edge4(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 3) { \
                edge3(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 2) { \
                edge2(adest, ap, i, ib, rs_a, cs_a); \
            } \
            else if(k_left == 1) { \
                edge1(adest, ap, i, ib, rs_a, cs_a); \
            } \
\
            /* fill in zeros when an edge case occurs */ \
            if(k_left!=0) \
            { \
                for (int ir=ib; ir<MR; ir++) { \
                    zero_out_dest(adest); \
                    adest += 4; \
                } \
            } \
        } \
        i += (MR/2); \
    } \
}
/*
   BIT4_PACK_B(ch, DTYPE_IN) defines the B packing routine <ch>_packB for the
   int4 type, where each DTYPE_IN element holds two nibbles.

   n and NR count int4 columns; the source is addressed through the byte
   column index j, which advances by NR/2 per panel because two int4 columns
   share one element. The col_m_order_* macros are reused with the roles of
   the strides swapped (cs_b is passed as their "rs", rs_b as their "cs").
   Within a panel, k is walked eight indices at a time and every int4 column
   contributes four destination elements per group. A remainder of k%8
   indices is handled by the bpad_col_kleft<N> (complete panel) or edge<N>
   (short panel) macros.

   A complete panel (jb == NR) packs the 16 int4 columns held in byte columns
   j .. j+7. A short panel packs its jb columns and zero-fills the remaining
   NR - jb column slots of every group. zero_out_dest() only clears four
   elements and leaves the cursor where it is, so the cursor is advanced
   explicitly after each cleared slot; otherwise the padding would occupy no
   space and the next group would overwrite it.
*/
#define BIT4_PACK_B(ch, DTYPE_IN) \
\
void PACK_FUNC_NAME(ch, B) \
    ( \
      dim_t NR, \
      int k, int n, \
      DTYPE_IN* bp, int rs_b, int cs_b, \
      DTYPE_IN* bpack \
    ) \
{ \
\
    int p_idx, k_left, k_iter; \
    DTYPE_IN* bdest = bpack; \
\
    k_left = k%8; \
    k_iter = k/8; \
\
    int j = 0; /* j is used for byte addressing */ \
\
    for(int int4_j=0; int4_j<n; int4_j+=NR) { /* pack panels */ \
        int jb = bli_min(NR, n-int4_j); \
\
        p_idx = 0; \
        if (jb == NR) { /* full size */ \
            for (int p=0; p<k_iter; p++) { \
                col_m_order_1(bdest, bp, (j+0), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+0), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+1), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+1), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+2), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+2), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+3), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+3), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+4), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+4), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+5), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+5), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+6), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+6), cs_b, rs_b); \
                col_m_order_1(bdest, bp, (j+7), cs_b, rs_b); \
                col_m_order_2(bdest, bp, (j+7), cs_b, rs_b); \
                p_idx += 8; \
            } \
\
            /* handle edge cases if there are any */ \
            if(k_left == 7) { \
                bpad_col_kleft7(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 6) { \
                bpad_col_kleft6(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 5) { \
                bpad_col_kleft5(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 4) { \
                bpad_col_kleft4(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 3) { \
                bpad_col_kleft3(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 2) { \
                bpad_col_kleft2(bdest, bp, cs_b, rs_b); \
            } \
            else if(k_left == 1) { \
                bpad_col_kleft1(bdest, bp, cs_b, rs_b); \
            } \
        } \
        else { /* not full size */ \
            for (int p=0; p<k_iter; p++) { \
                for (int jr=0; jr<jb; jr++) { \
                    if (jr%2==0) { \
                        col_m_order_1(bdest, bp, (j+jr/2), cs_b, rs_b); \
                    } \
                    else { \
                        col_m_order_2(bdest, bp, (j+jr/2), cs_b, rs_b); \
                    } \
                } \
                /* zero-fill the missing columns; each slot is 4 elements */ \
                for (int jr=jb; jr<NR; jr++) { \
                    zero_out_dest(bdest); \
                    bdest += 4; \
                } \
                p_idx += 8; \
            } \
\
            /* handle edge cases if there are any */ \
            if(k_left == 7) { \
                edge7(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 6) { \
                edge6(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 5) { \
                edge5(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 4) { \
                edge4(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 3) { \
                edge3(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 2) { \
                edge2(bdest, bp, j, jb, cs_b, rs_b); \
            } \
            else if(k_left == 1) { \
                edge1(bdest, bp, j, jb, cs_b, rs_b); \
            } \
\
            /* fill in zeros when an edge case occurs */ \
            if(k_left!=0) \
            { \
                for (int jr=jb; jr<NR; jr++) { \
                    zero_out_dest(bdest); \
                    bdest += 4; \
                } \
            } \
        } \
        j += (NR/2); \
    } \
}
/*
   Convenience wrappers: each <width>_PACK_ROUTINES(ch, DTYPE_IN) expands to
   the definitions of both packing functions (A and B) for one datatype
   prefix, using the template of the matching element width.
*/
#define BIT16_PACK_ROUTINES(ch, DTYPE_IN) \
    BIT16_PACK_A(ch, DTYPE_IN); \
    BIT16_PACK_B(ch, DTYPE_IN);

#define BIT8_PACK_ROUTINES(ch, DTYPE_IN) \
    BIT8_PACK_A(ch, DTYPE_IN); \
    BIT8_PACK_B(ch, DTYPE_IN);

#define BIT4_PACK_ROUTINES(ch, DTYPE_IN) \
    BIT4_PACK_A(ch, DTYPE_IN); \
    BIT4_PACK_B(ch, DTYPE_IN);

/* 16-bit element types */
BIT16_PACK_ROUTINES(sb,  bfloat16);
BIT16_PACK_ROUTINES(sh,  float16);
BIT16_PACK_ROUTINES(i16, int16_t);

/* 8-bit element type */
BIT8_PACK_ROUTINES(i8, int8_t);

/* 4-bit element type (two values per element) */
BIT4_PACK_ROUTINES(i4, nibbles);

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Templates for packing routines prototypes
#include "bli_sandbox.h"
/* Name of a packing routine: <ch>_pack<mat>, e.g. (i8, A) -> i8_packA. The
   two-level definition lets arguments that are themselves macros expand
   before the ## operators run. */
#define PACK_FUNC_NAME_(prefix, which)  prefix ## _pack ## which
#define PACK_FUNC_NAME(prefix, which)   PACK_FUNC_NAME_(prefix, which)

/* Declares both packing routines for one datatype prefix.

     <ch>_packA( MR, m, k, ap, rs_a, cs_a, apack )
         source ap is m x k with strides rs_a / cs_a; output goes to apack.
     <ch>_packB( NR, k, n, bp, rs_b, cs_b, bpack )
         source bp is k x n with strides rs_b / cs_b; output goes to bpack. */
#define PACK_MACRO_PROTO(ch, DTYPE_IN) \
\
    void PACK_FUNC_NAME(ch, A)( \
        dim_t     MR, \
        int       m, int k, \
        DTYPE_IN* ap, int rs_a, int cs_a, \
        DTYPE_IN* apack \
    ); \
\
    void PACK_FUNC_NAME(ch, B)( \
        dim_t     NR, \
        int       k, int n, \
        DTYPE_IN* bp, int rs_b, int cs_b, \
        DTYPE_IN* bpack \
    );

PACK_MACRO_PROTO(sb,  bfloat16)
PACK_MACRO_PROTO(sh,  float16)
PACK_MACRO_PROTO(i16, int16_t)
PACK_MACRO_PROTO(i8,  int8_t)
PACK_MACRO_PROTO(i4,  nibbles)

View File

@@ -0,0 +1,154 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Using the GENERIC_GEMM template, create GEMM functions for each datatype
#include "generic_gemm.h"
#include "gemm_pack.h"
/*
   GENERIC_GEMM defines bli_<ch>gemm_, a five-loop blocked GEMM
   (C := beta*C + alpha*A*B) built around a packed microkernel.

   Macro parameters
     ch         datatype prefix; selects <ch>_packA / <ch>_packB and the name
     DTYPE_IN   element type of A and B
     DTYPE_OUT  element type of C, alpha and beta
     NEW_PB     expression in terms of the local `pb` (k extent of the
                current block) giving the k-iteration count passed to the
                microkernel
     MULT       number of DTYPE_IN elements each row/column contributes per
                microkernel k iteration; a micro-panel holds
                new_pb * MULT * MR (or NR) elements
     UK_FUNC    microkernel computing one MR x NR tile of C

   Function parameters
     MR, NR          microtile dimensions
     KC, NC, MC      cache block sizes for the k, n and m dimensions
     m, n, k         problem dimensions
     A, B, C         matrices with row/column strides rs_* / cs_*
     A_align,
     B_align         byte offsets applied to the start of the packed A / B
                     buffers
     alpha, beta     scalars

   Loop order is jc (n, step NC), pc (k, step KC), ic (m, step MC),
   jr (step NR), ir (step MR). B is packed once per (jc, pc) block and A once
   per (jc, pc, ic) block. Full tiles are written directly by the
   microkernel; edge tiles are computed into a temporary tile with beta = 0
   and then merged into C.

   Notes
   - beta is applied only on the first k block (pc == 0); later blocks
     accumulate with beta = 1 so that C is not rescaled once per block.
   - The packing routines round k up to a whole group (2, 4 or 8 indices
     depending on the type) and pad short panels to MR / NR, so the pack
     buffers are sized from KC rounded up to a multiple of 8 and MC / NC
     rounded up to multiples of MR / NR.
   - The request size is rounded up to a multiple of the alignment, which
     aligned_alloc() requires in C11.
   - The function returns void, so an allocation failure cannot be reported
     to the caller; the process is aborted instead of dereferencing NULL.
*/
#define GENERIC_GEMM(ch, DTYPE_IN, DTYPE_OUT, NEW_PB, MULT, UK_FUNC) \
\
void GEMM_PASTEMAC(ch) \
    ( \
      dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \
      int m, int n, int k, \
      DTYPE_IN* restrict A, int rs_a, int cs_a, int A_align, \
      DTYPE_IN* restrict B, int rs_b, int cs_b, int B_align, \
      DTYPE_OUT* restrict C, int rs_c, int cs_c, \
      DTYPE_OUT* alpha, DTYPE_OUT* beta \
    ) \
{ \
    DTYPE_OUT zero = 0.0; \
    DTYPE_OUT one = 1.0; \
    DTYPE_OUT beta_ = *beta; \
\
    /* Worst-case packed extents (see the notes in the header comment). */ \
    const size_t mr_ = ( size_t )MR; \
    const size_t nr_ = ( size_t )NR; \
    const size_t pg_ = ( size_t )( P10_PG_SIZE ); \
    const size_t kc_pad = ( ( size_t )KC + 7 ) / 8 * 8; \
    const size_t mc_pad = ( ( size_t )MC + mr_ - 1 ) / mr_ * mr_; \
    const size_t nc_pad = ( ( size_t )NC + nr_ - 1 ) / nr_ * nr_; \
\
    size_t b_bytes = ( size_t )B_align + kc_pad * nc_pad * sizeof( DTYPE_IN ); \
    size_t a_bytes = ( size_t )A_align + mc_pad * kc_pad * sizeof( DTYPE_IN ); \
\
    /* aligned_alloc: size must be a multiple of the alignment. */ \
    b_bytes = ( b_bytes + pg_ - 1 ) / pg_ * pg_; \
    a_bytes = ( a_bytes + pg_ - 1 ) / pg_ * pg_; \
\
    DTYPE_IN * restrict btilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, b_bytes ); \
    DTYPE_IN * restrict atilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, a_bytes ); \
\
    if ( btilde_sys == NULL || atilde_sys == NULL ) \
    { \
        free(btilde_sys); \
        free(atilde_sys); \
        abort(); \
    } \
\
    DTYPE_IN * restrict btilde_usr = ( DTYPE_IN *)((char *)btilde_sys + B_align); \
    DTYPE_IN * restrict atilde_usr = ( DTYPE_IN *)((char *)atilde_sys + A_align); \
\
    const int rstep_c = MC*rs_c; \
    const int cstep_c = NC*cs_c; \
\
    const int rstep_a = MC*rs_a; \
    const int cstep_a = KC*cs_a; \
\
    const int rstep_b = KC*rs_b; \
    const int cstep_b = NC*cs_b; \
\
    const int rstep_mt_c = MR*rs_c; \
    const int cstep_mt_c = NR*cs_c; \
\
    DTYPE_OUT * restrict cblock = C; \
    DTYPE_IN * restrict bblock = B; \
\
    /* Scratch tile for edge cases; laid out to match C's storage order. */ \
    DTYPE_OUT tmp_cmicrotile[MR*NR]; \
    int rs_ct = ( rs_c == 1 ? 1 : NR ); \
    int cs_ct = ( rs_c == 1 ? MR : 1 ); \
\
    for ( int jc=0; jc<n; jc+=NC ) \
    { \
        int jb = bli_min( NC, n-jc ); \
        DTYPE_IN * restrict apanel = A; \
        DTYPE_IN * restrict bpanel = bblock; \
\
        for ( int pc=0; pc<k; pc+=KC ) \
        { \
            int pb = bli_min( KC, k-pc ); \
\
            /* Scale C by beta once; later k blocks only accumulate. */ \
            DTYPE_OUT beta_pc = ( pc == 0 ? beta_ : one ); \
\
            ch ## _packB \
            (NR, pb, jb, bpanel, rs_b, cs_b, btilde_usr); \
\
            int new_pb = NEW_PB; \
            const int a_ps = new_pb * (MULT * MR); \
            const int b_ps = new_pb * (MULT * NR); \
\
            DTYPE_OUT * restrict cpanel = cblock; \
            DTYPE_IN * restrict ablock = apanel; \
\
            for ( int ic=0; ic<m; ic+=MC ) \
            { \
                int ib = bli_min( MC, m-ic ); \
\
                ch ## _packA \
                ( MR, ib, pb, ablock, rs_a, cs_a, atilde_usr ); \
\
                DTYPE_OUT * restrict cmicrotile_col = cpanel; \
                DTYPE_IN * restrict bmicropanel = btilde_usr; \
\
                for ( int jr=0; jr<jb; jr+=NR ) \
                { \
                    int jrb = bli_min( NR, jb-jr ); \
                    DTYPE_OUT * restrict cmicrotile = cmicrotile_col; \
                    DTYPE_IN * restrict amicropanel = atilde_usr; \
\
                    for ( int ir=0; ir<ib; ir+=MR ) \
                    { \
                        int irb = bli_min( MR, ib-ir ); \
\
                        if (jrb == NR && irb == MR) \
                        { \
                            UK_FUNC (new_pb, alpha, amicropanel, bmicropanel, &beta_pc, cmicrotile, rs_c, cs_c, NULL, NULL); \
                        } \
                        else \
                        { \
                            /* Edge tile: compute into scratch, then merge */ \
                            /* only the irb x jrb part that exists in C.   */ \
                            UK_FUNC (new_pb, alpha, amicropanel, bmicropanel, &zero, tmp_cmicrotile, rs_ct, cs_ct, NULL, NULL); \
\
                            for (int j=0; j<jrb;j++) \
                            { \
                                for (int i=0; i<irb;i++) \
                                { \
                                    cmicrotile[i*rs_c + j*cs_c] = \
                                        beta_pc * cmicrotile[i*rs_c + j*cs_c] + \
                                        tmp_cmicrotile[i*rs_ct + j*cs_ct]; \
                                } \
                            } \
                        } \
                        amicropanel += a_ps; \
                        cmicrotile += rstep_mt_c; \
                    } \
                    bmicropanel += b_ps; \
                    cmicrotile_col += cstep_mt_c; \
                } \
                ablock += rstep_a; \
                cpanel += rstep_c; \
            } \
            apanel += cstep_a; \
            bpanel += rstep_b; \
        } \
        cblock += cstep_c; \
        bblock += cstep_b; \
    } \
    free(btilde_sys); \
    free(atilde_sys); \
}
/*
   Instantiate the blocked GEMM for every supported input type.

   NEW_PB is the number of microkernel k iterations for a block of pb k
   indices, i.e. pb divided by the group size (2, 4 or 8) and rounded up.
   The remainder test must be parenthesized: '+' binds tighter than '>', so
   (pb/4 + pb%4>0) would evaluate to ((pb/4 + pb%4) > 0), which is 0 or 1.

   MULT is the number of DTYPE_IN elements each row/column contributes per
   k iteration. For int4 a group of 8 values occupies 4 nibbles elements
   (two values per element), so MULT is 4, matching what the int4 packing
   routines write.
*/
GENERIC_GEMM( sb, bfloat16, float, (pb/2 + pb%2), 2, bli_sbgemm_power10_mma_8x16);
GENERIC_GEMM(i16, int16_t, int, (pb/2 + pb%2), 2, bli_i16gemm_power10_mma_8x16);
GENERIC_GEMM( sh, float16, float, (pb/2 + pb%2), 2, bli_shgemm_power10_mma_8x16);
GENERIC_GEMM( i8, int8_t, int, (pb/4 + (pb%4>0)), 4, bli_i8gemm_power10_mma_8x16);
GENERIC_GEMM( i4, nibbles, int, (pb/8 + (pb%8>0)), 4, bli_i4gemm_power10_mma_8x16);

View File

@@ -0,0 +1,58 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes and template for the 5-loop gemm algorithm
#include "bli_sandbox.h"
/* Name of the blocked GEMM driver for a datatype prefix: bli_<ch>gemm_
   (note the trailing underscore). The two-level definition lets a prefix
   that is itself a macro expand before the ## operators run. */
#define GEMM_PASTEMAC_(prefix)  bli_ ## prefix ## gemm_
#define GEMM_PASTEMAC(prefix)   GEMM_PASTEMAC_(prefix)

/* Expands to the declarator of the blocked GEMM driver for one datatype.
   No trailing semicolon is emitted; the user of the macro supplies it.

     MR, NR       microtile dimensions
     KC, NC, MC   cache block sizes
     m, n, k      problem dimensions
     A, B, C      matrices with row/column strides rs_* / cs_*
     A_align,
     B_align      offsets for the packed A / B buffers
     alpha, beta  scalars */
#define GENERIC_GEMM_PROTO(ch, DTYPE_IN, DTYPE_OUT) \
    void GEMM_PASTEMAC(ch)( \
        dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \
        int m, int n, int k, \
        DTYPE_IN*  restrict A, int rs_a, int cs_a, int A_align, \
        DTYPE_IN*  restrict B, int rs_b, int cs_b, int B_align, \
        DTYPE_OUT* restrict C, int rs_c, int cs_c, \
        DTYPE_OUT* alpha, DTYPE_OUT* beta \
    )

/* floating-point inputs, float accumulation */
GENERIC_GEMM_PROTO( sb, bfloat16, float);
GENERIC_GEMM_PROTO( sh, float16,  float);

/* integer inputs, 32-bit integer accumulation */
GENERIC_GEMM_PROTO(i16, int16_t, int32_t);
GENERIC_GEMM_PROTO( i8, int8_t,  int32_t);
GENERIC_GEMM_PROTO( i4, nibbles, int32_t);

545
sandbox/power10/i4_macros.h Normal file
View File

@@ -0,0 +1,545 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// These macros are used for int4 packing
// zero out 1 nibbles struct
// Clears the element through its `v` member, then advances dest by one
// element. Expands to two statements, so it must be used inside braces.
#define zero_out_full(dest) \
dest->v = 0; \
dest++;
// zero out 4 nibbles struct
// Clears 4 bytes starting at dest (presumably four one-byte nibbles
// elements — TODO confirm sizeof(nibbles) == 1).
// NOTE: unlike zero_out_full, this macro does NOT advance dest. A caller
// that wants the cleared region to occupy space in the output must advance
// the pointer itself.
#define zero_out_dest(dest) \
memset(dest, 0, 4);
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Col Major Order Macros ////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/*
The following macros handle the case when there is a full size panel
(ib/jb == MR/NR) and no edge case (k%8 == 0).
*/
#define col_m_order_1(dest, matrix, rs_mul, rs, cs) \
dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+0)*cs].bits.nib1; \
dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+1)*cs].bits.nib1; \
dest++; \
dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+2)*cs].bits.nib1; \
dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+3)*cs].bits.nib1; \
dest++; \
dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+4)*cs].bits.nib1; \
dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+5)*cs].bits.nib1; \
dest++; \
dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+6)*cs].bits.nib1; \
dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+7)*cs].bits.nib1; \
dest++;
// Full-panel, no-k-edge packer for the nib2 source field: reads
// matrix[rs_mul*rs + (p_idx+n)*cs].bits.nib2 for n = 0..7 and stores the
// values pairwise (nib1, then nib2) into four consecutive dest elements,
// post-incrementing dest after each pair.
// p_idx comes from the expansion scope; dest must be a modifiable pointer
// lvalue. Parameters are parenthesized so expression arguments index
// correctly. Expands to several statements with a trailing semicolon.
#define col_m_order_2(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+0)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+1)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+2)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+3)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+4)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+5)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (p_idx+6)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (p_idx+7)*(cs)].bits.nib2; \
    (dest)++;
/*
  The following macros handle the case when there is a full size panel
  (ib/jb == MR/NR) and there is an edge case (k%8 != 0).

  Each col_m_order_{1,2}_kleftN packs the last N k-indices (k-N .. k-1) of
  one source row into four dest elements, zero-filling the unused nibble
  slots, and advances dest by four. The _1 variants read the nib1 source
  field, the _2 variants read nib2. k is not a parameter: it is taken from
  the scope where the macro is expanded.

  col_m_order_1_kleft7: seven nib1 values from
      matrix[rs_mul*rs + (k-7 .. k-1)*cs]
  followed by a single zero nibble. Parameters are parenthesized so that
  expression arguments index correctly. Expands to several statements with
  a trailing semicolon; brace it inside if/else or loops.
*/
#define col_m_order_1_kleft7(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-7)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = 0; \
    (dest)++;
// k-edge packer, 7 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-7 .. k-1)*cs].bits.nib2 into four dest elements,
// zeroes the final nibble slot, and advances dest by four.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft7(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-7)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = 0; \
    (dest)++;
// k-edge packer, 6 leftover k indices, nib1 source field: stores
// matrix[rs_mul*rs + (k-6 .. k-1)*cs].bits.nib1 into three dest elements,
// zeroes a fourth via zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft6(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)++; \
    zero_out_full((dest));
// k-edge packer, 6 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-6 .. k-1)*cs].bits.nib2 into three dest elements,
// zeroes a fourth via zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft6(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-6)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)++; \
    zero_out_full((dest));
// k-edge packer, 5 leftover k indices, nib1 source field: stores
// matrix[rs_mul*rs + (k-5 .. k-1)*cs].bits.nib1 into three dest elements
// (the sixth nibble slot is zeroed), zeroes a fourth element via
// zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft5(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest));
// k-edge packer, 5 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-5 .. k-1)*cs].bits.nib2 into three dest elements
// (the sixth nibble slot is zeroed), zeroes a fourth element via
// zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft5(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-5)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest));
// k-edge packer, 4 leftover k indices, nib1 source field: stores
// matrix[rs_mul*rs + (k-4 .. k-1)*cs].bits.nib1 into two dest elements,
// zeroes two more via zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft4(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 4 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-4 .. k-1)*cs].bits.nib2 into two dest elements,
// zeroes two more via zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft4(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-4)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 3 leftover k indices, nib1 source field: stores
// matrix[rs_mul*rs + (k-3 .. k-1)*cs].bits.nib1 into two dest elements
// (the fourth nibble slot is zeroed), zeroes two more elements via
// zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft3(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 3 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-3 .. k-1)*cs].bits.nib2 into two dest elements
// (the fourth nibble slot is zeroed), zeroes two more elements via
// zero_out_full, and advances dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft3(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-3)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)++; \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 2 leftover k indices, nib1 source field: stores
// matrix[rs_mul*rs + (k-2)*cs] and matrix[rs_mul*rs + (k-1)*cs] (.bits.nib1)
// into one dest element, zeroes three more via zero_out_full, and advances
// dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft2(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 2 leftover k indices, nib2 source field: stores
// matrix[rs_mul*rs + (k-2)*cs] and matrix[rs_mul*rs + (k-1)*cs] (.bits.nib2)
// into one dest element, zeroes three more via zero_out_full, and advances
// dest by four in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft2(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-2)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 1 leftover k index, nib1 source field: stores
// matrix[rs_mul*rs + (k-1)*cs].bits.nib1 into dest's nib1 (nib2 is zeroed),
// zeroes three more elements via zero_out_full, and advances dest by four
// in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_1_kleft1(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib1; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest)); \
    zero_out_full((dest));
// k-edge packer, 1 leftover k index, nib2 source field: stores
// matrix[rs_mul*rs + (k-1)*cs].bits.nib2 into dest's nib1 (nib2 is zeroed),
// zeroes three more elements via zero_out_full, and advances dest by four
// in total.
// k comes from the expansion scope; parameters are parenthesized so that
// expression arguments index correctly. Expands to several statements.
#define col_m_order_2_kleft1(dest, matrix, rs_mul, rs, cs) \
    (dest)->bits.nib1 = (matrix)[(rs_mul)*(rs) + (k-1)*(cs)].bits.nib2; \
    (dest)->bits.nib2 = 0; \
    (dest)++; \
    zero_out_full((dest)); \
    zero_out_full((dest)); \
    zero_out_full((dest));
/*
The following macros are used when we have a full panel (ib == MR)
and we need to handle an edge case (k%8 != 0).
The MR loop is unrolled resulting in the stream of macros.

Each apad_col_kleftN expands to eight invocations, in this order:
col_m_order_1_kleftN then col_m_order_2_kleftN for the row multipliers
i, i+1, i+2 and i+3. `i` is not a parameter; it is taken from the scope
where the macro is expanded (as is `k`, which the invoked macros read).
dest, matrix, rs and cs are forwarded unchanged. Since every invoked
macro advances dest by four elements, dest ends 32 elements further on.
*/
#define apad_col_kleft7(dest, matrix, rs, cs) \
col_m_order_1_kleft7(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 6 leftover k indices: col_m_order_1_kleft6 then
// col_m_order_2_kleft6 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft6(dest, matrix, rs, cs) \
col_m_order_1_kleft6(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 5 leftover k indices: col_m_order_1_kleft5 then
// col_m_order_2_kleft5 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft5(dest, matrix, rs, cs) \
col_m_order_1_kleft5(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 4 leftover k indices: col_m_order_1_kleft4 then
// col_m_order_2_kleft4 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft4(dest, matrix, rs, cs) \
col_m_order_1_kleft4(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 3 leftover k indices: col_m_order_1_kleft3 then
// col_m_order_2_kleft3 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft3(dest, matrix, rs, cs) \
col_m_order_1_kleft3(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 2 leftover k indices: col_m_order_1_kleft2 then
// col_m_order_2_kleft2 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft2(dest, matrix, rs, cs) \
col_m_order_1_kleft2(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (i+3), rs, cs);
// Unrolled stream for 1 leftover k index: col_m_order_1_kleft1 then
// col_m_order_2_kleft1 for row multipliers i .. i+3 (eight invocations).
// `i` and `k` come from the expansion scope; dest advances by 32 elements.
#define apad_col_kleft1(dest, matrix, rs, cs) \
col_m_order_1_kleft1(dest, matrix, (i ), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (i ), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (i+1), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (i+1), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (i+2), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (i+2), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (i+3), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (i+3), rs, cs);
/*
The following macros are used when we have a full panel (jb == NR)
and we need to handle an edge case (k%8 != 0).
The NR loop is unrolled resulting in the stream of macros.

Each bpad_col_kleftN expands to sixteen invocations, in this order:
col_m_order_1_kleftN then col_m_order_2_kleftN for the row multipliers
j, j+1, ..., j+7. `j` is not a parameter; it is taken from the scope
where the macro is expanded (as is `k`, which the invoked macros read).
dest, matrix, rs and cs are forwarded unchanged. Since every invoked
macro advances dest by four elements, dest ends 64 elements further on.
*/
#define bpad_col_kleft7(dest, matrix, rs, cs) \
col_m_order_1_kleft7(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft7(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft7(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 6 leftover k indices: col_m_order_1_kleft6 then
// col_m_order_2_kleft6 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft6(dest, matrix, rs, cs) \
col_m_order_1_kleft6(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft6(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft6(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 5 leftover k indices: col_m_order_1_kleft5 then
// col_m_order_2_kleft5 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft5(dest, matrix, rs, cs) \
col_m_order_1_kleft5(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft5(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft5(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 4 leftover k indices: col_m_order_1_kleft4 then
// col_m_order_2_kleft4 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft4(dest, matrix, rs, cs) \
col_m_order_1_kleft4(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft4(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft4(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 3 leftover k indices: col_m_order_1_kleft3 then
// col_m_order_2_kleft3 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft3(dest, matrix, rs, cs) \
col_m_order_1_kleft3(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft3(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft3(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 2 leftover k indices: col_m_order_1_kleft2 then
// col_m_order_2_kleft2 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft2(dest, matrix, rs, cs) \
col_m_order_1_kleft2(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft2(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft2(dest, matrix, (j+7), rs, cs);
// Unrolled stream for 1 leftover k index: col_m_order_1_kleft1 then
// col_m_order_2_kleft1 for row multipliers j .. j+7 (sixteen invocations).
// `j` and `k` come from the expansion scope; dest advances by 64 elements.
#define bpad_col_kleft1(dest, matrix, rs, cs) \
col_m_order_1_kleft1(dest, matrix, (j ), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j ), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+1), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+1), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+2), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+2), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+3), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+3), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+4), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+4), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+5), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+5), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+6), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+6), rs, cs); \
col_m_order_1_kleft1(dest, matrix, (j+7), rs, cs); \
col_m_order_2_kleft1(dest, matrix, (j+7), rs, cs);
/*
  The following macros handle non full size panels (ib/jb != MR/NR) and
  edge cases (k%8 != 0).

  edge is the generic form: edgefun is the suffix of the col_m_order_1_ /
  col_m_order_2_ macro pair to invoke (e.g. kleft7). For ir = 0 .. left-1
  it invokes col_m_order_1_<edgefun> when ir is even and
  col_m_order_2_<edgefun> when ir is odd, both with row multiplier
  panel + ir/2, so consecutive iterations take the nib1 and then the nib2
  field of the same source row.

  The macro name is formed with a single ## paste; the argument list must
  NOT be pasted onto it, because an identifier followed by '(' is not a
  valid preprocessing token and compilers reject it. All other parameters
  are parenthesized so that expression arguments behave as expected.
  The expansion is one for statement.
*/
#define edge(edgefun, dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_ ## edgefun ((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_ ## edgefun ((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 7 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft7 (ir even) or col_m_order_2_kleft7 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge7(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft7((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft7((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 6 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft6 (ir even) or col_m_order_2_kleft6 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge6(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft6((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft6((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 5 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft5 (ir even) or col_m_order_2_kleft5 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge5(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft5((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft5((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 4 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft4 (ir even) or col_m_order_2_kleft4 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge4(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft4((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft4((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 3 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft3 (ir even) or col_m_order_2_kleft3 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge3(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft3((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft3((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 2 leftover k indices. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft2 (ir even) or col_m_order_2_kleft2 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge2(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft2((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft2((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }
// Partial-panel packer for 1 leftover k index. For ir = 0 .. left-1 it
// invokes col_m_order_1_kleft1 (ir even) or col_m_order_2_kleft1 (ir odd)
// with row multiplier panel + ir/2, i.e. nib1 then nib2 of each source row.
// Parameters are parenthesized so that expression arguments (for example a
// conditional expression passed as left) keep their meaning. The expansion
// is one for statement; `k` is read by the invoked macros.
#define edge1(dest, matrix, panel, left, rs, cs) \
    for (int ir = 0; ir < (left); ir++) { \
        if (ir % 2 == 0) { \
            col_m_order_1_kleft1((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
        else { \
            col_m_order_2_kleft1((dest), (matrix), ((panel) + ir/2), (rs), (cs)); \
        } \
    }