From 670bc7b60f6065893e8ec1bebd2fc9e5ba710dff Mon Sep 17 00:00:00 2001 From: Nicholai Tukanov Date: Fri, 5 Mar 2021 13:53:43 -0600 Subject: [PATCH] Add low-precision POWER10 gemm kernels (#467) Details: - This commit adds a new BLIS sandbox that (1) provides implementations based on low-precision gemm kernels, and (2) extends the BLIS typed API for those new implementations. Currently, these new kernels can only be used for the POWER10 microarchitecture; however, they may provide a template for developing similar kernels for other microarchitectures (even those beyond POWER), as changes would likely be limited to select places in the microkernel and possibly the packing routines. The new low-precision operations that are now supported include: shgemm, sbgemm, i16gemm, i8gemm, i4gemm. For more information, refer to the POWER10.md document that is included in 'sandbox/power10'. --- config/power10/bli_cntx_init_power10.c | 6 +- config/power10/make_defs.mk | 7 +- frame/3/bli_l3_ukr_prot.h | 14 +- kernels/power10/3/bli_dgemm_power10_mma.c | 192 ++++ kernels/power10/3/bli_gemm_power10_mma.c | 359 -------- kernels/power10/3/bli_i16gemm_power10_mma.c | 140 +++ kernels/power10/3/bli_i16sgemm_power10_mma.c | 140 +++ kernels/power10/3/bli_i4gemm_power10_mma.c | 140 +++ kernels/power10/3/bli_i8gemm_power10_mma.c | 139 +++ kernels/power10/3/bli_sbgemm_power10_mma.c | 141 +++ kernels/power10/3/bli_sgemm_power10_mma.c | 144 +++ kernels/power10/3/bli_shgemm_power10_mma.c | 141 +++ kernels/power10/3/vector_int_macros.h | 71 ++ kernels/power10/bli_kernels_power10.h | 3 +- sandbox/power10/POWER10.md | 71 ++ sandbox/power10/bli_gemmnat.c | 71 ++ sandbox/power10/bli_sandbox.h | 115 +++ sandbox/power10/gemm_api.c | 77 ++ sandbox/power10/gemm_api.h | 53 ++ sandbox/power10/gemm_pack.c | 889 +++++++++++++++++++ sandbox/power10/gemm_pack.h | 64 ++ sandbox/power10/generic_gemm.c | 154 ++++ sandbox/power10/generic_gemm.h | 58 ++ sandbox/power10/i4_macros.h | 545 ++++++++++++ 24 files changed, 
3363 insertions(+), 371 deletions(-) create mode 100644 kernels/power10/3/bli_dgemm_power10_mma.c delete mode 100644 kernels/power10/3/bli_gemm_power10_mma.c create mode 100644 kernels/power10/3/bli_i16gemm_power10_mma.c create mode 100644 kernels/power10/3/bli_i16sgemm_power10_mma.c create mode 100644 kernels/power10/3/bli_i4gemm_power10_mma.c create mode 100644 kernels/power10/3/bli_i8gemm_power10_mma.c create mode 100644 kernels/power10/3/bli_sbgemm_power10_mma.c create mode 100644 kernels/power10/3/bli_sgemm_power10_mma.c create mode 100644 kernels/power10/3/bli_shgemm_power10_mma.c create mode 100644 kernels/power10/3/vector_int_macros.h create mode 100644 sandbox/power10/POWER10.md create mode 100644 sandbox/power10/bli_gemmnat.c create mode 100644 sandbox/power10/bli_sandbox.h create mode 100644 sandbox/power10/gemm_api.c create mode 100644 sandbox/power10/gemm_api.h create mode 100644 sandbox/power10/gemm_pack.c create mode 100644 sandbox/power10/gemm_pack.h create mode 100644 sandbox/power10/generic_gemm.c create mode 100644 sandbox/power10/generic_gemm.h create mode 100644 sandbox/power10/i4_macros.h diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 564e725e7..14c940f99 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -122,9 +122,9 @@ void bli_cntx_init_power10( cntx_t* cntx ) // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 576, 576, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1408, 1408, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8192, 8184, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 832, 320, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1026, 960, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); // Update the context with the current architecture's register and cache diff 
--git a/config/power10/make_defs.mk b/config/power10/make_defs.mk index 749b88b34..2c3f7cd7b 100644 --- a/config/power10/make_defs.mk +++ b/config/power10/make_defs.mk @@ -66,8 +66,12 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=power10 -mtune=power10 else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -mcpu=power10 -mtune=power10 +else $(info $(CC_VENDOR)) -$(error gcc is required for this configuration.) +$(error gcc, clang is required for this configuration.) +endif endif # Flags specific to reference kernels. @@ -77,4 +81,3 @@ CRVECFLAGS := $(CKVECFLAGS) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) - diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index 80733897b..ca523b1d7 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -36,16 +36,18 @@ // Define template prototypes for level-3 micro-kernels. // -#define GEMM_UKR_PROT( ctype, ch, opname ) \ +#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) + +#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype_out* restrict alpha, \ + ctype_in* restrict a, \ + ctype_in* restrict b, \ + ctype_out* restrict beta, \ + ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c new file mode 100644 index 000000000..83f1c1dc5 --- /dev/null +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +#include "vector_int_macros.h" + +#define D_ASSEMBLE_VEC_PAIR \ + __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \ + __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); + +#define D_ACCUMULATE \ + __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \ + __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \ + __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \ + __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \ + __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \ + __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \ + __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \ + __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); + +#define D_INCREMENT \ + A0+=8; \ + B0+=8; + +#define D_AB_PRODUCT \ + LOAD_VECTORS \ + D_ASSEMBLE_VEC_PAIR \ + D_INCREMENT \ + D_ACCUMULATE + + +void bli_dgemm_power10_mma_8x8 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out) + uint64_t k_iter = (k0-1) / 4; + uint64_t k_left = (k0-1) % 4; + + uint64_t rs_c = rs_c0; + + double* restrict A0 = a; + double* restrict B0 = b; + double* restrict C0 = c; + + double alpha_ = *alpha, + beta_ = *beta; + + dv4sf_t result[4]; + dv4sf_t *rowC; + + /* 8 accumulator registers that will be used to store the result. + + Each accumulator register is mapped to 4 vector registers. + Illustration: + + acc0 = [ vs0 + vs1 + vs3 + vs4 ] + + These registers are used to store the result of an outer product + instruction (general outer product instruction syntax: xv???ger??). */ + __vector_quad acc0, acc1, acc2, acc3, + acc4, acc5, acc6, acc7; + + /* 2 vector pairs are necessary for a double precision outer product + instruction. 
*/ + __vector_pair colA_1, + colA_2; + + /* Prefetch C so that it stays in cache */ + PREFETCH1 (C0, 0); + PREFETCH1 (C0 + rs_c, 0); + PREFETCH1 (C0 + rs_c + rs_c, 0); + PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0); + PREFETCH1 (C0, 128); + PREFETCH1 (C0 + rs_c, 128); + PREFETCH1 (C0 + rs_c + rs_c, 128); + PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128); + + /* Load elements into vector registers */ + vec_t *ca = (vec_t *) A0; + vec_t *rb = (vec_t *) B0; + + /* Each accumulator represents a matrix of size + 4 x ( 16 / (datatype size in bytes) ) (vector register size = 16B) + + Thus in the case of double, the accumulate registers represent a 4x2 + matrix. However, a vector register can hold at most 2 doubles. Thus, if + we performed an outer product using 2 vector register, we can only get a + 2x2 matrix. Therefore, we must create a vector register pair in order + to get the desired 4x2 matrix. + + */ + D_ASSEMBLE_VEC_PAIR + + /* Compute accumulate outer products and override accumulators with result */ + __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]); + __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]); + __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]); + __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]); + __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]); + __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]); + __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]); + __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]); + + /* Move A and B pointers */ + D_INCREMENT + + // k loop (unrolled by 4) + for (int k = 0; k0), 4, bli_i8gemm_power10_mma_8x16); +GENERIC_GEMM( i4, nibbles, int, (pb/8 + pb%8>0), 8, bli_i4gemm_power10_mma_8x16); diff --git a/sandbox/power10/generic_gemm.h b/sandbox/power10/generic_gemm.h new file mode 100644 index 000000000..8b1a16dc9 --- /dev/null +++ b/sandbox/power10/generic_gemm.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// Prototypes and template for the 5-loop gemm algorithm + +#include "bli_sandbox.h" + +#define GEMM_PASTEMAC_(ch) bli_ ## ch ## gemm_ +#define GEMM_PASTEMAC(ch) GEMM_PASTEMAC_(ch) + +#define GENERIC_GEMM_PROTO(ch, DTYPE_IN, DTYPE_OUT) \ +void GEMM_PASTEMAC(ch) \ + ( \ + dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \ + int m, int n, int k, \ + DTYPE_IN* restrict A, int rs_a, int cs_a, int A_align, \ + DTYPE_IN* restrict B, int rs_b, int cs_b, int B_align, \ + DTYPE_OUT* restrict C, int rs_c, int cs_c, \ + DTYPE_OUT* alpha, DTYPE_OUT* beta \ + ) + +GENERIC_GEMM_PROTO( sb, bfloat16, float); +GENERIC_GEMM_PROTO( sh, float16, float); +GENERIC_GEMM_PROTO(i16, int16_t, int32_t); +GENERIC_GEMM_PROTO( i8, int8_t, int32_t); +GENERIC_GEMM_PROTO( i4, nibbles, int32_t); + diff --git a/sandbox/power10/i4_macros.h b/sandbox/power10/i4_macros.h new file mode 100644 index 000000000..f4500bc93 --- /dev/null +++ b/sandbox/power10/i4_macros.h @@ -0,0 +1,545 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// These macros are used for int4 packing + +// zero out 1 nibbles struct +#define zero_out_full(dest) \ + dest->v = 0; \ + dest++; + +// zero out 4 nibbles struct +#define zero_out_dest(dest) \ + memset(dest, 0, 4); + + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////// Col Major Order Macros //////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/* + + The following macros handle the case when there is a full size panel + (ib/jb == MR/NR) and no edge case (k%8 == 0). 
+ +*/ + +#define col_m_order_1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+0)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+1)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+5)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+6)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+7)*cs].bits.nib1; \ + dest++; + +#define col_m_order_2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+0)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+1)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+5)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+6)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+7)*cs].bits.nib2; \ + dest++; + +/* + + The following macros handle the case when there is a full size panel + (ib/jb == MR/NR) and there is an edge case (k%8 != 0). 
+ +*/ + +#define col_m_order_1_kleft7(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-7)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-6)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; + +#define col_m_order_2_kleft7(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-7)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-6)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; + +#define col_m_order_1_kleft6(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-6)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_2_kleft6(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-6)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = 
matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_1_kleft5(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_2_kleft5(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_1_kleft4(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft4(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_1_kleft3(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + 
(k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft3(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_1_kleft2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_1_kleft1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +/* + + + The following macros are used when we have a full panel (ib == MR) + and we need to handle an edge case (k%8 != 0). + + The MR loop is unrolled resulting in the stream of macros. 
+ +*/ + +#define apad_col_kleft7(dest, matrix, rs, cs) \ + col_m_order_1_kleft7(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft6(dest, matrix, rs, cs) \ + col_m_order_1_kleft6(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft5(dest, matrix, rs, cs) \ + col_m_order_1_kleft5(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft4(dest, matrix, rs, cs) \ + col_m_order_1_kleft4(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft3(dest, matrix, rs, 
cs) \ + col_m_order_1_kleft3(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft2(dest, matrix, rs, cs) \ + col_m_order_1_kleft2(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft1(dest, matrix, rs, cs) \ + col_m_order_1_kleft1(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+3), rs, cs); + +/* + + The following macros are used when we have a full panel (jb == NR) + and we need to handle an edge case (k%8 != 0). + + The NR loop is unrolled resulting in the stream of macros. 
+ +*/ + +#define bpad_col_kleft7(dest, matrix, rs, cs) \ + col_m_order_1_kleft7(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft6(dest, matrix, rs, cs) \ + col_m_order_1_kleft6(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft5(dest, matrix, rs, cs) \ + col_m_order_1_kleft5(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft5(dest, 
matrix, (j ), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft4(dest, matrix, rs, cs) \ + col_m_order_1_kleft4(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft3(dest, matrix, rs, cs) \ + col_m_order_1_kleft3(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+1), rs, cs); \ + 
col_m_order_1_kleft3(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft2(dest, matrix, rs, cs) \ + col_m_order_1_kleft2(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft1(dest, matrix, rs, cs) \ + col_m_order_1_kleft1(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, 
(j+3), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+7), rs, cs); + + +/* + + The following macros handle non full size panels (ib/jb != MR/NR) and + edge cases (k%8 != 0). + +*/ + +#define edge(edgefun, dest, matrix, panel, left, rs, cs) \ + for (int ir=0; ir