diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 564e725e7..14c940f99 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -122,9 +122,9 @@ void bli_cntx_init_power10( cntx_t* cntx ) // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 576, 576, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1408, 1408, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8192, 8184, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 832, 320, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1026, 960, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); // Update the context with the current architecture's register and cache diff --git a/config/power10/make_defs.mk b/config/power10/make_defs.mk index 749b88b34..2c3f7cd7b 100644 --- a/config/power10/make_defs.mk +++ b/config/power10/make_defs.mk @@ -66,8 +66,12 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=power10 -mtune=power10 else +ifeq ($(CC_VENDOR),clang) +CKVECFLAGS := -mcpu=power10 -mtune=power10 +else $(info $(CC_VENDOR)) -$(error gcc is required for this configuration.) +$(error gcc, clang is required for this configuration.) +endif endif # Flags specific to reference kernels. @@ -77,4 +81,3 @@ CRVECFLAGS := $(CKVECFLAGS) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) - diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index 80733897b..ca523b1d7 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -36,16 +36,18 @@ // Define template prototypes for level-3 micro-kernels. 
// -#define GEMM_UKR_PROT( ctype, ch, opname ) \ +#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) + +#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype_out* restrict alpha, \ + ctype_in* restrict a, \ + ctype_in* restrict b, \ + ctype_out* restrict beta, \ + ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c new file mode 100644 index 000000000..83f1c1dc5 --- /dev/null +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#include "vector_int_macros.h" + +#define D_ASSEMBLE_VEC_PAIR \ + __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \ + __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); + +#define D_ACCUMULATE \ + __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \ + __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \ + __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \ + __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \ + __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \ + __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \ + __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \ + __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); + +#define D_INCREMENT \ + A0+=8; \ + B0+=8; + +#define D_AB_PRODUCT \ + LOAD_VECTORS \ + D_ASSEMBLE_VEC_PAIR \ + D_INCREMENT \ + D_ACCUMULATE + + +void bli_dgemm_power10_mma_8x8 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. 
+ // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out) + uint64_t k_iter = (k0-1) / 4; + uint64_t k_left = (k0-1) % 4; + + uint64_t rs_c = rs_c0; + + double* restrict A0 = a; + double* restrict B0 = b; + double* restrict C0 = c; + + double alpha_ = *alpha, + beta_ = *beta; + + dv4sf_t result[4]; + dv4sf_t *rowC; + + /* 8 accumulator registers that will be used to store the result. + + Each accumulator register is mapped to 4 vector registers. + Illustration: + + acc0 = [ vs0 + vs1 + vs2 + vs3 ] + + These registers are used to store the result of an outer product + instruction (general outer product instruction syntax: xv???ger??). */ + __vector_quad acc0, acc1, acc2, acc3, + acc4, acc5, acc6, acc7; + + /* 2 vector pairs are necessary for a double precision outer product + instruction. */ + __vector_pair colA_1, + colA_2; + + /* Prefetch C so that it stays in cache */ + PREFETCH1 (C0, 0); + PREFETCH1 (C0 + rs_c, 0); + PREFETCH1 (C0 + rs_c + rs_c, 0); + PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0); + PREFETCH1 (C0, 128); + PREFETCH1 (C0 + rs_c, 128); + PREFETCH1 (C0 + rs_c + rs_c, 128); + PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128); + + /* Load elements into vector registers */ + vec_t *ca = (vec_t *) A0; + vec_t *rb = (vec_t *) B0; + + /* Each accumulator represents a matrix of size + 4 x ( 16 / (datatype size in bytes) ) (vector register size = 16B) + + Thus in the case of double, the accumulator registers represent a 4x2 + matrix. However, a vector register can hold at most 2 doubles. Thus, if + we performed an outer product using 2 vector registers, we can only get a + 2x2 matrix. Therefore, we must create a vector register pair in order + to get the desired 4x2 matrix.
+ + */ + D_ASSEMBLE_VEC_PAIR + + /* Compute accumulate outer products and overwrite accumulators with result */ + __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]); + __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]); + __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]); + __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]); + __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]); + __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]); + __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]); + __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]); + + /* Move A and B pointers */ + D_INCREMENT + + // k loop (unrolled by 4) + for (int k = 0; k0), 4, bli_i8gemm_power10_mma_8x16); +GENERIC_GEMM( i4, nibbles, int, (pb/8 + (pb%8>0)), 8, bli_i4gemm_power10_mma_8x16); /* remainder test parenthesized: without it the argument parsed as (pb/8 + pb%8) > 0, i.e. 0 or 1 */ diff --git a/sandbox/power10/generic_gemm.h b/sandbox/power10/generic_gemm.h new file mode 100644 index 000000000..8b1a16dc9 --- /dev/null +++ b/sandbox/power10/generic_gemm.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Prototypes and template for the 5-loop gemm algorithm + +#include "bli_sandbox.h" + +#define GEMM_PASTEMAC_(ch) bli_ ## ch ## gemm_ +#define GEMM_PASTEMAC(ch) GEMM_PASTEMAC_(ch) + +#define GENERIC_GEMM_PROTO(ch, DTYPE_IN, DTYPE_OUT) \ +void GEMM_PASTEMAC(ch) \ + ( \ + dim_t MR, dim_t NR, dim_t KC, dim_t NC, dim_t MC, \ + int m, int n, int k, \ + DTYPE_IN* restrict A, int rs_a, int cs_a, int A_align, \ + DTYPE_IN* restrict B, int rs_b, int cs_b, int B_align, \ + DTYPE_OUT* restrict C, int rs_c, int cs_c, \ + DTYPE_OUT* alpha, DTYPE_OUT* beta \ + ) + +GENERIC_GEMM_PROTO( sb, bfloat16, float); +GENERIC_GEMM_PROTO( sh, float16, float); +GENERIC_GEMM_PROTO(i16, int16_t, int32_t); +GENERIC_GEMM_PROTO( i8, int8_t, int32_t); +GENERIC_GEMM_PROTO( i4, nibbles, int32_t); + diff --git a/sandbox/power10/i4_macros.h b/sandbox/power10/i4_macros.h new file mode 100644 index 000000000..f4500bc93 --- /dev/null +++ b/sandbox/power10/i4_macros.h @@ -0,0 +1,545 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// These macros are used for int4 packing + +// zero out 1 nibbles struct and advance dest past it +#define zero_out_full(dest) \ + dest->v = 0; \ + dest++; + +// zero out 4 bytes at dest (4 nibbles structs, assuming 1 byte each -- TODO confirm); dest is not advanced +#define zero_out_dest(dest) \ + memset(dest, 0, 4); + + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////// Col Major Order Macros //////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/* + + The following macros handle the case when there is a full size panel + (ib/jb == MR/NR) and no edge case (k%8 == 0). They read p_idx from the scope in which they are expanded; the _1 variant gathers the nib1 fields and the _2 variant the nib2 fields. + +*/ + +#define col_m_order_1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+0)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+1)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+5)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+6)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+7)*cs].bits.nib1; \ + dest++; + +#define col_m_order_2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+0)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+1)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (p_idx+5)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (p_idx+6)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs
+ (p_idx+7)*cs].bits.nib2; \ + dest++; + +/* + + The following macros handle the case when there is a full size panel + (ib/jb == MR/NR) and there is an edge case (k%8 != 0). + +*/ + +#define col_m_order_1_kleft7(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-7)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-6)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; + +#define col_m_order_2_kleft7(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-7)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-6)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; + +#define col_m_order_1_kleft6(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-6)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_2_kleft6(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-6)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; 
\ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_1_kleft5(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_2_kleft5(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-5)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); + +#define col_m_order_1_kleft4(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft4(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-4)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + 
+#define col_m_order_1_kleft3(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft3(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-3)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest++; \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_1_kleft2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib1; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft2(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-2)*cs].bits.nib2; \ + dest->bits.nib2 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_1_kleft1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib1; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +#define col_m_order_2_kleft1(dest, matrix, rs_mul, rs, cs) \ + dest->bits.nib1 = matrix[rs_mul*rs + (k-1)*cs].bits.nib2; \ + dest->bits.nib2 = 0; \ + dest++; \ + zero_out_full(dest); \ + zero_out_full(dest); \ + zero_out_full(dest); + +/* + + + The following macros are used when we have a full panel (ib == MR) + and we need to handle an edge case (k%8 != 0). + + The MR loop is unrolled resulting in the stream of macros. 
+ +*/ + +#define apad_col_kleft7(dest, matrix, rs, cs) \ + col_m_order_1_kleft7(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft6(dest, matrix, rs, cs) \ + col_m_order_1_kleft6(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft5(dest, matrix, rs, cs) \ + col_m_order_1_kleft5(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft4(dest, matrix, rs, cs) \ + col_m_order_1_kleft4(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft3(dest, matrix, rs, 
cs) \ + col_m_order_1_kleft3(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft2(dest, matrix, rs, cs) \ + col_m_order_1_kleft2(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (i+3), rs, cs); + +#define apad_col_kleft1(dest, matrix, rs, cs) \ + col_m_order_1_kleft1(dest, matrix, (i ), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i ), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+1), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+1), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+2), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+2), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (i+3), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (i+3), rs, cs); + +/* + + The following macros are used when we have a full panel (jb == NR) + and we need to handle an edge case (k%8 != 0). + + The NR loop is unrolled resulting in the stream of macros. 
+ +*/ + +#define bpad_col_kleft7(dest, matrix, rs, cs) \ + col_m_order_1_kleft7(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft7(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft7(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft6(dest, matrix, rs, cs) \ + col_m_order_1_kleft6(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft6(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft6(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft5(dest, matrix, rs, cs) \ + col_m_order_1_kleft5(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft5(dest, 
matrix, (j ), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft5(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft5(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft4(dest, matrix, rs, cs) \ + col_m_order_1_kleft4(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft4(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft4(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft3(dest, matrix, rs, cs) \ + col_m_order_1_kleft3(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+1), rs, cs); \ + 
col_m_order_1_kleft3(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft3(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft3(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft2(dest, matrix, rs, cs) \ + col_m_order_1_kleft2(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+3), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft2(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft2(dest, matrix, (j+7), rs, cs); + +#define bpad_col_kleft1(dest, matrix, rs, cs) \ + col_m_order_1_kleft1(dest, matrix, (j ), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j ), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+1), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+1), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+2), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+2), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, 
(j+3), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+3), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+4), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+4), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+5), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+5), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+6), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+6), rs, cs); \ + col_m_order_1_kleft1(dest, matrix, (j+7), rs, cs); \ + col_m_order_2_kleft1(dest, matrix, (j+7), rs, cs); + + +/* + + The following macros handle non full size panels (ib/jb != MR/NR) and + edge cases (k%8 != 0). + +*/ + +#define edge(edgefun, dest, matrix, panel, left, rs, cs) \ + for (int ir=0; ir