diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index 979c26dea..af84d0588 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -4,11 +4,7 @@ BLIS An object-based framework for developing high-performance BLAS-like libraries. -<<<<<<< HEAD Copyright (C) 2018-2019, Advanced Micro Devices, Inc. -======= -Copyright (C) 2018, Advanced Micro Devices, Inc. ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -30,24 +26,15 @@ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -<<<<<<< HEAD THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -======= -THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -<<<<<<< HEAD -======= -//#define BLIS_ENABLE_SMALL_MATRIX_TRSM ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B #include "blis.h" #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM #include "immintrin.h" -<<<<<<< HEAD #define GEMM_BLK_V1 8 //Block size to perform gemm and apply trsm #define GEMM_ACCUM_A 1 //Peform B1=B1-(B0*A0) operation instead of B1'=(B0*A0) and then B1=B1-B1' #define OPT_CACHE_BLOCKING_L1 1 //Perform trsm block-wise in blocks of GEMM_BLK_V1 instead of all columns of B together. @@ -354,91 +341,6 @@ static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, int cs_b, float alpha); -======= - -static void (*fp_blis_strsm_microkernel)( float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b - ); -static void blis_strsm_microkernel( float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b - ); -static void blis_strsm_microkernel_alpha( float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b, - float alphaVal - ); -static void blis_strsm_microkernel_unitDiag( float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b - ); -static void blis_strsm_microkernel_alpha_unitDiag( float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b, - float alphaVal - ); -static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b); -static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b, - float alphaVal); -static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b); -static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, - float *ptr_b, - int numRows_lb, - int numCols_b, - int rs_l, - int rs_b, - int cs_l, - int cs_b, - float alphaVal); - ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B //AX = B; A is lower triangular; No transpose; single precision static err_t bli_strsm_small_AlXB ( @@ -449,21 +351,6 @@ static err_t bli_strsm_small_AlXB cntx_t* cntx, cntl_t* cntl ); -<<<<<<< HEAD -======= - -//AX = B; A is lower triangular; No transpose; double precision -static err_t bli_dtrsm_small_AlXB - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ); - ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B //A.'X = B; A is upper triangular; A has to be transposed; single precision static err_t bli_strsm_small_AutXB ( @@ -475,20 +362,6 @@ static err_t bli_strsm_small_AutXB cntl_t* cntl ); -<<<<<<< HEAD -======= -//A.'X = B; A is upper triangular; A has to be transposed; double precision -static err_t bli_dtrsm_small_AutXB - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ); - ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B //XA.' = B; A is lower triangular; A has to be transposed; single precision static err_t bli_strsm_small_XAltB ( @@ -500,13 +373,8 @@ static err_t bli_strsm_small_XAltB cntl_t* cntl ); -<<<<<<< HEAD //A.'X = B; A is upper triangular; A has to be transposed; double precision static err_t bli_dtrsm_small_AutXB -======= -//XA.' = B; A is lower triangular; A has to be transposed; double precision -static err_t bli_dtrsm_small_XAltB ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B ( side_t side, obj_t* alpha, @@ -515,11 +383,7 @@ static err_t bli_dtrsm_small_XAltB cntx_t* cntx, cntl_t* cntl ); -<<<<<<< HEAD -======= - void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b); ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B /* * The bli_trsm_small implements unpacked version of TRSM * Currently only column-major is supported, A & B are column-major @@ -543,7 +407,6 @@ err_t bli_trsm_small return BLIS_NOT_YET_IMPLEMENTED; #endif -<<<<<<< HEAD dim_t m = bli_obj_length(b); dim_t n = bli_obj_width(b); @@ -551,8 +414,6 @@ err_t bli_trsm_small return BLIS_SUCCESS; -======= ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B // If alpha is zero, B matrix will become zero after scaling & hence solution is also zero matrix if (bli_obj_equals(alpha, &BLIS_ZERO)) { @@ -561,13 +422,8 @@ err_t bli_trsm_small // We have to call matrix scaling if alpha != 1.0 // if row major format return. Check this again. -<<<<<<< HEAD if ((bli_obj_row_stride(a) != 1) || (bli_obj_row_stride(b) != 1)) -======= - if ((bli_obj_row_stride(*a) != 1) || - (bli_obj_row_stride(*b) != 1)) ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B { return BLIS_INVALID_ROW_STRIDE; } @@ -577,7 +433,6 @@ err_t bli_trsm_small // only float and double datatypes are supported as of now. if (dt != BLIS_DOUBLE && dt != BLIS_FLOAT) { -<<<<<<< HEAD return BLIS_EXPECTED_REAL_DATATYPE; } @@ -585,22 +440,12 @@ err_t bli_trsm_small if (!bli_obj_is_upper_or_lower (a)) { return BLIS_EXPECTED_TRIANGULAR_OBJECT; -======= - return BLIS_EXPECTED_REAL_DATATYPE; - } - - // A is expected to be triangular in trsm - if (!bli_obj_is_upper_or_lower (*a)) - { - return BLIS_EXPECTED_TRIANGULAR_OBJECT; ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B } // can use other control structs - even can use array of function pointers, // indexed by a number with bits formed by f('side', 'uplo', 'transa', dt). // In the below implementation, based on the number of finally implemented // cases, can move the checks with more cases higher up. -<<<<<<< HEAD if(side == BLIS_LEFT) { @@ -664,45 +509,10 @@ err_t bli_trsm_small } -======= - if (side == BLIS_LEFT) - { - if (bli_obj_has_trans(*a)) - { - if (dt == BLIS_DOUBLE) - { - if (bli_obj_is_upper(*a)) - { - //A.'X = B; A is upper triangular; A has to be transposed; double precision -#if 0 // planning to implement this in this iteration - return bli_dtrsm_small_AutXB(side, alpha, a, b, cntx, cntl); -#else - return BLIS_NOT_YET_IMPLEMENTED; -#endif - } - else - { - return BLIS_NOT_YET_IMPLEMENTED; - } - } - else if (dt == BLIS_FLOAT) - { - if (bli_obj_is_upper(*a)) - { - //A.'X = B; A is upper triangular; A has to be transposed; single precision - //return bli_strsm_small_AutXB(side, alpha, a, b, cntx, cntl); - return BLIS_NOT_YET_IMPLEMENTED; - } - else - { - return BLIS_NOT_YET_IMPLEMENTED; - } ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B } } else { -<<<<<<< HEAD if(bli_obj_has_trans(a)) { if(dt == BLIS_DOUBLE) @@ -25289,5214 +25099,3 @@ static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, ///////////////////loop ends ///////////////////// } #endif -======= - if (dt == BLIS_DOUBLE) - { - if (bli_obj_is_upper(*a)) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - else - { - //AX = B; A is lower triangular; No transpose; double precision - //return bli_dtrsm_small_AlXB(side, alpha, a, b, cntx, cntl); - return BLIS_NOT_YET_IMPLEMENTED; - } - } - else if (dt == BLIS_FLOAT) - { - if (bli_obj_is_upper(*a)) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - else - { - //AX = B; A is lower triangular; No transpose; single precision - return bli_strsm_small_AlXB(side, alpha, a, b, cntx, cntl); - } - } - } - } - else - { - if (bli_obj_has_trans(*a)) - { - if (dt == BLIS_DOUBLE) - { - if (bli_obj_is_upper(*a)) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - else - { - //XA.' = B; A is lower triangular; A has to be transposed; double precision -#if 0 // planning to implement this in this iteration - return bli_dtrsm_small_XAltB(side, alpha, a, b, cntx, cntl); -#else - return BLIS_NOT_YET_IMPLEMENTED; -#endif - } - } - else if (dt == BLIS_FLOAT) - { - if (bli_obj_is_upper(*a)) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - else - { - //XA.' = B; A is lower triangular; A has to be transposed; single precision - return bli_strsm_small_XAltB(side, alpha, a, b, cntx, cntl); - } - } - } - else - { - return BLIS_NOT_YET_IMPLEMENTED; - } - } - - return BLIS_NOT_YET_IMPLEMENTED; -}; - - - - -/* - * AX = alpha*B, Double precision, A: lower triangular - */ -static err_t bli_dtrsm_small_AlXB ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ) -{ - - int M = bli_obj_length(*b); // number of rows of matrix B - int N = bli_obj_width(*b); // number of columns of matrix B - - int lda = bli_obj_col_stride(*a); // column stride of A - int ldb = bli_obj_col_stride(*b); // column stride of B - - int i; - int j; - int k; - - double *A = a->buffer; - double *B = b->buffer; - - // Need to incorporate alpha - - #if 0 - - for (k = 0; k < M; k++) - { - double lkk_inv = 1.0/A[k+k*lda]; - - for (j = 0; j < N; j++) - { - B[k + j*ldb] *= lkk_inv; - } - for (i = k+1; i < M; i++) - { - for (j = 0; j < N; j++) - { - B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; - } - } - }// k -loop - #else - for (k = 0; k < M; k++) - { - double lkk_inv = 1.0/A[k+k*lda]; - - for (j = 0; j < N; j++) - { - B[k + j*ldb] *= lkk_inv; - - // for (j = 0; j < N; j++) - for (i = k+1; i < M; i++) - { - B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; - } - } - }// k -loop - - #endif - - return BLIS_SUCCESS; -}// end of function - - - -static void trsm_small_AlXB ( - float *A, - float *B, - int M, - int N, - int lda, - int ldb - ) -{ - int i; - int j; - int k; - - // Need to incorporate alpha - - for (k = 0; k < M; k++) - { - float lkk_inv = 1.0/A[k+k*lda]; - - for (j = 0; j < N; j++) - { - B[k + j*ldb] *= lkk_inv; - - for (i = k+1; i < M; i++) - { - B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; - } - } - }// k -loop - -}// end of function - - -// Test code: -void gemm_small( float *ptr_l, - float *ptr_b, - int blk_m, - int blk_n, - float *ptr_gemmOut, - int cs_l, - int cs_b, - int rs_l, - int rs_b, - float alpha, - float beta) -{ - int i, j, k; - - for (i = 0; i < blk_m; i++) - { - for (j = 0; j < blk_n; j++) - { - float t = 0.0; - for (k = 0; k < blk_m; k++) - { - t += (ptr_l[i*rs_l + k* cs_l] * ptr_b[k*rs_b + j*cs_b]); - } - ptr_gemmOut[i*rs_b + j*cs_b] = beta * ptr_gemmOut[i*rs_b + j*cs_b] + alpha * t; - } - } -} - - - - -/* - * AX = Alpha*B, Single precision, A: lower triangular - */ -static err_t bli_strsm_small_AlXB ( - side_t side, - obj_t* AlphaObj, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ) -{ - obj_t alpha, beta; // gemm parameters - obj_t Ga, Gb, Gc; // for GEMM - int m = bli_obj_length(*b); // number of rows of matrix B - int n = bli_obj_width(*b); // number of columns of matrix B - - int lda = bli_obj_col_stride(*a); // column stride of A - int ldb = bli_obj_col_stride(*b); // column stride of B - - int rsa = bli_obj_row_stride(*a); // row stride of A - int rsb = bli_obj_row_stride(*b); // row stride of B - - int i = 0; - int j; - int blk_size = 8; - int isUnitDiag = bli_obj_has_unit_diag(*a); - - float alphaVal; - float *L = a->buffer; - float *B = b->buffer; - - if (m != 16 || (n%8) != 0) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - if ( (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - - alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, *AlphaObj)); - - /* Small _GEMM preparation code */ - bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &alpha ); - bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &beta ); - - /* B = B - A*B */ - bli_setsc( -(1.0), 0.0, &alpha ); - bli_setsc( (1.0), 0.0, &beta ); - - - bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, blk_size, a->buffer, rsa, lda, &Ga); - bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gb); - bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gc); - - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Ga ); - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Gb ); - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Gc ); - - //first block of trsm - Gb.buffer = (void*)(B + i); - - //trsm of first 8xn block - if (alphaVal != 1) - { - if (isUnitDiag == 0) - { - blis_strsm_microkernel_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); - fp_blis_strsm_microkernel = blis_strsm_microkernel; - } - else - { - blis_strsm_microkernel_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); - fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; - } - bli_setsc( alphaVal, 0.0, &beta ); - } - else - { - if (isUnitDiag == 0) - { - blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); - fp_blis_strsm_microkernel = blis_strsm_microkernel; - } - else - { - blis_strsm_microkernel_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); - fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; - } - } - - //gemm update - for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT - { - Ga.buffer = (void*)(L + j + i*lda); - Gc.buffer = (void*)(B + j); - - bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb - } - - //trsm of remaining blocks - for (i = blk_size; i < m; i += blk_size) - { - Gb.buffer = (void*)(B + i); - - fp_blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); - - for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT - { - Ga.buffer = (void*)(L + j + i*lda); - Gc.buffer = (void*)(B + j); - - bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb - } - - } // End of for loop - i - - return BLIS_SUCCESS; -} - -void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) -{ - int i, j, k, l; - float inv_l; - - inv_l = 1.0 / *ptr_l; - - for (j = 0; j < numCols_b; j += blk_width) - { - for (l = j; l < (j+blk_width); l++) - { - ptr_b[l*cs_b] = ptr_b[l*cs_b] * inv_l; - } - - for (i = 1; i < blk_height; i++) - { - for (l = j; l < (j+blk_width); l++) - { - for (k = 0; k < i; k++) - { - ptr_b[i*rs_b + l*cs_b] -= (ptr_b[k*rs_b + l*cs_b] * ptr_l[i*rs_l + k*cs_l]); - } - ptr_b[i*rs_b + l*cs_b] = ptr_b[i*rs_b + l*cs_b] / ptr_l[i*rs_l + i*cs_l]; - } - } - } -} - -/* - * XA' = Alpha*B, Single precision, A: lower triangular - */ -static err_t bli_strsm_small_XAltB( - side_t side, - obj_t* AlphaObj, - obj_t* a, - obj_t* b, - cntx_t* cntx, - cntl_t* cntl - ) -{ - int m = bli_obj_length(*a); // number of rows of matrix B - int n = bli_obj_length(*b); // number of columns of matrix B - - int lda = bli_obj_col_stride(*a); // column stride of A - int ldb = bli_obj_col_stride(*b); // column stride of B - - int rsa = bli_obj_row_stride(*a); // row stride of A - int rsb = bli_obj_row_stride(*b); // row stride of B - - int i = 0; - int isUnitDiag = bli_obj_has_unit_diag(*a); - - float alphaVal; - float *L = a->buffer; - float *B = b->buffer; - - if ((m%8) != 0 || (n%8) != 0) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - if ( (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - - alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, *AlphaObj)); - - if (alphaVal != 1) - { - if (isUnitDiag == 0) - { - trsm_XAtB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); - } - else - { - trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); - } - } - else - { - if (isUnitDiag == 0) - { - trsm_XAtB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); - } - else - { - trsm_XAtB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); - } - } - return BLIS_SUCCESS; -} - -static void blis_strsm_microkernel_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal) -{ - float ones = 1.0; - int j; - int cs_b_offset[6]; - //int row2, row4, row6; - float *ptr_b_dup; - - //70 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[8]; - __m256 mat_a_cols[8]; - __m256 mat_a_cols_rearr[36]; - __m256 mat_a_diag_inv[8]; - __m256 reciprocal_diags; - __m256 alphaReg; - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); - alphaReg = _mm256_broadcast_ss((float const *)&alphaVal); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); - //row2 = (cs_l << 1); - //row4 = (cs_l << 2); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); - //row6 = row2 + row4; - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - - //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L - /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ - - //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers - //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. - //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); - //1st col - mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); - mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); - mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); - mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); - mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); - mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); - mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); - mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); - //2nd col - ptr_l += cs_l; - mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //3rd col - ptr_l += cs_l; - mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //4rth col - ptr_l += cs_l; - mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //5th col - ptr_l += cs_l; - mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //6th col - ptr_l += cs_l; - mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - numCols_b -= 8; // blk_width = 8 - - //compute reciprocals of L(i,i) and broadcast in registers - mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); - mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); - mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); - mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); - - //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); - //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); - - //reciprocal of diagnol elements - reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); - - //Start loop for cols of B to be processed in size of blk_width - for (j = 0; j < numCols_b; j += 8) - { - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); - - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Read next set of B columns - ptr_b += (cs_b + cs_b_offset[5]); - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - - //end loop of cols - } - - //Last block trsm processing - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); - - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - - //end loop of cols -} - -static void blis_strsm_microkernel_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal) -{ - //float ones = 1.0; - int j; - int cs_b_offset[6]; - //int row2, row4, row6; - float *ptr_b_dup; - - //70 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[8]; - __m256 mat_a_cols[8]; - __m256 mat_a_cols_rearr[36]; - //__m256 mat_a_diag_inv[8]; - //__m256 reciprocal_diags; - __m256 alphaReg; - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - //reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); - alphaReg = _mm256_broadcast_ss((float const *)&alphaVal); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); - //row2 = (cs_l << 1); - //row4 = (cs_l << 2); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); - //row6 = row2 + row4; - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - - //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L - /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ - - //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers - //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. - //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); - //1st col - mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); - mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); - mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); - mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); - mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); - mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); - mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); - mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); - //2nd col - ptr_l += cs_l; - mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //3rd col - ptr_l += cs_l; - mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //4rth col - ptr_l += cs_l; - mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //5th col - ptr_l += cs_l; - mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //6th col - ptr_l += cs_l; - mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //8th col - //ptr_l += cs_l; - //mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - numCols_b -= 8; // blk_width = 8 - - //compute reciprocals of L(i,i) and broadcast in registers - //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); - - //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); - //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); - //mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); - //mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); - - //reciprocal of diagnol elements - //reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); - - //Start loop for cols of B to be processed in size of blk_width - for (j = 0; j < numCols_b; j += 8) - { - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); - - //extract diag a11 from a - //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Read next set of B columns - ptr_b += (cs_b + cs_b_offset[5]); - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - - //end loop of cols - } - - //Last block trsm processing - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); - - //extract diag a11 from a - //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - - //end loop of cols -} - -static void blis_strsm_microkernel_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) -{ - //float ones = 1.0; - int j; - int cs_b_offset[6]; - //int row2, row4, row6; - float *ptr_b_dup; - - //70 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[8]; - __m256 mat_a_cols[8]; - __m256 mat_a_cols_rearr[36]; - //__m256 mat_a_diag_inv[8]; - //__m256 reciprocal_diags; - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - //reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); - //row2 = (cs_l << 1); - //row4 = (cs_l << 2); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); - //row6 = row2 + row4; - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - - //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L - /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ - - //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers - //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. - //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); - //1st col - mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); - mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); - mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); - mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); - mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); - mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); - mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); - mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); - //2nd col - ptr_l += cs_l; - mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //3rd col - ptr_l += cs_l; - mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //4rth col - ptr_l += cs_l; - mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //5th col - ptr_l += cs_l; - mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //6th col - ptr_l += cs_l; - mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //8th col - //ptr_l += cs_l; - //mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - numCols_b -= 8; // blk_width = 8 - - //compute reciprocals of L(i,i) and broadcast in registers - //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); - - //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); - //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); - //mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); - //mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); - - //reciprocal of diagnol elements - //reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); - - //Start loop for cols of B to be processed in size of blk_width - for (j = 0; j < numCols_b; j += 8) - { - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - //extract diag a11 from a - //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Read next set of B columns - ptr_b += (cs_b + cs_b_offset[5]); - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - //end loop of cols - } - - //Last block trsm processing - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - //extract diag a11 from a - //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - //end loop of cols -} - -static void blis_strsm_microkernel(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) -{ - float ones = 1.0; - int j; - int cs_b_offset[6]; - //int row2, row4, row6; - float *ptr_b_dup; - - //70 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[8]; - __m256 mat_a_cols[8]; - __m256 mat_a_cols_rearr[36]; - __m256 mat_a_diag_inv[8]; - __m256 reciprocal_diags; - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); - //row2 = (cs_l << 1); - //row4 = (cs_l << 2); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); - //row6 = row2 + row4; - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); - - //reciprocal_diags = _mm256_loadu_ps((float const *)ones); - - //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L - /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); - ptr_l += cs_l; - mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ - - //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers - //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. - //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); - //1st col - mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); - mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); - mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); - mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); - mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); - mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); - mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); - mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); - //2nd col - ptr_l += cs_l; - mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //3rd col - ptr_l += cs_l; - mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //4rth col - ptr_l += cs_l; - mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //5th col - ptr_l += cs_l; - mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //6th col - ptr_l += cs_l; - mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - //7th col - ptr_l += cs_l; - mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - numCols_b -= 8; // blk_width = 8 - - //compute reciprocals of L(i,i) and broadcast in registers - mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); - mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); - mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); - mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); - - //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); - //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); - - //reciprocal of diagnol elements - reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); - - //Start loop for cols of B to be processed in size of blk_width - for (j = 0; j < numCols_b; j += 8) - { - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Read next set of B columns - ptr_b += (cs_b + cs_b_offset[5]); - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - //end loop of cols - } - - //Last block trsm processing - ptr_b_dup = ptr_b; - - /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ - - ////unpacklow//// - mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); - mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); - mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); - mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - ////unpackhigh//// - mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); - mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); - mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); - mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); - mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); -#else - mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); - mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); - mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); - mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); - mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); - mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); -#endif - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); - - //Merge rearranged high elements into complete rows - mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); - mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); - mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); - mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); - - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); - - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); - - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); - - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); - - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); - - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); - - //--> Transpose and store results of columns of B block <--// - ////unpacklow//// - mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange low elements -#if REARRANGE_SHFL == 1 - mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); - mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); -#else - mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); - mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); - mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); - mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); - mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); - mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); -#endif - //Merge rearranged low elements into complete rows - mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); - mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); - mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); - mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); - - ////unpackhigh//// - mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); - mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); - mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); - mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); - - //Rearrange high elements -#if REARRANGE_SHFL == 1 - mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); - mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); -#else - mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); - mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); - mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); - mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); - mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); - mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); -#endif - - //Merge rearranged high elements into complete rows - mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); - mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); - mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); - mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); - //end loop of cols -} - -///////////////////////////////////// XA'=B functions //////////////////////////////// - -static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) -{ - float ones = 1.0; - int i, i1, i2, i3, i4, j, k, l; - int cs_b_offset[7]; - int cs_l_offset[7]; - float *ptr_b_dup; - - //57 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[16][8]; - __m256 mat_a_cols_rearr[8]; - __m256 mat_a_blk_elems[64]; - __m256 mat_a_diag_inv[8]; - __m256 reciprocal_diags[2]; - - reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //L matrix offsets - cs_l_offset[0] = (cs_l << 1); - cs_l_offset[1] = cs_l + cs_l_offset[0]; - cs_l_offset[2] = (cs_l << 2); - cs_l_offset[3] = cs_l + cs_l_offset[2]; - cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; - cs_l_offset[5] = cs_l + cs_l_offset[4]; - cs_l_offset[6] = (cs_l_offset[5] + cs_l); - - //read diag elems of L 16x16 block - mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l); - mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); - mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); - mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); - mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); - mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); - mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); - mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - cs_b_offset[6] = (cs_b_offset[5] + cs_b); - - reciprocal_diags[1] = reciprocal_diags[0]; - - //pack first 8 diags together - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 - mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 - - //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 - reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); - - - /***************** first set of 8 rows of B processing starts *****************/ - ptr_b_dup = ptr_b; - i = 0; - for (j = 0; j < numCols_b; j += 8) - { - /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A - //read 8x8 block of B into registers - mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]); - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); - - //i += cs_b_offset[6]; - //ptr_b_dup += cs_b_offset[6]; - i += 8; - ptr_b_dup += 8; - } - - //c = 0; - /***************** first set of 8 cols of B processing done *****************/ - ptr_b_dup = ptr_b; - i3 = 0; - i1 = 0; - //Start loop for cols of B to be processed in size of blk_width - for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row - { - ptr_l += 8; - //ptr_b += j; - //ptr_b_dup += 8; - ptr_b_dup += cs_b_offset[6]; - i1 += cs_b_offset[6]; - - //Read next 8x8 block of A to get diag elements - i3 += cs_l_offset[6]; - mat_a_cols_rearr[8] = _mm256_loadu_ps((float const *)ptr_l + i3); - mat_a_cols_rearr[9] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); - mat_a_cols_rearr[10] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); - mat_a_cols_rearr[11] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); - mat_a_cols_rearr[12] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); - mat_a_cols_rearr[13] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); - mat_a_cols_rearr[14] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); - mat_a_cols_rearr[15] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); - - //pack 8 diags of A together - reciprocal_diags[0] = reciprocal_diags[1]; - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[8], mat_a_cols_rearr[9], 0xAA);//diag 0,1 - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[10], mat_a_cols_rearr[11], 0xAA);//diag 2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[12], mat_a_cols_rearr[13], 0xAA);//diag 4,5 - mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[15], 0xAA);//diag 6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 - - //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 - reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); - - i = 0; - i2 = 0; - for (k = 0; k < numCols_b; k += 8) - { - i = i1 + k; - //Read 8 cols of B columns of Block-to-be-solved - mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - i2++; - } - - i = 0; - i2 = 0; - for (l = 0; l < j; l += 8) // move across m - { - //Broadcast A8,0 to A15,0 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); - - //Broadcast A8,2 to A15,2 to registers - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); - - //Broadcast A8,3 to A15,3 to registers - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); - mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); - mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); - mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); - mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); - - // _mm256_permute2f128_ps() - - //Broadcast A8,4 to A15,4 to registers - mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); - mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); - mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); - mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); - mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); - mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); - mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); - mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); - - //Broadcast A8,5 to A15,5 to registers - mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); - mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); - mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); - mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); - mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); - mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); - mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); - mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); - - //Broadcast A8,6 to A15,6 to registers - mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); - mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); - mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); - mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); - mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); - mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); - mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); - mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); - - //Broadcast A8,7 to A15,7 to registers - mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); - mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); - mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); - mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); - mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); - mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); - mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); - mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); - - i += cs_l_offset[6]; - - - for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) - { - /////////////////// Partial Lower 8x8 block trsm of B - - i4 = i2 + k; - //Read current 8 cols of B columns from specified 8x8 current-block of B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); - - i4 = k >> 3; - - //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) - - //end loop of cols - } - i2 += cs_b_offset[6]; - } - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); - - k = 0; - for (i = 0; i < numCols_b; i+=8) - { - /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]); - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - - _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); - //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); - k++; - } - - - } - ///////////////////loop ends ///////////////////// -} - -static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) -{ - float ones = 1.0; - int i, i1, i2, i3, i4, j, k, l; - int cs_b_offset[7]; - int cs_l_offset[7]; - float *ptr_b_dup; - - //57 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[16][8]; - __m256 mat_a_cols_rearr[8]; - __m256 mat_a_blk_elems[64]; - __m256 mat_a_diag_inv[8]; - __m256 reciprocal_diags[2]; - __m256 alphaReg; - - reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); - alphaReg = _mm256_broadcast_ss((float const *)&alpha); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //L matrix offsets - cs_l_offset[0] = (cs_l << 1); - cs_l_offset[1] = cs_l + cs_l_offset[0]; - cs_l_offset[2] = (cs_l << 2); - cs_l_offset[3] = cs_l + cs_l_offset[2]; - cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; - cs_l_offset[5] = cs_l + cs_l_offset[4]; - cs_l_offset[6] = (cs_l_offset[5] + cs_l); - - //read diag elems of L 16x16 block - mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l); - mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); - mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); - mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); - mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); - mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); - mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); - mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - cs_b_offset[6] = (cs_b_offset[5] + cs_b); - - reciprocal_diags[1] = reciprocal_diags[0]; - - //pack first 8 diags together - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 - mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 - - //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 - reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); - - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); - - - /***************** first set of 8 rows of B processing starts *****************/ - ptr_b_dup = ptr_b; - i = 0; - for (j = 0; j < numCols_b; j += 8) - { - /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A - //read 8x8 block of B into registers - mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg); - mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg); - mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg); - mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg); - mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg); - mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg); - mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg); - mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg); - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]); - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); - - //i += cs_b_offset[6]; - //ptr_b_dup += cs_b_offset[6]; - i += 8; - ptr_b_dup += 8; - } - - //c = 0; - /***************** first set of 8 cols of B processing done *****************/ - ptr_b_dup = ptr_b; - i3 = 0; - i1 = 0; - //Start loop for cols of B to be processed in size of blk_width - for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row - { - ptr_l += 8; - //ptr_b += j; - //ptr_b_dup += 8; - ptr_b_dup += cs_b_offset[6]; - i1 += cs_b_offset[6]; - - //Read next 8x8 block of A to get diag elements - i3 += cs_l_offset[6]; - mat_a_cols_rearr[8] = _mm256_loadu_ps((float const *)ptr_l + i3); - mat_a_cols_rearr[9] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); - mat_a_cols_rearr[10] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); - mat_a_cols_rearr[11] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); - mat_a_cols_rearr[12] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); - mat_a_cols_rearr[13] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); - mat_a_cols_rearr[14] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); - mat_a_cols_rearr[15] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); - - //pack 8 diags of A together - reciprocal_diags[0] = reciprocal_diags[1]; - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[8], mat_a_cols_rearr[9], 0xAA);//diag 0,1 - mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[10], mat_a_cols_rearr[11], 0xAA);//diag 2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[12], mat_a_cols_rearr[13], 0xAA);//diag 4,5 - mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[15], 0xAA);//diag 6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 - mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 - mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 - - //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 - reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); - - i = 0; - i2 = 0; - for (k = 0; k < numCols_b; k += 8) - { - i = i1 + k; - //Read 8 cols of B columns of Block-to-be-solved - mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg); - mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg); - mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg); - mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg); - mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg); - mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg); - mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg); - mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg); - - i2++; - } - - i = 0; - i2 = 0; - for (l = 0; l < j; l += 8) // move across m - { - //Broadcast A8,0 to A15,0 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); - - //Broadcast A8,2 to A15,2 to registers - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); - - //Broadcast A8,3 to A15,3 to registers - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); - mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); - mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); - mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); - mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); - - // _mm256_permute2f128_ps() - - //Broadcast A8,4 to A15,4 to registers - mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); - mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); - mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); - mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); - mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); - mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); - mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); - mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); - - //Broadcast A8,5 to A15,5 to registers - mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); - mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); - mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); - mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); - mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); - mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); - mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); - mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); - - //Broadcast A8,6 to A15,6 to registers - mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); - mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); - mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); - mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); - mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); - mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); - mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); - mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); - - //Broadcast A8,7 to A15,7 to registers - mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); - mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); - mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); - mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); - mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); - mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); - mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); - mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); - - i += cs_l_offset[6]; - - - for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) - { - /////////////////// Partial Lower 8x8 block trsm of B - - i4 = i2 + k; - //Read current 8 cols of B columns from specified 8x8 current-block of B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); - - i4 = k >> 3; - - //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) - - //end loop of cols - } - i2 += cs_b_offset[6]; - } - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a00 from a - mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); - //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a11 from a - mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); - //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a22 from a - mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); - //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a33 from a - mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); - //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a44 from a - mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); - mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); - //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - //extract diag a55 from a - mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); - mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); - //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - //extract diag a66 from a - mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); - mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); - //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); - - //extract diag a77 from a - mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); - mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); - //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); - - k = 0; - for (i = 0; i < numCols_b; i+=8) - { - /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A - - //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B - mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]); - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B - mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]); - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B - mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]); - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B - mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]); - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B - mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]); - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B - mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]); - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B - mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]); - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) - - //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B - mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]); - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - - _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); - //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); - k++; - } - - - } - ///////////////////loop ends ///////////////////// -} - -static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) -{ - //float ones = 1.0; - int i, i1, i2, i3, i4, j, k, l; - int cs_b_offset[7]; - int cs_l_offset[7]; - float *ptr_b_dup; - - //57 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[16][8]; - //__m256 mat_a_cols_rearr[8]; - __m256 mat_a_blk_elems[64]; - //__m256 mat_a_diag_inv[8]; - //__m256 reciprocal_diags[2]; - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //L matrix offsets - cs_l_offset[0] = (cs_l << 1); - cs_l_offset[1] = cs_l + cs_l_offset[0]; - cs_l_offset[2] = (cs_l << 2); - cs_l_offset[3] = cs_l + cs_l_offset[2]; - cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; - cs_l_offset[5] = cs_l + cs_l_offset[4]; - cs_l_offset[6] = (cs_l_offset[5] + cs_l); - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - cs_b_offset[6] = (cs_b_offset[5] + cs_b); - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); - - - /***************** first set of 8 rows of B processing starts *****************/ - ptr_b_dup = ptr_b; - i = 0; - for (j = 0; j < numCols_b; j += 8) - { - /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A - //read 8x8 block of B into registers - mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - //(Row0) - mat_b_col[0] = mat_b_rearr[0][0]; - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); - - //i += cs_b_offset[6]; - //ptr_b_dup += cs_b_offset[6]; - i += 8; - ptr_b_dup += 8; - } - - //c = 0; - /***************** first set of 8 cols of B processing done *****************/ - ptr_b_dup = ptr_b; - i3 = 0; - i1 = 0; - //Start loop for cols of B to be processed in size of blk_width - for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row - { - ptr_l += 8; - //ptr_b += j; - //ptr_b_dup += 8; - ptr_b_dup += cs_b_offset[6]; - i1 += cs_b_offset[6]; - i3 += cs_l_offset[6]; - - i = 0; - i2 = 0; - for (k = 0; k < numCols_b; k += 8) - { - i = i1 + k; - //Read 8 cols of B columns of Block-to-be-solved - mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - i2++; - } - - i = 0; - i2 = 0; - for (l = 0; l < j; l += 8) // move across m - { - //Broadcast A8,0 to A15,0 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); - - //Broadcast A8,2 to A15,2 to registers - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); - - //Broadcast A8,3 to A15,3 to registers - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); - mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); - mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); - mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); - mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); - - // _mm256_permute2f128_ps() - - //Broadcast A8,4 to A15,4 to registers - mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); - mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); - mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); - mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); - mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); - mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); - mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); - mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); - - //Broadcast A8,5 to A15,5 to registers - mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); - mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); - mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); - mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); - mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); - mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); - mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); - mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); - - //Broadcast A8,6 to A15,6 to registers - mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); - mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); - mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); - mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); - mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); - mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); - mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); - mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); - - //Broadcast A8,7 to A15,7 to registers - mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); - mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); - mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); - mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); - mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); - mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); - mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); - mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); - - i += cs_l_offset[6]; - - for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) - { - /////////////////// Partial Lower 8x8 block trsm of B - - i4 = i2 + k; - //Read current 8 cols of B columns from specified 8x8 current-block of B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); - - i4 = k >> 3; - - //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) - - //end loop of cols - } - i2 += cs_b_offset[6]; - } - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - k = 0; - for (i = 0; i < numCols_b; i+=8) - { - /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A - - //(Row0): already done - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - - _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); - //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); - k++; - } - - - } - ///////////////////loop ends ///////////////////// -} - -static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) -{ - //float ones = 1.0; - int i, i1, i2, i3, i4, j, k, l; - int cs_b_offset[7]; - int cs_l_offset[7]; - float *ptr_b_dup; - - //57 number of ymm(256 bits) registers used - __m256 mat_b_col[8]; - __m256 mat_b_rearr[16][8]; - //__m256 mat_a_cols_rearr[8]; - __m256 mat_a_blk_elems[64]; - //__m256 mat_a_diag_inv[8]; - //__m256 reciprocal_diags[2]; - __m256 alphaReg; - alphaReg = _mm256_broadcast_ss((float const *)&alpha); - - // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // - - //L matrix offsets - cs_l_offset[0] = (cs_l << 1); - cs_l_offset[1] = cs_l + cs_l_offset[0]; - cs_l_offset[2] = (cs_l << 2); - cs_l_offset[3] = cs_l + cs_l_offset[2]; - cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; - cs_l_offset[5] = cs_l + cs_l_offset[4]; - cs_l_offset[6] = (cs_l_offset[5] + cs_l); - - cs_b_offset[0] = (cs_b << 1); - cs_b_offset[1] = cs_b + cs_b_offset[0]; - cs_b_offset[2] = (cs_b << 2); - cs_b_offset[3] = cs_b + cs_b_offset[2]; - cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; - cs_b_offset[5] = cs_b + cs_b_offset[4]; - cs_b_offset[6] = (cs_b_offset[5] + cs_b); - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); - - - /***************** first set of 8 rows of B processing starts *****************/ - ptr_b_dup = ptr_b; - i = 0; - for (j = 0; j < numCols_b; j += 8) - { - /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A - //read 8x8 block of B into registers - mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg); - mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg); - mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg); - mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg); - mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg); - mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg); - mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg); - mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg); - - //(Row0) - mat_b_col[0] = mat_b_rearr[0][0]; - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) - mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) - mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) - mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) - mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) - mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) - mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); - - //i += cs_b_offset[6]; - //ptr_b_dup += cs_b_offset[6]; - i += 8; - ptr_b_dup += 8; - } - - //c = 0; - /***************** first set of 8 cols of B processing done *****************/ - ptr_b_dup = ptr_b; - i3 = 0; - i1 = 0; - //Start loop for cols of B to be processed in size of blk_width - for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row - { - ptr_l += 8; - //ptr_b += j; - //ptr_b_dup += 8; - ptr_b_dup += cs_b_offset[6]; - i1 += cs_b_offset[6]; - i3 += cs_l_offset[6]; - - i = 0; - i2 = 0; - for (k = 0; k < numCols_b; k += 8) - { - i = i1 + k; - //Read 8 cols of B columns of Block-to-be-solved - mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); - mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); - mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); - mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); - mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); - mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); - mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); - mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); - - mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg); - mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg); - mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg); - mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg); - mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg); - mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg); - mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg); - mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg); - - i2++; - } - - i = 0; - i2 = 0; - for (l = 0; l < j; l += 8) // move across m - { - //Broadcast A8,0 to A15,0 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); - - //Broadcast A8,2 to A15,2 to registers - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); - - //Broadcast A8,3 to A15,3 to registers - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); - mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); - mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); - mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); - mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); - - // _mm256_permute2f128_ps() - - //Broadcast A8,4 to A15,4 to registers - mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); - mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); - mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); - mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); - mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); - mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); - mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); - mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); - - //Broadcast A8,5 to A15,5 to registers - mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); - mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); - mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); - mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); - mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); - mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); - mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); - mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); - - //Broadcast A8,6 to A15,6 to registers - mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); - mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); - mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); - mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); - mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); - mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); - mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); - mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); - - //Broadcast A8,7 to A15,7 to registers - mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); - mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); - mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); - mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); - mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); - mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); - mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); - mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); - - i += cs_l_offset[6]; - - for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) - { - /////////////////// Partial Lower 8x8 block trsm of B - - i4 = i2 + k; - //Read current 8 cols of B columns from specified 8x8 current-block of B - mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); - mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); - mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); - mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); - mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); - mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); - mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); - mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); - - i4 = k >> 3; - - //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) - - //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) - mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) - mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) - mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) - mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) - mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) - mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) - mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) - - //end loop of cols - } - i2 += cs_b_offset[6]; - } - - //Broadcast A10 to A70 to registers - mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); - mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A21 to A71 to registers - mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); - mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A32 to A72 to registers - mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); - mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A43 to A73 to registers - mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); - mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A54 to A74 to registers - mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); - mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A65 to A75 to registers - mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); - mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - i += cs_l; - - //Broadcast A76 to register - mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); - - k = 0; - for (i = 0; i < numCols_b; i+=8) - { - /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A - - //(Row0): already done - - //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) - mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) - mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) - mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) - mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) - mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) - mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) - - //(Row7): FMA operations of b7 with elements of index (7, 0) - mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) - - //////////////////////////////////////////////////////////////////////////////// - - //Store the computed B columns - - _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); - _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); - _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); - //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); - k++; - } - - - } - ///////////////////loop ends ///////////////////// -} - - -#endif - ->>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B