diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 41de6fd4e..84757ecae 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -111,15 +112,42 @@ void bli_cntx_init_zen( cntx_t* cntx ) // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + +/* + Multi-instance performance improvement of DGEMM when bound to a CCX. + In multi-instance mode, each thread runs a sequential DGEMM. + + a) If BLIS is run in multi-instance mode with + CPU freq 2.6/2.2 GHz and + DDR4 clock frequency 2400 MHz, + mc = 240, kc = 512, and nc = 2040 + give better performance on an EPYC server than the default block sizes. + + b) If BLIS is run in single-instance mode, use + mc = 510, kc = 1024, and nc = 4080. + +*/ + #ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES // Zen optimized level 3 cache block sizes - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); + #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES + + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + + #else + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); + #endif #else - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); #endif - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index c872a21eb..d69f5270a 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2016, Advanced Micro Devices, Inc. + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -43,14 +43,21 @@ #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES -//#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX_TRSM + // This selects the threshold below which the small matrix code will be called.
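+ // BLIS_SMALL_MATRIX_THRES is applied as m*n < THRES*THRES in the
+ // small-matrix SYRK path (see bli_syrk_small.c below), and it also
+ // sizes the static packing buffers declared there.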
#define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 +#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128*(128+128) => m*(m+n) +#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 - +// This macro enables BLIS DGEMM to choose block sizes for single-instance mode. +#define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 //#endif diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 40b07661b..a485af27b 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -64,13 +64,13 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) # gcc 6.0 (clang 4.0) or later: -#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 +CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 # gcc 4.9 (clang 3.5) or later: # possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt -CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else $(error gcc or clang are required for this configuration.) endif diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 534848e33..cc2163fae 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -46,7 +47,9 @@ void bli_syrk_front ) { bli_init_once(); - +#ifdef BLIS_ENABLE_SMALL_MATRIX + gint_t status = BLIS_FAILURE; +#endif obj_t a_local; obj_t at_local; obj_t c_local; @@ -68,6 +71,29 @@ void bli_syrk_front bli_obj_set_as_root( &c_local ); // For syrk, the right-hand "B" operand is simply A^T. +#ifdef BLIS_ENABLE_SMALL_MATRIX + bli_obj_alias_to( a, &at_local ); + if (bli_obj_has_trans(a) != 0) + {//At*A operation + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &at_local ); + //Call small syrk; the small-matrix threshold check is done inside bli_syrk_small(). + status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl ); + } + else if ((a->dim[0] <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] < BLIS_SMALL_MATRIX_A_THRES_N_SYRK) || + (a->dim[0] < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && a->dim[1] <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK)) + {//A*At operation + bli_obj_set_conjtrans( BLIS_TRANSPOSE, &at_local ); + //Call small syrk; an explicit matrix dimension threshold check is done in this else-if branch before calling bli_syrk_small().
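+ //The mixed <=/< comparisons admit A dimensions up to 96x128 but exclude
+ //the case where both dimensions sit exactly at their thresholds.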
+ status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, cntx, cntl ); + } + if ( status == BLIS_SUCCESS ) + { + return; + } +#endif + bli_obj_alias_to( a, &at_local ); bli_obj_induce_trans( &at_local ); diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h index 28d1e13f6..98b1e1251 100644 --- a/frame/3/syrk/bli_syrk_front.h +++ b/frame/3/syrk/bli_syrk_front.h @@ -42,3 +42,14 @@ void bli_syrk_front rntm_t* rntm, cntl_t* cntl ); + +err_t bli_syrk_small + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); \ No newline at end of file diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 5093d1a4a..303570179 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,6 +35,8 @@ #include "blis.h" +//#define PRINT_SMALL_TRSM_INFO + void bli_trsm_front ( side_t side, @@ -50,6 +53,47 @@ void bli_trsm_front obj_t a_local; obj_t b_local; obj_t c_local; + + +#ifdef PRINT_SMALL_TRSM_INFO + printf("Side:: %c\n", side ? 'R' : 'L'); + if (bli_obj_datatype(*a) == BLIS_FLOAT) + printf("Alpha:: %9.2e\n", *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, *alpha))); + else if (bli_obj_datatype(*a) == BLIS_DOUBLE) + printf("Alpha is double:: %9.2e\n", *((double *)bli_obj_buffer_for_const(BLIS_DOUBLE, *alpha))); + else + printf("Unsupported datatype for Alpha\n"); + + printf("A:: M = %d, N = %d, elem_size = %d, row_off = %ld, col_off = %ld, rs = %d, cs = %d, trans = %c, TRIANG = %c, unit diag = %c\n", a->dim[0], a->dim[1], bli_obj_elem_size(*a ), bli_obj_row_off(*a), bli_obj_col_off(*a), a->rs, a->cs, bli_obj_has_trans(*a) ? 'Y' : 'N', bli_obj_is_upper(*a) ? 'U' : bli_obj_is_lower(*a) ? 'L' : 'N', bli_obj_has_unit_diag(*a) ? 'Y' : 'N'); +#ifdef PRINT_SMALL_TRSM + //bli_printm("a", a, "%4.1f", ""); +#endif + printf("B:: M = %d, N = %d, elem_size = %d, row_off = %ld, col_off = %ld, rs = %d, cs = %d, trans = %c\n", b->dim[0], b->dim[1], bli_obj_elem_size(*b ), bli_obj_row_off(*b), bli_obj_col_off(*b), b->rs, b->cs, bli_obj_has_trans(*b) ? 'Y' : 'N'); +#ifdef PRINT_SMALL_TRSM + //bli_printm("b", b, "%4.1f", ""); +#endif + fflush(stdout); +#endif +#if 0 for (i = 0; i < m; i++) //no. of cols of B { for (j = 0; j < n; j++) //no. of rows of B { B[i*n + j] = 1001 + j + (i*n); } } for (i = 0; i < m; i++) //no. of cols of B { for (j = i; j < m; j++) //no. of rows of B { L[i*m + j] = 2001 + j + (i*m); } } #endif +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM + gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl ); + if ( status == BLIS_SUCCESS ) return; +#endif // Check parameters. if ( bli_error_checking_is_enabled() ) diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 1a08b7c75..cd65e4454 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -5,6 +5,7 @@ libraries.
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -42,3 +43,13 @@ void bli_trsm_front rntm_t* rntm, cntl_t* cntl ); + +err_t bli_trsm_small + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c index aa1aa0e66..ccf6919cc 100644 --- a/kernels/zen/1/bli_amaxv_zen_int.c +++ b/kernels/zen/1/bli_amaxv_zen_int.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2016, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -219,6 +219,12 @@ void bli_samaxv_zen_int } } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // later, especially if BLIS is compiled with -mfpmath=sse). + _mm256_zeroupper(); + /* Store final index to output variable. */ *i_max = i_max_l; } @@ -370,6 +376,12 @@ void bli_damaxv_zen_int } } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // later, especially if BLIS is compiled with -mfpmath=sse). + _mm256_zeroupper(); + /* Store final index to output variable. */ *i_max = i_max_l; } diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c index 42668a0a7..bd7cec06a 100644 --- a/kernels/zen/1/bli_axpyv_zen_int.c +++ b/kernels/zen/1/bli_axpyv_zen_int.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -136,6 +136,13 @@ void bli_saxpyv_zen_int y0 += n_elem_per_reg * n_iter_unroll; } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + const float alphac = *alpha; // If there are leftover iterations, perform them with scalar code. @@ -233,6 +240,13 @@ void bli_daxpyv_zen_int y0 += n_elem_per_reg * n_iter_unroll; } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + const double alphac = *alpha; // If there are leftover iterations, perform them with scalar code.
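The vzeroupper hunks above and below all follow the same pattern. As a minimal, self-contained sketch (not BLIS code; the function name and loop structure are illustrative assumptions), the idea is: run the 256-bit AVX main loop, issue _mm256_zeroupper(), then fall into a scalar tail that the compiler may lower to SSE under -mfpmath=sse.

#include <immintrin.h>
#include <stddef.h>

// Illustrative sketch only: AVX main loop, vzeroupper, scalar (SSE) tail.
// Compile with -mavx2 -mfma, as in the CKVECFLAGS settings above.
void axpy_sketch( size_t n, float alpha, const float* x, float* y )
{
    size_t i = 0;
    __m256 valpha = _mm256_broadcast_ss( &alpha );

    // Main loop: process 8 single-precision elements per iteration
    // using 256-bit ymm registers.
    for ( ; i + 8 <= n; i += 8 )
    {
        __m256 vx = _mm256_loadu_ps( x + i );
        __m256 vy = _mm256_loadu_ps( y + i );
        vy = _mm256_fmadd_ps( valpha, vx, vy );
        _mm256_storeu_ps( y + i, vy );
    }

    // Clear the upper 128 bits of the ymm registers before any SSE code
    // executes; otherwise the first SSE instruction can incur a false
    // dependency on the stale upper lanes and stall.
    _mm256_zeroupper();

    // Scalar cleanup for the remaining n % 8 elements.
    for ( ; i < n; ++i ) y[i] += alpha * x[i];
}

The placement is the point: the zeroupper goes after the last 256-bit instruction and before the scalar cleanup, which is exactly where each hunk in this patch inserts it.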
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c index d2780d39c..cacbcc6fb 100644 --- a/kernels/zen/1/bli_axpyv_zen_int10.c +++ b/kernels/zen/1/bli_axpyv_zen_int10.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -228,6 +228,13 @@ void bli_saxpyv_zen_int10 y0 += 1*n_elem_per_reg; } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + for ( ; (i + 0) < n; i += 1 ) { *y0 += (*alpha) * (*x0); @@ -427,6 +434,13 @@ void bli_daxpyv_zen_int10 y0 += 1*n_elem_per_reg; } + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + for ( ; i < n; i += 1 ) { *y0 += (*alpha) * (*x0); diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c index 1c87a0f87..32b43bd10 100644 --- a/kernels/zen/1/bli_dotv_zen_int.c +++ b/kernels/zen/1/bli_dotv_zen_int.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -151,6 +151,13 @@ void bli_sdotv_zen_int rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] + rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7]; + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { @@ -265,6 +272,13 @@ void bli_ddotv_zen_int // Accumulate the final rho vector into a single scalar result. rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3]; + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c index 79fdde969..8874303af 100644 --- a/kernels/zen/1/bli_dotv_zen_int10.c +++ b/kernels/zen/1/bli_dotv_zen_int10.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -224,6 +224,13 @@ void bli_sdotv_zen_int10 // Manually add the results from above to finish the sum. rho0 += rhov[0].f[0] + rhov[0].f[4]; rho0 += rhov[1].f[0] + rhov[1].f[4]; + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // later, especially if BLIS is compiled with -mfpmath=sse). + _mm256_zeroupper(); + } else { @@ -407,6 +414,12 @@ void bli_ddotv_zen_int10 // Manually add the results from above to finish the sum. rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3]; rho0 += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3]; + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // later, especially if BLIS is compiled with -mfpmath=sse). + _mm256_zeroupper(); } else { diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c index 53b582b77..d27225934 100644 --- a/kernels/zen/1/bli_dotxv_zen_int.c +++ b/kernels/zen/1/bli_dotxv_zen_int.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017, Advanced Micro Devices, Inc. + Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -157,6 +157,13 @@ void bli_sdotxv_zen_int rho0 = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] + rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7]; + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { @@ -277,6 +284,13 @@ void bli_ddotxv_zen_int // Accumulate the final rho vector into a single scalar result. rho0 = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3]; + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur + // as early as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_syrk_small.c new file mode 100644 index 000000000..d10114b40 --- /dev/null +++ b/kernels/zen/3/bli_syrk_small.c @@ -0,0 +1,4186 @@ +/* + +BLIS +An object-based framework for developing high-performance BLAS-like +libraries. + +Copyright (C) 2018, Advanced Micro Devices, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of The University of Texas at Austin nor the names +of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "xmmintrin.h" +#include "blis.h" + +#ifdef BLIS_ENABLE_SMALL_MATRIX + +#define MR 32 +#define D_MR (MR >> 1) +#define NR 3 + +#define BLIS_ENABLE_PREFETCH +#define F_SCRATCH_DIM (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES) +static float A_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); +static float C_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); +#define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 ) +#define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2) +#define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2) +#define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) +static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); +static double D_C_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); +#define BLIS_ATBN_M_THRES 40 // Threshold value of M at/below which the small matrix code is called. +#define AT_MR 4 // The kernel dimension of the A-transpose SYRK kernel (AT_MR x NR). +static err_t bli_ssyrk_small + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); + +static err_t bli_dsyrk_small + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); + +static err_t bli_ssyrk_small_atbn + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); + +static err_t bli_dsyrk_small_atbn + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); +/* +* The bli_syrk_small function uses the custom MRxNR kernels to perform +* the computation. +* The custom kernels are used when [M * N] is below the small-matrix threshold. +*/ +err_t bli_syrk_small + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING + return BLIS_NOT_YET_IMPLEMENTED; +#endif + // If alpha is zero, fall back to the framework path, which scales C by beta. + if (bli_obj_equals(alpha, &BLIS_ZERO)) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + + // If the matrices are stored in row-major format, return.
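+ // (A unit row stride in a BLIS object implies column-major storage;
+ // these kernels handle only column-major operands.)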
+ if ((bli_obj_row_stride( a ) != 1) || + (bli_obj_row_stride( b ) != 1) || + (bli_obj_row_stride( c ) != 1)) + { + return BLIS_INVALID_ROW_STRIDE; + } + + num_t dt = ((*c).info & (0x7 << 0)); + + if (bli_obj_has_trans( a )) + { + if (bli_obj_has_notrans( b )) + { + if (dt == BLIS_FLOAT) + { + return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + } + else if (dt == BLIS_DOUBLE) + { + return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + } + } + + return BLIS_NOT_YET_IMPLEMENTED; + } + + if (dt == BLIS_DOUBLE) + { + return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl); + } + + if (dt == BLIS_FLOAT) + { + return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl); + } + + return BLIS_NOT_YET_IMPLEMENTED; +}; + + +static err_t bli_ssyrk_small + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) +{ + + int M = bli_obj_length( c ); // number of rows of Matrix C + int N = bli_obj_width( c ); // number of columns of Matrix C + int K = bli_obj_width( a ); // number of columns of OP(A); will be updated if OP(A) is Transpose(A). + int L = M * N; + + if ((((L) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) + || ((M < BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0))) + { + + int lda = bli_obj_col_stride(a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA is enabled. + int ldb = bli_obj_col_stride(b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB is enabled. + int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C + int ldc = M;//bli_obj_col_stride( c ); // column stride of the static buffer for matrix C + int row_idx, col_idx, k; + int rs_matC = bli_obj_row_stride( c ); + int rsc = 1; + float *A = a->buffer; // pointer to elements of Matrix A + float *B = b->buffer; // pointer to elements of Matrix B + float *C = C_pack; // pointer to elements of Matrix C + float *matCbuf = c->buffer; + + float *tA = A, *tB = B, *tC = C;//, *tA_pack; + float *tA_packed; // temporary pointer to hold the packed A memory pointer + int row_idx_packed; //packed A memory row index + int lda_packed; //lda of packed A + int col_idx_start; //starting index after A matrix is packed. + dim_t tb_inc_row = 1; // row stride of matrix B + dim_t tb_inc_col = ldb; // column stride of matrix B + __m256 ymm4, ymm5, ymm6, ymm7; + __m256 ymm8, ymm9, ymm10, ymm11; + __m256 ymm12, ymm13, ymm14, ymm15; + __m256 ymm0, ymm1, ymm2, ymm3; + + int n_remainder; // used when N is not a multiple of 3 (N % 3) + int m_remainder; // used when M is not a multiple of 32 (M % 32) + + float *alpha_cast, *beta_cast; // alpha, beta multiples + alpha_cast = (alpha->buffer); + beta_cast = (beta->buffer); + int required_packing_A = 1; + + // When N is equal to 1, call GEMV instead of SYRK. + if (N == 1) + { + bli_gemv + ( + alpha, + a, + b, + beta, + c + ); + return BLIS_SUCCESS; + } + + //update the pointer math if matrix B needs to be transposed. + if (bli_obj_has_trans( b )) + { + tb_inc_col = 1; //switch row and column strides + tb_inc_row = ldb; + } + + if ((N <= 3) || ((MR * K) > F_SCRATCH_DIM)) + { + required_packing_A = 0; + } + /* + * The computation loop runs over MRxN columns of the C matrix, thus + * accessing MRxK of the A matrix data and KxNR of the B matrix data. + * The computation is organized as inner loops of dimension MRxNR. + */ + // Process MR rows of C matrix at a time.
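+ // MR = 32 single-precision elements: four 8-wide ymm loads per column of
+ // the micro-tile, with NR = 3 columns of C accumulated per pass.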
+ for (row_idx = 0; (row_idx + (MR - 1)) < M; row_idx += MR) + { + + col_idx_start = 0; + tA_packed = A; + row_idx_packed = row_idx; + lda_packed = lda; + + // This is the part of the pack and compute optimization. + // During the first column iteration, we store the accessed A matrix into + // contiguous static memory. This helps to keep the A matrix in cache and + // avoids TLB misses. + if (required_packing_A) + { + col_idx = 0; + + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + tA_packed = A_pack; + +#if 0//def BLIS_ENABLE_PREFETCH + _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); +#endif + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm11 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + // This loop is processing MR x K + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + _mm256_storeu_ps(tA_packed, ymm3); // the packing of matrix A + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + _mm256_storeu_ps(tA_packed + 8, ymm3); // the packing of matrix A + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + _mm256_storeu_ps(tA_packed + 16, ymm3); // the packing of matrix A + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); + + ymm3 = _mm256_loadu_ps(tA + 24); + _mm256_storeu_ps(tA_packed + 24, ymm3); // the packing of matrix A + // ymm7 += ymm0 * ymm3; + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + // ymm11 += ymm1 * ymm3; + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + // ymm15 += ymm2 * ymm3; + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + tA += lda; + tA_packed += MR; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha.
+ ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + ymm10 = _mm256_mul_ps(ymm10, ymm0); + ymm11 = _mm256_mul_ps(ymm11, ymm0); + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + ymm15 = _mm256_mul_ps(ymm15, ymm0); + + // multiply C by beta and accumulate col 1. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + _mm256_storeu_ps(tC + 16, ymm6); + _mm256_storeu_ps(tC + 24, ymm7); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ + _mm256_storeu_ps(tC, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + _mm256_storeu_ps(tC + 16, ymm10); + _mm256_storeu_ps(tC + 24, ymm11); + + // multiply C by beta and accumulate, col 3. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ + _mm256_storeu_ps(tC, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + _mm256_storeu_ps(tC + 24, ymm15); + + // modify the pointer arithmetic to use the packed A matrix. + col_idx_start = NR; + tA_packed = A_pack; + row_idx_packed = 0; + lda_packed = MR; + } + // Process NR columns of C matrix at a time. + for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + +#if 0//def BLIS_ENABLE_PREFETCH + _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); +#endif + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm11 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + // This loop is processing MR x K + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns.
+ ymm3 = _mm256_loadu_ps(tA); + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); + + ymm3 = _mm256_loadu_ps(tA + 24); + // ymm7 += ymm0 * ymm3; + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + // ymm11 += ymm1 * ymm3; + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + // ymm15 += ymm2 * ymm3; + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + tA += lda_packed; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + ymm10 = _mm256_mul_ps(ymm10, ymm0); + ymm11 = _mm256_mul_ps(ymm11, ymm0); + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + ymm15 = _mm256_mul_ps(ymm15, ymm0); + + // multiply C by beta and accumulate col 1. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + _mm256_storeu_ps(tC + 16, ymm6); + _mm256_storeu_ps(tC + 24, ymm7); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ + _mm256_storeu_ps(tC, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + _mm256_storeu_ps(tC + 16, ymm10); + _mm256_storeu_ps(tC + 24, ymm11); + + // multiply C by beta and accumulate, col 3. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ + _mm256_storeu_ps(tC, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + _mm256_storeu_ps(tC + 24, ymm15); + + } + n_remainder = N - col_idx; + + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. 
+ ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm11 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + + ymm3 = _mm256_loadu_ps(tA + 24); + ymm11 = _mm256_fmadd_ps(ymm0, ymm3, ymm11); + ymm15 = _mm256_fmadd_ps(ymm1, ymm3, ymm15); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + ymm10 = _mm256_mul_ps(ymm10, ymm0); + ymm11 = _mm256_mul_ps(ymm11, ymm0); + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + ymm15 = _mm256_mul_ps(ymm15, ymm0); + + // multiply C by beta and accumulate, col 1. + /*ymm2 = _mm256_loadu_ps(tC + 0); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ + _mm256_storeu_ps(tC + 0, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + _mm256_storeu_ps(tC + 16, ymm10); + _mm256_storeu_ps(tC + 24, ymm11); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ + _mm256_storeu_ps(tC, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + _mm256_storeu_ps(tC + 24, ymm15); + + col_idx += 2; + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. 
+ ymm3 = _mm256_loadu_ps(tA); + ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); + + ymm3 = _mm256_loadu_ps(tA + 24); + ymm15 = _mm256_fmadd_ps(ymm0, ymm3, ymm15); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + ymm15 = _mm256_mul_ps(ymm15, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC + 0); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_ps(tC + 24); + ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ + + _mm256_storeu_ps(tC + 0, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + _mm256_storeu_ps(tC + 24, ymm15); + } + } + + m_remainder = M - row_idx; + + if (m_remainder >= 24) + { + m_remainder -= 24; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + ymm10 = _mm256_mul_ps(ymm10, ymm0); + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + + // multiply C by beta and accumulate. 
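+ // (The beta-fmadd below is commented out by design: results are stored
+ // to the C_pack scratch buffer here, and beta is applied against the
+ // user's C when they are copied back to matCbuf at the end.)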
+ /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + _mm256_storeu_ps(tC + 16, ymm6); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10);*/ + _mm256_storeu_ps(tC, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + _mm256_storeu_ps(tC + 16, ymm10); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ + _mm256_storeu_ps(tC, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + ymm10 = _mm256_mul_ps(ymm10, ymm0); + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC + 0); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10);*/ + _mm256_storeu_ps(tC + 0, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + _mm256_storeu_ps(tC + 16, ymm10); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ + _mm256_storeu_ps(tC, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + + col_idx += 2; + } + // if the N is not multiple of 3. + // handling edge case. 
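+ // n_remainder == 1: a single column of C remains, so only one broadcast
+ // from B is needed per iteration of the k loop.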
+ if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + + ymm3 = _mm256_loadu_ps(tA + 16); + ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm12 = _mm256_mul_ps(ymm12, ymm0); + ymm13 = _mm256_mul_ps(ymm13, ymm0); + ymm14 = _mm256_mul_ps(ymm14, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC + 0); + ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_ps(tC + 16); + ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ + + _mm256_storeu_ps(tC + 0, ymm12); + _mm256_storeu_ps(tC + 8, ymm13); + _mm256_storeu_ps(tC + 16, ymm14); + } + + row_idx += 24; + } + + if (m_remainder >= 16) + { + m_remainder -= 16; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); + ymm8 = _mm256_fmadd_ps(ymm2, ymm3, ymm8); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + ymm8 = _mm256_mul_ps(ymm8, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + + // multiply C by beta and accumulate. 
+ tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(tC, ymm6); + _mm256_storeu_ps(tC + 8, ymm7); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9);*/ + _mm256_storeu_ps(tC, ymm8); + _mm256_storeu_ps(tC + 8, ymm9); + + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(tC, ymm6); + _mm256_storeu_ps(tC + 8, ymm7); + + col_idx += 2; + + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + ymm3 = _mm256_loadu_ps(tA + 8); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + + // multiply C by beta and accumulate. 
+ /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_ps(tC + 8); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(tC, ymm4); + _mm256_storeu_ps(tC + 8, ymm5); + + } + + row_idx += 16; + } + + if (m_remainder >= 8) + { + m_remainder -= 8; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm6 = _mm256_mul_ps(ymm6, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ + _mm256_storeu_ps(tC, ymm4); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(tC, ymm5); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6);*/ + _mm256_storeu_ps(tC, ymm6); + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_ps(ymm4, ymm0); + ymm5 = _mm256_mul_ps(ymm5, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ + _mm256_storeu_ps(tC, ymm4); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_ps(tC); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(tC, ymm5); + + col_idx += 2; + + } + // if the N is not multiple of 3. + // handling edge case. 
+ if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm4 = _mm256_setzero_ps(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + ymm4 = _mm256_mul_ps(ymm4, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_ps(tC); + ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ + _mm256_storeu_ps(tC, ymm4); + + } + + row_idx += 8; + } + // M is not a multiple of 32. + // The handling of edge case where the remainder + // dimension is less than 8. The padding takes place + // to handle this case. + if ((m_remainder) && (lda > 7)) + { + float f_temp[8]; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm5 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + + for (k = 0; k < (K - 1); ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_ps(tA); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); + tB += tb_inc_row; + + for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tA[i]; + } + ymm3 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + //multiply A*B by alpha. + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + ymm9 = _mm256_mul_ps(ymm9, ymm0); + + + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(f_temp, ymm5); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + + tC += ldc; + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(f_temp, ymm7); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + + tC += ldc; + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9);*/ + _mm256_storeu_ps(f_temp, ymm9); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. 
+ if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm5 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + + for (k = 0; k < (K - 1); ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + ymm3 = _mm256_loadu_ps(tA); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + + tA += lda; + } + + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); + tB += tb_inc_row; + + for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tA[i]; + } + ymm3 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); + + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + ymm5 = _mm256_mul_ps(ymm5, ymm0); + ymm7 = _mm256_mul_ps(ymm7, ymm0); + + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(f_temp, ymm5); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + + tC += ldc; + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ + _mm256_storeu_ps(f_temp, ymm7); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm5 = _mm256_setzero_ps(); + + for (k = 0; k < (K - 1); ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + ymm3 = _mm256_loadu_ps(tA); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + + tA += lda; + } + + ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); + tB += tb_inc_row; + + for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tA[i]; + } + ymm3 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); + + ymm0 = _mm256_broadcast_ss(alpha_cast); + //ymm1 = _mm256_broadcast_ss(beta_cast); + + // multiply C by beta and accumulate. 
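+			// (The beta FMA below is intentionally commented out: beta*C is
+			// applied once at the end, when results are copied from the
+			// packed buffer back into the user's C matrix.)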
+ ymm5 = _mm256_mul_ps(ymm5, ymm0); + + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_ps(f_temp); + ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ + _mm256_storeu_ps(f_temp, ymm5); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + } + m_remainder = 0; + } + + if (m_remainder) + { + float result; + for (; row_idx < M; row_idx += 1) + { + for (col_idx = 0; col_idx < N; col_idx += 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + result = 0; + for (k = 0; k < K; ++k) + { + result += (*tA) * (*tB); + tA += lda; + tB += tb_inc_row; + } + + result *= (*alpha_cast); + (*tC) = /*(*tC) * (*beta_cast) + */result; + } + } + } + + //copy/compute sryk values back to C using SIMD + if ( bli_seq0( *beta_cast ) ) + {//just copy in case of beta = 0 + dim_t _i, _j, k, _l; + if(bli_obj_is_lower(c)) // c is lower + { + //first column + _j = 0; + k = M >> 3; + _i = 0; + for ( _l = 0; _l < k; _l++ ) + { + ymm0 = _mm256_loadu_ps((C + _i*rsc)); + _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); + _i += 8; + } + while (_i < M ) + { + bli_sscopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + while ( _j < N ) //next column + { + //k = (_j + (8 - (_j & 7))); + _l = _j & 7; + k = (_l != 0) ? (_j + (8 - _l)) : _j; + k = (k <= M) ? k : M; + for ( _i = _j; _i < k; ++_i ) + { + bli_sscopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + } + k = (M - _i) >> 3; + _l = 0; + while ( _l < k ) + { + ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); + _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + + _i += 8; + _l++; + } + while (_i < M ) + { + bli_sscopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + } + } + else //c is upper + { + for ( _j = 0; _j < N; ++_j ) + { + k = (_j + 1) >> 3; + _i = 0; + _l = 0; + while ( _l < k ) + { + ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); + _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + _i += 8; + _l++; + } + while (_i <= _j ) + { + bli_sscopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + ++_i; + } + } + } + } + else + {//when beta is non-zero, fmadd and store the results + dim_t _i, _j, k, _l; + ymm1 = _mm256_broadcast_ss(beta_cast); + if(bli_obj_is_lower(c)) //c is lower + { + //first column + _j = 0; + k = M >> 3; + _i = 0; + for ( _l = 0; _l < k; _l++ ) + { + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); + ymm0 = _mm256_loadu_ps((C + _i*rsc)); + ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); + _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); + _i += 8; + } + while (_i < M ) + { + bli_sssxpbys( *(C + _i*rsc + _j*ldc), + *(beta_cast), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + while ( _j < N ) //next column + { + //k = (_j + (8 - (_j & 7))); + _l = _j & 7; + k = (_l != 0) ? (_j + (8 - _l)) : _j; + k = (k <= M) ? 
k : M;
+			for ( _i = _j; _i < k; ++_i )
+			{
+				bli_sssxpbys( *(C + _i*rsc + _j*ldc),
+				              *(beta_cast),
+				              *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+			}
+			k = (M - _i) >> 3;
+			_l = 0;
+			while ( _l < k )
+			{
+				ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
+				ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
+				ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
+				_mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
+
+				_i += 8;
+				_l++;
+			}
+			while (_i < M )
+			{
+				bli_sssxpbys( *(C + _i*rsc + _j*ldc),
+				              *(beta_cast),
+				              *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+				_i++;
+			}
+			_j++;
+		}
+	}
+	else //c is upper
+	{
+		for ( _j = 0; _j < N; ++_j )
+		{
+			k = (_j + 1) >> 3;
+			_i = 0;
+			_l = 0;
+			while ( _l < k )
+			{
+				ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
+				ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
+				ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
+				_mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
+				_i += 8;
+				_l++;
+			}
+			while (_i <= _j )
+			{
+				bli_sssxpbys( *(C + _i*rsc + _j*ldc),
+				              *(beta_cast),
+				              *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+				++_i;
+			}
+		}
+	}
+	}
+
+	return BLIS_SUCCESS;
+	}
+	else
+		return BLIS_NONCONFORMAL_DIMENSIONS;
+
+
+}
+
+static err_t bli_dsyrk_small
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     )
+{
+
+	int M = bli_obj_length( c ); // number of rows of Matrix C
+	int N = bli_obj_width( c );  // number of columns of Matrix C
+	int K = bli_obj_width( a );  // number of columns of OP(A), will be updated if OP(A) is Transpose(A).
+	int L = M * N;
+
+	// Use the small-matrix kernel only below the size thresholds;
+	// otherwise report non-conformal dimensions so the caller can
+	// fall back to the default path.
+	if ((((L) < (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES))
+		|| ((M < D_BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < D_BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0)))
+	{
+
+		int lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled.
+		int ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled.
+		int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C
+		int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C
+		int row_idx, col_idx, k;
+		int rs_matC = bli_obj_row_stride( c );
+		int rsc = 1;
+		double *A = a->buffer; // pointer to elements of Matrix A
+		double *B = b->buffer; // pointer to elements of Matrix B
+		double *C = D_C_pack;  // pointer to elements of Matrix C (static packing buffer)
+		double *matCbuf = c->buffer;
+
+		double *tA = A, *tB = B, *tC = C;//, *tA_pack;
+		double *tA_packed; // temporary pointer to hold packed A memory pointer
+		int row_idx_packed; //packed A memory row index
+		int lda_packed; //lda of packed A
+		int col_idx_start; //starting index after A matrix is packed.
+		dim_t tb_inc_row = 1; // row stride of matrix B
+		dim_t tb_inc_col = ldb; // column stride of matrix B
+		__m256d ymm4, ymm5, ymm6, ymm7;
+		__m256d ymm8, ymm9, ymm10, ymm11;
+		__m256d ymm12, ymm13, ymm14, ymm15;
+		__m256d ymm0, ymm1, ymm2, ymm3;
+
+		int n_remainder; // remainder when N is not a multiple of 3 (N % 3)
+		int m_remainder; // remainder when M is not a multiple of 16 (M % 16)
+
+		double *alpha_cast, *beta_cast; // alpha, beta multiples
+		alpha_cast = (alpha->buffer);
+		beta_cast = (beta->buffer);
+		int required_packing_A = 1;
+
+		// when N is equal to 1, call GEMV instead of SYRK
+		if (N == 1)
+		{
+			bli_gemv
+			(
+				alpha,
+				a,
+				b,
+				beta,
+				c
+			);
+			return BLIS_SUCCESS;
+		}
+
+		//update the pointer math if matrix B needs to be transposed.
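+		// For a column-major B, element (k, j) sits at B[k + j*ldb], i.e.
+		// strides (row, col) = (1, ldb); when B carries a transpose,
+		// OP(B)(k, j) = B(j, k) sits at B[j + k*ldb], so the two strides
+		// simply swap, which is all the branch below does.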
+ if (bli_obj_has_trans( b )) + { + tb_inc_col = 1; //switch row and column strides + tb_inc_row = ldb; + } + + if ((N <= 3) || ((D_MR * K) > D_SCRATCH_DIM)) + { + required_packing_A = 0; + } + /* + * The computation loop runs for D_MRxN columns of C matrix, thus + * accessing the D_MRxK A matrix data and KxNR B matrix data. + * The computation is organized as inner loops of dimension D_MRxNR. + */ + // Process D_MR rows of C matrix at a time. + for (row_idx = 0; (row_idx + (D_MR - 1)) < M; row_idx += D_MR) + { + + col_idx_start = 0; + tA_packed = A; + row_idx_packed = row_idx; + lda_packed = lda; + + // This is the part of the pack and compute optimization. + // During the first column iteration, we store the accessed A matrix into + // contiguous static memory. This helps to keep te A matrix in Cache and + // aviods the TLB misses. + if (required_packing_A) + { + col_idx = 0; + + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + tA_packed = D_A_pack; + +#if 0//def BLIS_ENABLE_PREFETCH + _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); +#endif + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + // This loop is processing D_MR x K + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + _mm256_storeu_pd(tA_packed, ymm3); // the packing of matrix A + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + _mm256_storeu_pd(tA_packed + 4, ymm3); // the packing of matrix A + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + _mm256_storeu_pd(tA_packed + 8, ymm3); // the packing of matrix A + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); + + ymm3 = _mm256_loadu_pd(tA + 12); + _mm256_storeu_pd(tA_packed + 12, ymm3); // the packing of matrix A + // ymm7 += ymm0 * ymm3; + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + // ymm11 += ymm1 * ymm3; + ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); + // ymm15 += ymm2 * ymm3; + ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); + + tA += lda; + tA_packed += D_MR; + } + // alpha, beta multiplication. 
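+			// Scaling by alpha once after the k loop (instead of inside it)
+			// saves K-1 vector multiplies per accumulator; as in the float
+			// kernel, the beta*C term is deferred to the final copy-back pass.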
+ ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm11 = _mm256_mul_pd(ymm11, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); + + // multiply C by beta and accumulate col 1. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + _mm256_storeu_pd(tC + 8, ymm6); + _mm256_storeu_pd(tC + 12, ymm7); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + _mm256_storeu_pd(tC + 12, ymm11); + + // multiply C by beta and accumulate, col 3. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + _mm256_storeu_pd(tC + 12, ymm15); + + // modify the pointer arithematic to use packed A matrix. + col_idx_start = NR; + tA_packed = D_A_pack; + row_idx_packed = 0; + lda_packed = D_MR; + } + // Process NR columns of C matrix at a time. + for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = tA_packed + row_idx_packed; + +#if 0//def BLIS_ENABLE_PREFETCH + _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); + _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); +#endif + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. 
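+				// From the second column panel onward tA walks through
+				// D_A_pack (lda_packed == D_MR), so the 16 doubles consumed
+				// per iteration are contiguous rather than strided through A.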
+ // This loop is processing D_MR x K + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); + + ymm3 = _mm256_loadu_pd(tA + 12); + // ymm7 += ymm0 * ymm3; + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + // ymm11 += ymm1 * ymm3; + ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); + // ymm15 += ymm2 * ymm3; + ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); + + tA += lda_packed; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm11 = _mm256_mul_pd(ymm11, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); + + // multiply C by beta and accumulate col 1. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + _mm256_storeu_pd(tC + 8, ymm6); + _mm256_storeu_pd(tC + 12, ymm7); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + _mm256_storeu_pd(tC + 12, ymm11); + + // multiply C by beta and accumulate, col 3. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + _mm256_storeu_pd(tC + 12, ymm15); + + } + n_remainder = N - col_idx; + + // if the N is not multiple of 3. + // handling edge case. 
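+		// Note: the remainder-column paths below read A directly from the
+		// source matrix again (tA = A + row_idx, stride lda) rather than
+		// from the packed buffer D_A_pack.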
+ if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); + + ymm3 = _mm256_loadu_pd(tA + 12); + ymm11 = _mm256_fmadd_pd(ymm0, ymm3, ymm11); + ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm11 = _mm256_mul_pd(ymm11, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); + + // multiply C by beta and accumulate, col 1. + /*ymm2 = _mm256_loadu_pd(tC + 0); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ + _mm256_storeu_pd(tC + 0, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + _mm256_storeu_pd(tC + 12, ymm11); + + // multiply C by beta and accumulate, col 2. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + _mm256_storeu_pd(tC + 12, ymm15); + + col_idx += 2; + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. 
+ ymm3 = _mm256_loadu_pd(tA); + ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + ymm3 = _mm256_loadu_pd(tA + 12); + ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + ymm15 = _mm256_mul_pd(ymm15, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC + 0); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); + ymm2 = _mm256_loadu_pd(tC + 12); + ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ + + _mm256_storeu_pd(tC + 0, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + _mm256_storeu_pd(tC + 12, ymm15); + } + } + + m_remainder = M - row_idx; + + if (m_remainder >= 12) + { + m_remainder -= 12; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + // ymm4 += ymm0 * ymm3; + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + // ymm8 += ymm1 * ymm3; + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + // ymm12 += ymm2 * ymm3; + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + // ymm5 += ymm0 * ymm3; + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + // ymm9 += ymm1 * ymm3; + ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); + // ymm13 += ymm2 * ymm3; + ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + // ymm6 += ymm0 * ymm3; + ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); + // ymm10 += ymm1 * ymm3; + ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); + // ymm14 += ymm2 * ymm3; + ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + + // multiply C by beta and accumulate. 
+ /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + _mm256_storeu_pd(tC + 8, ymm6); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);*/ + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); + ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); + ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + ymm10 = _mm256_mul_pd(ymm10, ymm0); + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC + 0); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);*/ + _mm256_storeu_pd(tC + 0, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + _mm256_storeu_pd(tC + 8, ymm10); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ + _mm256_storeu_pd(tC, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + + col_idx += 2; + } + // if the N is not multiple of 3. + // handling edge case. 
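+		// One remaining column: the 12 rows fit in three 4-wide
+		// accumulators (ymm12, ymm13, ymm14), one per group of four rows.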
+ if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + + ymm3 = _mm256_loadu_pd(tA + 8); + ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); + + tA += lda; + + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm12 = _mm256_mul_pd(ymm12, ymm0); + ymm13 = _mm256_mul_pd(ymm13, ymm0); + ymm14 = _mm256_mul_pd(ymm14, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC + 0); + ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); + ymm2 = _mm256_loadu_pd(tC + 8); + ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ + + _mm256_storeu_pd(tC + 0, ymm12); + _mm256_storeu_pd(tC + 4, ymm13); + _mm256_storeu_pd(tC + 8, ymm14); + } + + row_idx += 12; + } + + if (m_remainder >= 8) + { + m_remainder -= 8; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); + ymm8 = _mm256_fmadd_pd(ymm2, ymm3, ymm8); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + ymm8 = _mm256_mul_pd(ymm8, ymm0); + ymm9 = _mm256_mul_pd(ymm9, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + + // multiply C by beta and accumulate. 
+ tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ + _mm256_storeu_pd(tC, ymm6); + _mm256_storeu_pd(tC + 4, ymm7); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);*/ + _mm256_storeu_pd(tC, ymm8); + _mm256_storeu_pd(tC + 4, ymm9); + + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ + _mm256_storeu_pd(tC, ymm6); + _mm256_storeu_pd(tC + 4, ymm7); + + col_idx += 2; + + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm3 = _mm256_loadu_pd(tA + 4); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + + // multiply C by beta and accumulate. 
+ /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); + ymm2 = _mm256_loadu_pd(tC + 4); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(tC, ymm4); + _mm256_storeu_pd(tC + 4, ymm5); + + } + + row_idx += 8; + } + + if (m_remainder >= 4) + { + m_remainder -= 4; + + for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm6 = _mm256_mul_pd(ymm6, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/ + _mm256_storeu_pd(tC, ymm4); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(tC, ymm5); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);*/ + _mm256_storeu_pd(tC, ymm6); + } + n_remainder = N - col_idx; + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + + for (k = 0; k < K; ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + //broadcasted matrix B elements are multiplied + //with matrix A columns. + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + + tA += lda; + } + // alpha, beta multiplication. + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + //multiply A*B by alpha. + ymm4 = _mm256_mul_pd(ymm4, ymm0); + ymm5 = _mm256_mul_pd(ymm5, ymm0); + + // multiply C by beta and accumulate. + /*ymm2 = _mm256_loadu_pd(tC); + ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/ + _mm256_storeu_pd(tC, ymm4); + + // multiply C by beta and accumulate. + tC += ldc; + /*ymm2 = _mm256_loadu_pd(tC); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(tC, ymm5); + + col_idx += 2; + + } + // if the N is not multiple of 3. + // handling edge case. 
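+		// With a 4-row block and a single remaining column, one __m256d
+		// accumulator (ymm4) covers the whole tile.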
+		if (n_remainder == 1)
+		{
+			//pointer math to point to proper memory
+			tC = C + ldc * col_idx + row_idx;
+			tB = B + tb_inc_col * col_idx;
+			tA = A + row_idx;
+
+			ymm4 = _mm256_setzero_pd();
+
+			for (k = 0; k < K; ++k)
+			{
+				// The inner loop broadcasts the B matrix data and
+				// multiplies it with the A matrix.
+				ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0);
+				tB += tb_inc_row;
+
+				//broadcasted matrix B elements are multiplied
+				//with matrix A columns.
+				ymm3 = _mm256_loadu_pd(tA);
+				ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+				tA += lda;
+			}
+			// alpha multiplication (beta is applied during the copy-back).
+			ymm0 = _mm256_broadcast_sd(alpha_cast);
+			//ymm1 = _mm256_broadcast_sd(beta_cast);
+
+			ymm4 = _mm256_mul_pd(ymm4, ymm0);
+
+			// multiply C by beta and accumulate.
+			/*ymm2 = _mm256_loadu_pd(tC);
+			ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/
+			_mm256_storeu_pd(tC, ymm4);
+
+		}
+
+		row_idx += 4;
+	}
+	// Edge case: fewer than 4 rows remain (M is not a multiple of 4 after
+	// the blocked loops above). The tail is staged through a scratch
+	// array so that 4-wide vector loads and stores never touch memory
+	// past the end of the matrix; the lda > 3 guard keeps the full-width
+	// loads in the k loop inside A's buffer.
+	if ((m_remainder) && (lda > 3))
+	{
+		double f_temp[8];
+
+		for (col_idx = 0; (col_idx + 2) < N; col_idx += 3)
+		{
+			//pointer math to point to proper memory
+			tC = C + ldc * col_idx + row_idx;
+			tB = B + tb_inc_col * col_idx;
+			tA = A + row_idx;
+
+			// clear scratch registers.
+			ymm5 = _mm256_setzero_pd();
+			ymm7 = _mm256_setzero_pd();
+			ymm9 = _mm256_setzero_pd();
+
+			for (k = 0; k < (K - 1); ++k)
+			{
+				// The inner loop broadcasts the B matrix data and
+				// multiplies it with the A matrix.
+				ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0);
+				ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1);
+				ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2);
+				tB += tb_inc_row;
+
+				//broadcasted matrix B elements are multiplied
+				//with matrix A columns.
+				ymm3 = _mm256_loadu_pd(tA);
+				ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+				ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7);
+				ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9);
+
+				tA += lda;
+			}
+			// final k iteration: stage the A tail through f_temp.
+			ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0);
+			ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1);
+			ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2);
+			tB += tb_inc_row;
+
+			for (int i = 0; i < m_remainder; i++)
+			{
+				f_temp[i] = tA[i];
+			}
+			ymm3 = _mm256_loadu_pd(f_temp);
+			ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5);
+			ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7);
+			ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9);
+
+			ymm0 = _mm256_broadcast_sd(alpha_cast);
+			//ymm1 = _mm256_broadcast_sd(beta_cast);
+
+			//multiply A*B by alpha.
+			ymm5 = _mm256_mul_pd(ymm5, ymm0);
+			ymm7 = _mm256_mul_pd(ymm7, ymm0);
+			ymm9 = _mm256_mul_pd(ymm9, ymm0);
+
+
+			/*for (int i = 0; i < m_remainder; i++)
+			{
+				f_temp[i] = tC[i];
+			}
+			ymm2 = _mm256_loadu_pd(f_temp);
+			ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/
+			_mm256_storeu_pd(f_temp, ymm5);
+			for (int i = 0; i < m_remainder; i++)
+			{
+				tC[i] = f_temp[i];
+			}
+
+			tC += ldc;
+			/*for (int i = 0; i < m_remainder; i++)
+			{
+				f_temp[i] = tC[i];
+			}
+			ymm2 = _mm256_loadu_pd(f_temp);
+			ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/
+			_mm256_storeu_pd(f_temp, ymm7);
+			for (int i = 0; i < m_remainder; i++)
+			{
+				tC[i] = f_temp[i];
+			}
+
+			tC += ldc;
+			/*for (int i = 0; i < m_remainder; i++)
+			{
+				f_temp[i] = tC[i];
+			}
+			ymm2 = _mm256_loadu_pd(f_temp);
+			ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);*/
+			_mm256_storeu_pd(f_temp, ymm9);
+			for (int i = 0; i < m_remainder; i++)
+			{
+				tC[i] = f_temp[i];
+			}
+		}
+		n_remainder = N - col_idx;
+		// if N is not a multiple of 3,
+		// handle the edge case.
+ if (n_remainder == 2) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm5 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + + for (k = 0; k < (K - 1); ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + ymm3 = _mm256_loadu_pd(tA); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + tA += lda; + } + + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); + tB += tb_inc_row; + + for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tA[i]; + } + ymm3 = _mm256_loadu_pd(f_temp); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); + + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + ymm5 = _mm256_mul_pd(ymm5, ymm0); + ymm7 = _mm256_mul_pd(ymm7, ymm0); + + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_pd(f_temp); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(f_temp, ymm5); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + + tC += ldc; + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_pd(f_temp); + ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ + _mm256_storeu_pd(f_temp, ymm7); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + } + // if the N is not multiple of 3. + // handling edge case. + if (n_remainder == 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + ymm5 = _mm256_setzero_pd(); + + for (k = 0; k < (K - 1); ++k) + { + // The inner loop broadcasts the B matrix data and + // multiplies it with the A matrix. + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + + ymm3 = _mm256_loadu_pd(tA); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + tA += lda; + } + + ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); + tB += tb_inc_row; + + for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tA[i]; + } + ymm3 = _mm256_loadu_pd(f_temp); + ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); + + ymm0 = _mm256_broadcast_sd(alpha_cast); + //ymm1 = _mm256_broadcast_sd(beta_cast); + + // multiply C by beta and accumulate. 
+ ymm5 = _mm256_mul_pd(ymm5, ymm0); + + /*for (int i = 0; i < m_remainder; i++) + { + f_temp[i] = tC[i]; + } + ymm2 = _mm256_loadu_pd(f_temp); + ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ + _mm256_storeu_pd(f_temp, ymm5); + for (int i = 0; i < m_remainder; i++) + { + tC[i] = f_temp[i]; + } + } + m_remainder = 0; + } + + if (m_remainder) + { + double result; + for (; row_idx < M; row_idx += 1) + { + for (col_idx = 0; col_idx < N; col_idx += 1) + { + //pointer math to point to proper memory + tC = C + ldc * col_idx + row_idx; + tB = B + tb_inc_col * col_idx; + tA = A + row_idx; + + result = 0; + for (k = 0; k < K; ++k) + { + result += (*tA) * (*tB); + tA += lda; + tB += tb_inc_row; + } + + result *= (*alpha_cast); + (*tC) = /*(*tC) * (*beta_cast) + */result; + } + } + } + + //copy/compute sryk values back to C using SIMD + if ( bli_seq0( *beta_cast ) ) + {//just copy for beta = 0 + dim_t _i, _j, k, _l; + if(bli_obj_is_lower(c)) //c is lower + { + //first column + _j = 0; + k = M >> 2; + _i = 0; + for ( _l = 0; _l < k; _l++ ) + { + ymm0 = _mm256_loadu_pd((C + _i*rsc)); + _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0); + _i += 4; + } + while (_i < M ) + { + bli_ddcopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + while ( _j < N ) //next column + { + //k = (_j + (4 - (_j & 3))); + _l = _j & 3; + k = (_l != 0) ? (_j + (4 - _l)) : _j; + k = (k <= M) ? k : M; + for ( _i = _j; _i < k; ++_i ) + { + bli_ddcopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + } + k = (M - _i) >> 2; + _l = 0; + while ( _l < k ) + { + ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); + _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + + _i += 4; + _l++; + } + while (_i < M ) + { + bli_ddcopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + } + } + else //c is upper + { + for ( _j = 0; _j < N; ++_j ) + { + k = (_j + 1) >> 2; + _i = 0; + _l = 0; + while ( _l < k ) + { + ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); + _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + _i += 4; + _l++; + } + while (_i <= _j ) + { + bli_ddcopys( *(C + _i*rsc + _j*ldc), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + ++_i; + } + } + } + } + else + {//when beta is non-zero, fmadd and store the results + dim_t _i, _j, k, _l; + ymm1 = _mm256_broadcast_sd(beta_cast); + if(bli_obj_is_lower(c)) //c is lower + { + //first column + _j = 0; + k = M >> 2; + _i = 0; + for ( _l = 0; _l < k; _l++ ) + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); + ymm0 = _mm256_loadu_pd((C + _i*rsc)); + ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); + _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0); + _i += 4; + } + while (_i < M ) + { + bli_dddxpbys( *(C + _i*rsc + _j*ldc), + *(beta_cast), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + while ( _j < N ) //next column + { + //k = (_j + (4 - (_j & 3))); + _l = _j & 3; + k = (_l != 0) ? (_j + (4 - _l)) : _j; + k = (k <= M) ? 
k : M; + for ( _i = _j; _i < k; ++_i ) + { + bli_dddxpbys( *(C + _i*rsc + _j*ldc), + *(beta_cast), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + } + k = (M - _i) >> 2; + _l = 0; + while ( _l < k ) + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); + ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); + _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + + _i += 4; + _l++; + } + while (_i < M ) + { + bli_dddxpbys( *(C + _i*rsc + _j*ldc), + *(beta_cast), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + _i++; + } + _j++; + } + } + else //c is upper + { + for ( _j = 0; _j < N; ++_j ) + { + k = (_j + 1) >> 2; + _i = 0; + _l = 0; + while ( _l < k ) + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); + ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); + _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); + _i += 4; + _l++; + } + while (_i <= _j ) + { + bli_dddxpbys( *(C + _i*rsc + _j*ldc), + *(beta_cast), + *(matCbuf + _i*rs_matC + _j*ldc_matC) ); + ++_i; + } + } + } + } + + return BLIS_SUCCESS; + } + else + return BLIS_NONCONFORMAL_DIMENSIONS; + + +}; + +static err_t bli_ssyrk_small_atbn + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) +{ + int M = bli_obj_length(c); // number of rows of Matrix C + int N = bli_obj_width(c); // number of columns of Matrix C + int K = bli_obj_length(b); // number of rows of Matrix B + int lda = bli_obj_col_stride(a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. + int ldb = bli_obj_col_stride(b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. + int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C + int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C + int row_idx = 0, col_idx = 0, k; + int rs_matC = bli_obj_row_stride( c ); + int rsc = 1; + float *A = a->buffer; // pointer to matrix A elements, stored in row major format + float *B = b->buffer; // pointer to matrix B elements, stored in column major format + float *C = C_pack; // pointer to matrix C elements, stored in column major format + float *matCbuf = c->buffer; + + float *tA = A, *tB = B, *tC = C; + + __m256 ymm4, ymm5, ymm6, ymm7; + __m256 ymm8, ymm9, ymm10, ymm11; + __m256 ymm12, ymm13, ymm14, ymm15; + __m256 ymm0, ymm1, ymm2, ymm3; + + float result, scratch[8]; + float *alpha_cast, *beta_cast; // alpha, beta multiples + alpha_cast = (alpha->buffer); + beta_cast = (beta->buffer); + + // The non-copy version of the A^T SYRK gives better performance for the small M cases. + // The threshold is controlled by BLIS_ATBN_M_THRES + if (M <= BLIS_ATBN_M_THRES) + { + for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm11 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + //The inner loop computes the 4x3 values of the matrix. 
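+				// In this A^T kernel each C(i, j) is a true dot product of a
+				// contiguous length-K row of OP(A) with a contiguous column
+				// of B, so the k loop vectorizes 8 floats at a time and the
+				// partial sums are reduced horizontally afterwards, instead
+				// of broadcasting B as in the kernels above.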
+ //The computation pattern is: + // ymm4 ymm5 ymm6 + // ymm7 ymm8 ymm9 + // ymm10 ymm11 ymm12 + // ymm13 ymm14 ymm15 + + //The Dot operation is performed in the inner loop, 8 float elements fit + //in the YMM register hence loop count incremented by 8 + for (k = 0; (k + 7) < K; k += 8) + { + ymm0 = _mm256_loadu_ps(tB + 0); + ymm1 = _mm256_loadu_ps(tB + ldb); + ymm2 = _mm256_loadu_ps(tB + 2 * ldb); + + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); + + ymm3 = _mm256_loadu_ps(tA + lda); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + ymm3 = _mm256_loadu_ps(tA + 2 * lda); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 3 * lda); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + tA += 8; + tB += 8; + + } + + // if K is not a multiple of 8, padding is done before load using temproary array. + if (k < K) + { + int iter; + float data_feeder[8] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_ps(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; + ymm1 = _mm256_loadu_ps(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; + ymm2 = _mm256_loadu_ps(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + } + + //horizontal addition and storage of the data. 
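+				// Reduction sketch: for v = [a b c d | e f g h],
+				// _mm256_hadd_ps(v, v) yields
+				// [a+b c+d a+b c+d | e+f g+h e+f g+h]; a second hadd leaves
+				// the per-lane sums in every slot, so scratch[0] + scratch[4]
+				// (= a+b+c+d and e+f+g+h) is the full 8-element dot product.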
+ //Results for 4x3 blocks of C is stored here + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + _mm256_storeu_ps(scratch, ymm4); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result/* + tC[0] * (*beta_cast)*/; + + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + _mm256_storeu_ps(scratch, ymm7); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result/* + tC[1] * (*beta_cast)*/; + + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + _mm256_storeu_ps(scratch, ymm10); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result/* + tC[2] * (*beta_cast)*/; + + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + _mm256_storeu_ps(scratch, ymm13); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result/* + tC[3] * (*beta_cast)*/; + + tC += ldc; + ymm5 = _mm256_hadd_ps(ymm5, ymm5); + ymm5 = _mm256_hadd_ps(ymm5, ymm5); + _mm256_storeu_ps(scratch, ymm5); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result/* + tC[0] * (*beta_cast)*/; + + ymm8 = _mm256_hadd_ps(ymm8, ymm8); + ymm8 = _mm256_hadd_ps(ymm8, ymm8); + _mm256_storeu_ps(scratch, ymm8); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result/* + tC[1] * (*beta_cast)*/; + + ymm11 = _mm256_hadd_ps(ymm11, ymm11); + ymm11 = _mm256_hadd_ps(ymm11, ymm11); + _mm256_storeu_ps(scratch, ymm11); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result/* + tC[2] * (*beta_cast)*/; + + ymm14 = _mm256_hadd_ps(ymm14, ymm14); + ymm14 = _mm256_hadd_ps(ymm14, ymm14); + _mm256_storeu_ps(scratch, ymm14); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result/* + tC[3] * (*beta_cast)*/; + + tC += ldc; + ymm6 = _mm256_hadd_ps(ymm6, ymm6); + ymm6 = _mm256_hadd_ps(ymm6, ymm6); + _mm256_storeu_ps(scratch, ymm6); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result/* + tC[0] * (*beta_cast)*/; + + ymm9 = _mm256_hadd_ps(ymm9, ymm9); + ymm9 = _mm256_hadd_ps(ymm9, ymm9); + _mm256_storeu_ps(scratch, ymm9); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result/* + tC[1] * (*beta_cast)*/; + + ymm12 = _mm256_hadd_ps(ymm12, ymm12); + ymm12 = _mm256_hadd_ps(ymm12, ymm12); + _mm256_storeu_ps(scratch, ymm12); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result/* + tC[2] * (*beta_cast)*/; + + ymm15 = _mm256_hadd_ps(ymm15, ymm15); + ymm15 = _mm256_hadd_ps(ymm15, ymm15); + _mm256_storeu_ps(scratch, ymm15); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result/* + tC[3] * (*beta_cast)*/; + } + } + + int processed_col = col_idx; + int processed_row = row_idx; + + //The edge case handling where N is not a multiple of 3 + if (processed_col < N) + { + for (col_idx = processed_col; col_idx < N; col_idx += 1) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + + //The inner loop computes the 4x1 values of the matrix. 
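+				// Same dot-product scheme as the 4x3 kernel, restricted to a
+				// single column of B: one B vector (ymm0) and four A rows.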
+                    //The computation pattern is:
+                    // ymm4
+                    // ymm7
+                    // ymm10
+                    // ymm13
+
+                    for (k = 0; (k + 7) < K; k += 8)
+                    {
+                        ymm0 = _mm256_loadu_ps(tB + 0);
+
+                        ymm3 = _mm256_loadu_ps(tA);
+                        ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4);
+
+                        ymm3 = _mm256_loadu_ps(tA + lda);
+                        ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7);
+
+                        ymm3 = _mm256_loadu_ps(tA + 2 * lda);
+                        ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+
+                        ymm3 = _mm256_loadu_ps(tA + 3 * lda);
+                        ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13);
+
+                        tA += 8;
+                        tB += 8;
+                    }
+
+                    // if K is not a multiple of 8, the tail is zero-padded into a temporary array before loading.
+                    if (k < K)
+                    {
+                        int iter;
+                        float data_feeder[8] = { 0.0 };
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter];
+                        ymm0 = _mm256_loadu_ps(data_feeder);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter];
+                        ymm3 = _mm256_loadu_ps(data_feeder);
+                        ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter];
+                        ymm3 = _mm256_loadu_ps(data_feeder);
+                        ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter];
+                        ymm3 = _mm256_loadu_ps(data_feeder);
+                        ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter];
+                        ymm3 = _mm256_loadu_ps(data_feeder);
+                        ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13);
+
+                    }
+
+                    //horizontal addition and storage of the data.
+                    //Results for the 4x1 block of C are stored here
+                    ymm4 = _mm256_hadd_ps(ymm4, ymm4);
+                    ymm4 = _mm256_hadd_ps(ymm4, ymm4);
+                    _mm256_storeu_ps(scratch, ymm4);
+                    result = scratch[0] + scratch[4];
+                    result *= (*alpha_cast);
+                    tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                    ymm7 = _mm256_hadd_ps(ymm7, ymm7);
+                    ymm7 = _mm256_hadd_ps(ymm7, ymm7);
+                    _mm256_storeu_ps(scratch, ymm7);
+                    result = scratch[0] + scratch[4];
+                    result *= (*alpha_cast);
+                    tC[1] = result/* + tC[1] * (*beta_cast)*/;
+
+                    ymm10 = _mm256_hadd_ps(ymm10, ymm10);
+                    ymm10 = _mm256_hadd_ps(ymm10, ymm10);
+                    _mm256_storeu_ps(scratch, ymm10);
+                    result = scratch[0] + scratch[4];
+                    result *= (*alpha_cast);
+                    tC[2] = result/* + tC[2] * (*beta_cast)*/;
+
+                    ymm13 = _mm256_hadd_ps(ymm13, ymm13);
+                    ymm13 = _mm256_hadd_ps(ymm13, ymm13);
+                    _mm256_storeu_ps(scratch, ymm13);
+                    result = scratch[0] + scratch[4];
+                    result *= (*alpha_cast);
+                    tC[3] = result/* + tC[3] * (*beta_cast)*/;
+
+                }
+            }
+            processed_row = row_idx;
+        }
+
+        //The edge case handling where M is not a multiple of 4
+        if (processed_row < M)
+        {
+            for (row_idx = processed_row; row_idx < M; row_idx += 1)
+            {
+                for (col_idx = 0; col_idx < N; col_idx += 1)
+                {
+                    tA = A + row_idx * lda;
+                    tB = B + col_idx * ldb;
+                    tC = C + col_idx * ldc + row_idx;
+                    // clear scratch registers.
+                    ymm4 = _mm256_setzero_ps();
+
+                    for (k = 0; (k + 7) < K; k += 8)
+                    {
+                        ymm0 = _mm256_loadu_ps(tB + 0);
+                        ymm3 = _mm256_loadu_ps(tA);
+                        ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4);
+
+                        tA += 8;
+                        tB += 8;
+                    }
+
+                    // if K is not a multiple of 8, the tail is zero-padded into a temporary array before loading.
+                    if (k < K)
+                    {
+                        int iter;
+                        float data_feeder[8] = { 0.0 };
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter];
+                        ymm0 = _mm256_loadu_ps(data_feeder);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter];
+                        ymm3 = _mm256_loadu_ps(data_feeder);
+                        ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4);
+
+                    }
+
+                    //horizontal addition and storage of the data.
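+                    //Scalar equivalent of this 1x1 edge update (illustrative
+                    //sketch; A_row and B_col are hypothetical names for the
+                    //row/column start pointers before the k loop advanced tA/tB):
+                    //    float acc = 0.0f;
+                    //    for (int t = 0; t < K; t++) acc += A_row[t] * B_col[t];
+                    //    tC[0] = (*alpha_cast) * acc; // beta is applied in the copy-back pass below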
+                    ymm4 = _mm256_hadd_ps(ymm4, ymm4);
+                    ymm4 = _mm256_hadd_ps(ymm4, ymm4);
+                    _mm256_storeu_ps(scratch, ymm4);
+                    result = scratch[0] + scratch[4];
+                    result *= (*alpha_cast);
+                    tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                }
+            }
+        }
+
+        //copy/compute syrk values back to C
+        if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C
+        {
+            dim_t _i, _j;
+            if(bli_obj_is_lower(c)) //c is lower
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i <= 0 )
+                        {
+                            bli_sscopys( *(C + _i*rsc + _j*ldc),
+                                         *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+            else //c is upper
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i >= 0 )
+                        {
+                            bli_sscopys( *(C + _i*rsc + _j*ldc),
+                                         *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+        }
+        else //when beta is non-zero, multiply and store result to C
+        {
+            dim_t _i, _j;
+            if(bli_obj_is_lower(c)) //c is lower
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i <= 0 )
+                        {
+                            bli_sssxpbys( *(C + _i*rsc + _j*ldc),
+                                          *(beta_cast),
+                                          *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+            else //c is upper
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i >= 0 )
+                        {
+                            bli_sssxpbys( *(C + _i*rsc + _j*ldc),
+                                          *(beta_cast),
+                                          *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+        }
+
+        return BLIS_SUCCESS;
+    }
+    else
+        return BLIS_NONCONFORMAL_DIMENSIONS;
+}
+
+static err_t bli_dsyrk_small_atbn
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     )
+{
+    int M = bli_obj_length( c ); // number of rows of Matrix C
+    int N = bli_obj_width( c );  // number of columns of Matrix C
+    int K = bli_obj_length( b ); // number of rows of Matrix B
+    int lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled.
+    int ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled.
+    int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C
+    int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C
+    int row_idx = 0, col_idx = 0, k;
+    int rs_matC = bli_obj_row_stride( c );
+    int rsc = 1;
+    double *A = a->buffer; // pointer to matrix A elements, stored in row major format
+    double *B = b->buffer; // pointer to matrix B elements, stored in column major format
+    double *C = D_C_pack;  // pointer to matrix C elements, stored in column major format
+    double *matCbuf = c->buffer;
+
+    double *tA = A, *tB = B, *tC = C;
+
+    __m256d ymm4, ymm5, ymm6, ymm7;
+    __m256d ymm8, ymm9, ymm10, ymm11;
+    __m256d ymm12, ymm13, ymm14, ymm15;
+    __m256d ymm0, ymm1, ymm2, ymm3;
+
+    double result, scratch[8];
+    double *alpha_cast, *beta_cast; // alpha, beta multipliers
+    alpha_cast = (alpha->buffer);
+    beta_cast = (beta->buffer);
+
+    // The non-copy version of the A^T SYRK gives better performance for the small M cases.
+    // The threshold is controlled by BLIS_ATBN_M_THRES.
+    if (M <= BLIS_ATBN_M_THRES)
+    {
+        for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR)
+        {
+            for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR)
+            {
+                tA = A + row_idx * lda;
+                tB = B + col_idx * ldb;
+                tC = C + col_idx * ldc + row_idx;
+                // clear scratch registers.
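+                //Register budget: ymm4..ymm15 hold the 4x3 accumulator tile and
+                //ymm0..ymm3 stream the A/B operands, which together fill all 16
+                //architectural YMM registers AVX2 provides, so the inner loop
+                //runs without register spills.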
+                ymm4 = _mm256_setzero_pd();
+                ymm5 = _mm256_setzero_pd();
+                ymm6 = _mm256_setzero_pd();
+                ymm7 = _mm256_setzero_pd();
+                ymm8 = _mm256_setzero_pd();
+                ymm9 = _mm256_setzero_pd();
+                ymm10 = _mm256_setzero_pd();
+                ymm11 = _mm256_setzero_pd();
+                ymm12 = _mm256_setzero_pd();
+                ymm13 = _mm256_setzero_pd();
+                ymm14 = _mm256_setzero_pd();
+                ymm15 = _mm256_setzero_pd();
+
+                //The inner loop computes the 4x3 values of the matrix.
+                //The computation pattern is:
+                // ymm4  ymm5  ymm6
+                // ymm7  ymm8  ymm9
+                // ymm10 ymm11 ymm12
+                // ymm13 ymm14 ymm15
+
+                //The dot operation is performed in the inner loop; 4 double elements
+                //fit in a YMM register, hence the loop count is incremented by 4.
+                for (k = 0; (k + 3) < K; k += 4)
+                {
+                    ymm0 = _mm256_loadu_pd(tB + 0);
+                    ymm1 = _mm256_loadu_pd(tB + ldb);
+                    ymm2 = _mm256_loadu_pd(tB + 2 * ldb);
+
+                    ymm3 = _mm256_loadu_pd(tA);
+                    ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+                    ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5);
+                    ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6);
+
+                    ymm3 = _mm256_loadu_pd(tA + lda);
+                    ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7);
+                    ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8);
+                    ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9);
+
+                    ymm3 = _mm256_loadu_pd(tA + 2 * lda);
+                    ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10);
+                    ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11);
+                    ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12);
+
+                    ymm3 = _mm256_loadu_pd(tA + 3 * lda);
+                    ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13);
+                    ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14);
+                    ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15);
+
+                    tA += 4;
+                    tB += 4;
+
+                }
+
+                // if K is not a multiple of 4, the tail is zero-padded into a temporary array before loading.
+                if (k < K)
+                {
+                    int iter;
+                    double data_feeder[4] = { 0.0 };
+
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter];
+                    ymm0 = _mm256_loadu_pd(data_feeder);
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb];
+                    ymm1 = _mm256_loadu_pd(data_feeder);
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb];
+                    ymm2 = _mm256_loadu_pd(data_feeder);
+
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter];
+                    ymm3 = _mm256_loadu_pd(data_feeder);
+                    ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+                    ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5);
+                    ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6);
+
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter];
+                    ymm3 = _mm256_loadu_pd(data_feeder);
+                    ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7);
+                    ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8);
+                    ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9);
+
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter];
+                    ymm3 = _mm256_loadu_pd(data_feeder);
+                    ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10);
+                    ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11);
+                    ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12);
+
+                    for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter];
+                    ymm3 = _mm256_loadu_pd(data_feeder);
+                    ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13);
+                    ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14);
+                    ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15);
+
+                }
+
+                //horizontal addition and storage of the data.
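+                //Reduction note: for doubles a single _mm256_hadd_pd(x, x) pass
+                //leaves x0+x1 in lane 0 and x2+x3 in lane 2, so each 4-wide dot
+                //product below is recovered as scratch[0] + scratch[2].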
+                //Results for the 4x3 block of C are stored here
+                ymm4 = _mm256_hadd_pd(ymm4, ymm4);
+                _mm256_storeu_pd(scratch, ymm4);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                ymm7 = _mm256_hadd_pd(ymm7, ymm7);
+                _mm256_storeu_pd(scratch, ymm7);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[1] = result/* + tC[1] * (*beta_cast)*/;
+
+                ymm10 = _mm256_hadd_pd(ymm10, ymm10);
+                _mm256_storeu_pd(scratch, ymm10);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[2] = result/* + tC[2] * (*beta_cast)*/;
+
+                ymm13 = _mm256_hadd_pd(ymm13, ymm13);
+                _mm256_storeu_pd(scratch, ymm13);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[3] = result/* + tC[3] * (*beta_cast)*/;
+
+                tC += ldc;
+                ymm5 = _mm256_hadd_pd(ymm5, ymm5);
+                _mm256_storeu_pd(scratch, ymm5);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                ymm8 = _mm256_hadd_pd(ymm8, ymm8);
+                _mm256_storeu_pd(scratch, ymm8);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[1] = result/* + tC[1] * (*beta_cast)*/;
+
+                ymm11 = _mm256_hadd_pd(ymm11, ymm11);
+                _mm256_storeu_pd(scratch, ymm11);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[2] = result/* + tC[2] * (*beta_cast)*/;
+
+                ymm14 = _mm256_hadd_pd(ymm14, ymm14);
+                _mm256_storeu_pd(scratch, ymm14);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[3] = result/* + tC[3] * (*beta_cast)*/;
+
+                tC += ldc;
+                ymm6 = _mm256_hadd_pd(ymm6, ymm6);
+                _mm256_storeu_pd(scratch, ymm6);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                ymm9 = _mm256_hadd_pd(ymm9, ymm9);
+                _mm256_storeu_pd(scratch, ymm9);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[1] = result/* + tC[1] * (*beta_cast)*/;
+
+                ymm12 = _mm256_hadd_pd(ymm12, ymm12);
+                _mm256_storeu_pd(scratch, ymm12);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[2] = result/* + tC[2] * (*beta_cast)*/;
+
+                ymm15 = _mm256_hadd_pd(ymm15, ymm15);
+                _mm256_storeu_pd(scratch, ymm15);
+                result = scratch[0] + scratch[2];
+                result *= (*alpha_cast);
+                tC[3] = result/* + tC[3] * (*beta_cast)*/;
+            }
+        }
+
+        int processed_col = col_idx;
+        int processed_row = row_idx;
+
+        //The edge case handling where N is not a multiple of 3
+        if (processed_col < N)
+        {
+            for (col_idx = processed_col; col_idx < N; col_idx += 1)
+            {
+                for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR)
+                {
+                    tA = A + row_idx * lda;
+                    tB = B + col_idx * ldb;
+                    tC = C + col_idx * ldc + row_idx;
+                    // clear scratch registers.
+                    ymm4 = _mm256_setzero_pd();
+                    ymm7 = _mm256_setzero_pd();
+                    ymm10 = _mm256_setzero_pd();
+                    ymm13 = _mm256_setzero_pd();
+
+                    //The inner loop computes the 4x1 values of the matrix.
+                    //The computation pattern is:
+                    // ymm4
+                    // ymm7
+                    // ymm10
+                    // ymm13
+
+                    for (k = 0; (k + 3) < K; k += 4)
+                    {
+                        ymm0 = _mm256_loadu_pd(tB + 0);
+
+                        ymm3 = _mm256_loadu_pd(tA);
+                        ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+                        ymm3 = _mm256_loadu_pd(tA + lda);
+                        ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7);
+
+                        ymm3 = _mm256_loadu_pd(tA + 2 * lda);
+                        ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10);
+
+                        ymm3 = _mm256_loadu_pd(tA + 3 * lda);
+                        ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13);
+
+                        tA += 4;
+                        tB += 4;
+                    }
+                    // if K is not a multiple of 4, the tail is zero-padded into a temporary array before loading.
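+                    //The zero padding keeps the full-width vector load legal (no
+                    //read past the K tail) and is numerically harmless, since the
+                    //zeroed lanes contribute 0 to the FMA accumulation. Minimal
+                    //sketch of the idea (src stands for the current tA/tB position):
+                    //    double tail[4] = { 0.0 };
+                    //    for (int t = 0; t < K - k; t++) tail[t] = src[t];
+                    //    ymm0 = _mm256_loadu_pd(tail); // lanes >= K-k are 0.0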
+                    if (k < K)
+                    {
+                        int iter;
+                        double data_feeder[4] = { 0.0 };
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter];
+                        ymm0 = _mm256_loadu_pd(data_feeder);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter];
+                        ymm3 = _mm256_loadu_pd(data_feeder);
+                        ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter];
+                        ymm3 = _mm256_loadu_pd(data_feeder);
+                        ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter];
+                        ymm3 = _mm256_loadu_pd(data_feeder);
+                        ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter];
+                        ymm3 = _mm256_loadu_pd(data_feeder);
+                        ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13);
+
+                    }
+
+                    //horizontal addition and storage of the data.
+                    //Results for the 4x1 block of C are stored here
+                    ymm4 = _mm256_hadd_pd(ymm4, ymm4);
+                    _mm256_storeu_pd(scratch, ymm4);
+                    result = scratch[0] + scratch[2];
+                    result *= (*alpha_cast);
+                    tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                    ymm7 = _mm256_hadd_pd(ymm7, ymm7);
+                    _mm256_storeu_pd(scratch, ymm7);
+                    result = scratch[0] + scratch[2];
+                    result *= (*alpha_cast);
+                    tC[1] = result/* + tC[1] * (*beta_cast)*/;
+
+                    ymm10 = _mm256_hadd_pd(ymm10, ymm10);
+                    _mm256_storeu_pd(scratch, ymm10);
+                    result = scratch[0] + scratch[2];
+                    result *= (*alpha_cast);
+                    tC[2] = result/* + tC[2] * (*beta_cast)*/;
+
+                    ymm13 = _mm256_hadd_pd(ymm13, ymm13);
+                    _mm256_storeu_pd(scratch, ymm13);
+                    result = scratch[0] + scratch[2];
+                    result *= (*alpha_cast);
+                    tC[3] = result/* + tC[3] * (*beta_cast)*/;
+
+                }
+            }
+            processed_row = row_idx;
+        }
+
+        // The edge case handling where M is not a multiple of 4
+        if (processed_row < M)
+        {
+            for (row_idx = processed_row; row_idx < M; row_idx += 1)
+            {
+                for (col_idx = 0; col_idx < N; col_idx += 1)
+                {
+                    tA = A + row_idx * lda;
+                    tB = B + col_idx * ldb;
+                    tC = C + col_idx * ldc + row_idx;
+                    // clear scratch registers.
+                    ymm4 = _mm256_setzero_pd();
+
+                    for (k = 0; (k + 3) < K; k += 4)
+                    {
+                        ymm0 = _mm256_loadu_pd(tB + 0);
+                        ymm3 = _mm256_loadu_pd(tA);
+                        ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+                        tA += 4;
+                        tB += 4;
+                    }
+
+                    // if K is not a multiple of 4, the tail is zero-padded into a temporary array before loading.
+                    if (k < K)
+                    {
+                        int iter;
+                        double data_feeder[4] = { 0.0 };
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter];
+                        ymm0 = _mm256_loadu_pd(data_feeder);
+
+                        for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter];
+                        ymm3 = _mm256_loadu_pd(data_feeder);
+                        ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4);
+
+                    }
+
+                    //horizontal addition and storage of the data.
+                    ymm4 = _mm256_hadd_pd(ymm4, ymm4);
+                    _mm256_storeu_pd(scratch, ymm4);
+                    result = scratch[0] + scratch[2];
+                    result *= (*alpha_cast);
+                    tC[0] = result/* + tC[0] * (*beta_cast)*/;
+
+                }
+            }
+        }
+
+        //copy/compute syrk values back to C
+        if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C
+        {
+            dim_t _i, _j;
+            if(bli_obj_is_lower(c)) //c is lower
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i <= 0 )
+                        {
+                            bli_ddcopys( *(C + _i*rsc + _j*ldc),
+                                         *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+            else //c is upper
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i >= 0 )
+                        {
+                            bli_ddcopys( *(C + _i*rsc + _j*ldc),
+                                         *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+        }
+        else //when beta is non-zero, multiply and store result to C
+        {
+            dim_t _i, _j;
+            if(bli_obj_is_lower(c)) //c is lower
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i <= 0 )
+                        {
+                            bli_dddxpbys( *(C + _i*rsc + _j*ldc),
+                                          *(beta_cast),
+                                          *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+            else //c is upper
+            {
+                for ( _j = 0; _j < N; ++_j )
+                    for ( _i = 0; _i < M; ++_i )
+                        if ( (doff_t)_j - (doff_t)_i >= 0 )
+                        {
+                            bli_dddxpbys( *(C + _i*rsc + _j*ldc),
+                                          *(beta_cast),
+                                          *(matCbuf + _i*rs_matC + _j*ldc_matC) );
+                        }
+            }
+        }
+
+        return BLIS_SUCCESS;
+    }
+    else
+        return BLIS_NONCONFORMAL_DIMENSIONS;
+}
+
+#endif
+
diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c
new file mode 100644
index 000000000..ca8e5b142
--- /dev/null
+++ b/kernels/zen/3/bli_trsm_small.c
@@ -0,0 +1,15021 @@
+/*
+
+BLIS
+An object-based framework for developing high-performance BLAS-like
+libraries.
+
+Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of The University of Texas at Austin nor the names
+of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" +#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM +#include "immintrin.h" + +#define GEMM_BLK_V1 8 //Block size to perform gemm and apply trsm +#define GEMM_ACCUM_A 1 //Peform B1=B1-(B0*A0) operation instead of B1'=(B0*A0) and then B1=B1-B1' +#define OPT_CACHE_BLOCKING_L1 1 //Perform trsm block-wise in blocks of GEMM_BLK_V1 instead of all columns of B together. +#define REARRANGE_SHFL 0 //Rearrange operations using blend or shuffle +#define BLI_AlXB_M_SP 16 +#define BLI_AlXB_M_DP 16 +#define BLI_XAltB_N_SP 128 +#define BLI_XAltB_N_DP 64 +#define BLI_AutXB_M_SP 64 +#define BLI_AutXB_N_SP 128 + +static void (*fp_blis_strsm_microkernel)( float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); +static void blis_strsm_microkernel( float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); +static void blis_strsm_microkernel_alpha( float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alphaVal + ); +static void blis_strsm_microkernel_unitDiag( float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); +static void blis_strsm_microkernel_alpha_unitDiag( float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alphaVal + ); +static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b); +static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alphaVal); +static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b); +static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alphaVal); + +static void (*fp_blis_dtrsm_microkernel)( double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); + +static void blis_dtrsm_microkernel( double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); + +static void blis_dtrsm_microkernel_alpha( double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal + ); + +static void blis_dtrsm_microkernel_unitDiag( double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ); + +static void blis_dtrsm_microkernel_alpha_unitDiag( double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal + ); + +static void dtrsm_XAtB_block_allSmallSizedMatrices(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b); +static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal); +static void dtrsm_XAtB_block_allSmallSizedMatrices_unitDiag(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + 
int rs_b, + int cs_l, + int cs_b); +static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal); +static void trsm_AutXB_block_allSmallSizedMatrices(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b); +static void trsm_AutXB_block_allSmallSizedMatrices_alpha(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alpha); +static void trsm_AutXB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b); +static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, + float *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + float alpha); + +//AX = B; A is lower triangular; No transpose; single precision +static err_t bli_strsm_small_AlXB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); +//A.'X = B; A is upper triangular; A has to be transposed; single precision +static err_t bli_strsm_small_AutXB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); + +//XA.' = B; A is lower triangular; A has to be transposed; single precision +static err_t bli_strsm_small_XAltB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); +//AX = B; A is lower triangular; No transpose; double precision +static err_t bli_dtrsm_small_AlXB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); + + +//A.'X = B; A is upper triangular; A has to be transposed; double precision +static err_t bli_dtrsm_small_AutXB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); + + +//XA.' = B; A is lower triangular; A has to be transposed; double precision +static err_t bli_dtrsm_small_XAltB + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); + void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b); +/* +* The bli_trsm_small implements unpacked version of TRSM +* Currently only column-major is supported, A & B are column-major +* Input: A: MxM (triangular matrix) +* B: MxN matrix +* Output: X: MxN matrix such that AX = alpha*B or XA = alpha*B or A'X = alpha*B or XA' = alpha*B +* Here the output X is stored in B +* The custom-kernel will be called only when M*(M+N)* sizeof(Matrix Elements) < L3 cache +*/ +err_t bli_trsm_small + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING + return BLIS_NOT_YET_IMPLEMENTED; +#endif + + // If alpha is zero, B matrix will become zero after scaling & hence solution is also zero matrix + if (bli_obj_equals(alpha, &BLIS_ZERO)) + { + return BLIS_NOT_YET_IMPLEMENTED; // scale B by alpha + } + // We have to call matrix scaling if alpha != 1.0 + + // if row major format return. Check this again. + if ((bli_obj_row_stride(a) != 1) || + (bli_obj_row_stride(b) != 1)) + { + return BLIS_INVALID_ROW_STRIDE; + } + + num_t dt = ((*b).info & (0x7 << 0)); + + // only float and double datatypes are supported as of now. 
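+    // Masking the info bitfield with 0x7 extracts the low bits that encode the
+    // object's storage datatype (domain and precision), which is what the
+    // checks below compare against BLIS_FLOAT and BLIS_DOUBLE.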
+ if (dt != BLIS_DOUBLE && dt != BLIS_FLOAT) + { + return BLIS_EXPECTED_REAL_DATATYPE; + } + + // A is expected to be triangular in trsm + if (!bli_obj_is_upper_or_lower (a)) + { + return BLIS_EXPECTED_TRIANGULAR_OBJECT; + } + + // can use other control structs - even can use array of function pointers, + // indexed by a number with bits formed by f('side', 'uplo', 'transa', dt). + // In the below implementation, based on the number of finally implemented + // cases, can move the checks with more cases higher up. + if (side == BLIS_LEFT) + { + if (bli_obj_has_trans(a)) + { + if (dt == BLIS_DOUBLE) + { + if (bli_obj_is_upper(a)) + { + //A.'X = B; A is upper triangular; A has to be transposed; double precision +#if 0 // planning to implement this in this iteration + return bli_dtrsm_small_AutXB(side, alpha, a, b, cntx, cntl); +#else + return BLIS_NOT_YET_IMPLEMENTED; +#endif + } + else + { + return BLIS_NOT_YET_IMPLEMENTED; + } + } + else if (dt == BLIS_FLOAT) + { + if (bli_obj_is_upper(a)) + { + //A.'X = B; A is upper triangular; A has to be transposed; single precision + return bli_strsm_small_AutXB(side, alpha, a, b, cntx, cntl); + } + else + { + return BLIS_NOT_YET_IMPLEMENTED; + } + } + } + else + { + if (dt == BLIS_DOUBLE) + { + if (bli_obj_is_upper(a)) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + else + { + //AX = B; A is lower triangular; No transpose; double precision + return bli_dtrsm_small_AlXB(side, alpha, a, b, cntx, cntl); + } + } + else if (dt == BLIS_FLOAT) + { + if (bli_obj_is_upper(a)) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + else + { + //AX = B; A is lower triangular; No transpose; single precision + return bli_strsm_small_AlXB(side, alpha, a, b, cntx, cntl); + } + } + } + } + else + { + if (bli_obj_has_trans(a)) + { + if (dt == BLIS_DOUBLE) + { + if (bli_obj_is_upper(a)) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + else + { + //XA.' = B; A is lower triangular; A has to be transposed; double precision + return bli_dtrsm_small_XAltB(side, alpha, a, b, cntx, cntl); + } + } + else if (dt == BLIS_FLOAT) + { + if (bli_obj_is_upper(a)) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + else + { + //XA.' 
= B; A is lower triangular; A has to be transposed; single precision + return bli_strsm_small_XAltB(side, alpha, a, b, cntx, cntl); + } + } + } + else + { + return BLIS_NOT_YET_IMPLEMENTED; + } + } + + return BLIS_NOT_YET_IMPLEMENTED; +}; + + +static void trsm_small_AlXB ( + float *A, + float *B, + int M, + int N, + int lda, + int ldb + ) +{ + int i; + int j; + int k; + + // Need to incorporate alpha + + for (k = 0; k < M; k++) + { + float lkk_inv = 1.0/A[k+k*lda]; + + for (j = 0; j < N; j++) + { + B[k + j*ldb] *= lkk_inv; + + for (i = k+1; i < M; i++) + { + B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; + } + } + }// k -loop + +}// end of function + + +// Test code: +void gemm_small( float *ptr_l, + float *ptr_b, + int blk_m, + int blk_n, + float *ptr_gemmOut, + int cs_l, + int cs_b, + int rs_l, + int rs_b, + float alpha, + float beta) +{ + int i, j, k; + + for (i = 0; i < blk_m; i++) + { + for (j = 0; j < blk_n; j++) + { + float t = 0.0; + for (k = 0; k < blk_m; k++) + { + t += (ptr_l[i*rs_l + k* cs_l] * ptr_b[k*rs_b + j*cs_b]); + } + ptr_gemmOut[i*rs_b + j*cs_b] = beta * ptr_gemmOut[i*rs_b + j*cs_b] + alpha * t; + } + } +} + +/* + * AX = Alpha*B, Double precision, A:lower triangular + * THIS KERNEL SUPPORTS MATRIX SIZE OF THE FORM BLI_AlXB_M_DPX4*i, WHERE i IS AN INTEGER + */ + +static err_t bli_dtrsm_small_AlXB ( + side_t side, + obj_t* AlphaObj, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ + obj_t alpha, beta; // gemm parameters + obj_t Ga, Gb, Gc; // for GEMM + int m = bli_obj_length(b); // number of rows of matrix B + int n = bli_obj_width(b); // number of columns of matrix B + + int lda = bli_obj_col_stride(a); // column stride of A + int ldb = bli_obj_col_stride(b); // column stride of B + + int rsa = bli_obj_row_stride(a); // row stride of A + int rsb = bli_obj_row_stride(b); // row stride of B + + int i = 0; + int j; + int blk_size = 4; + int isUnitDiag = bli_obj_has_unit_diag(a); + + double alphaVal; + double *L = a->buffer; + double *B = b->buffer; + + if (m != BLI_AlXB_M_DP || (n&3) != 0) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + + alphaVal = *((double *)AlphaObj->buffer); + + /* Small _GEMM preparation code */ + bli_obj_create( BLIS_DOUBLE, 1, 1, 0, 0, &alpha ); + bli_obj_create( BLIS_DOUBLE, 1, 1, 0, 0, &beta ); + + /* B = B - A*B */ + bli_setsc( -(1.0), 0.0, &alpha ); + bli_setsc( (1.0), 0.0, &beta ); + + bli_obj_create_with_attached_buffer( BLIS_DOUBLE, blk_size, blk_size, a->buffer, rsa, lda, &Ga); + bli_obj_create_with_attached_buffer( BLIS_DOUBLE, blk_size, n, b->buffer, rsb, ldb, &Gb); + bli_obj_create_with_attached_buffer( BLIS_DOUBLE, blk_size, n, b->buffer, rsb, ldb, &Gc); + + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Ga ); + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gb ); + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gc ); + + //first block of trsm + Gb.buffer = (void*)(B + i); + + if (alphaVal != 1) + { + if (isUnitDiag == 0) + { + blis_dtrsm_microkernel_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + fp_blis_dtrsm_microkernel = blis_dtrsm_microkernel; + } + else + { + blis_dtrsm_microkernel_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + fp_blis_dtrsm_microkernel = blis_dtrsm_microkernel_unitDiag; + } + bli_setsc( alphaVal, 0.0, &beta ); + } + else + { + if (isUnitDiag == 0) + { + blis_dtrsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + fp_blis_dtrsm_microkernel = blis_dtrsm_microkernel; + } + else + { + blis_dtrsm_microkernel_unitDiag((L + i * lda + i), (B + i), 
m, n, rsa, rsb, lda, ldb); + fp_blis_dtrsm_microkernel = blis_dtrsm_microkernel_unitDiag; + } + } + + +//gemm update + for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT + { + Ga.buffer = (void*)(L + j + i*lda); + Gc.buffer = (void*)(B + j); + bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb + } + bli_setsc( (1.0), 0.0, &beta ); + + //trsm of remaining blocks + for (i = blk_size; i < m; i += blk_size) + { + Gb.buffer = (void*)(B + i); + + fp_blis_dtrsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + + + for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT + { + Ga.buffer = (void*)(L + j + i*lda); + Gc.buffer = (void*)(B + j); + + bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb + } + + } // End of for loop - i + + return BLIS_SUCCESS; + +} + + +/* + * AX = Alpha*B, Single precision, A: lower triangular + * This kernel implementation supports matrices A and B such that m is equal to BLI_AlXB_M_SP and n is mutiple of 8 + */ +static err_t bli_strsm_small_AlXB ( + side_t side, + obj_t* AlphaObj, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ + obj_t alpha, beta; // gemm parameters + obj_t Ga, Gb, Gc; // for GEMM + int m = bli_obj_length(b); // number of rows of matrix B + int n = bli_obj_width(b); // number of columns of matrix B + + int lda = bli_obj_col_stride(a); // column stride of A + int ldb = bli_obj_col_stride(b); // column stride of B + + int rsa = bli_obj_row_stride(a); // row stride of A + int rsb = bli_obj_row_stride(b); // row stride of B + + int i = 0; + int j; + int blk_size = 8; + int isUnitDiag = bli_obj_has_unit_diag(a); + + float alphaVal; + float *L = a->buffer; + float *B = b->buffer; + + if (m != BLI_AlXB_M_SP || (n&7) != 0) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + if ( (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + + alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); + + /* Small _GEMM preparation code */ + bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &alpha ); + bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &beta ); + + /* B = B - A*B */ + bli_setsc( -(1.0), 0.0, &alpha ); + bli_setsc( (1.0), 0.0, &beta ); + + + bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, blk_size, a->buffer, rsa, lda, &Ga); + bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gb); + bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gc); + + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Ga ); + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gb ); + bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gc ); + + //first block of trsm + Gb.buffer = (void*)(B + i); + + //trsm of first 8xn block + if (alphaVal != 1) + { + if (isUnitDiag == 0) + { + blis_strsm_microkernel_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + fp_blis_strsm_microkernel = blis_strsm_microkernel; + } + else + { + blis_strsm_microkernel_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; + } + bli_setsc( alphaVal, 0.0, &beta ); + } + else + { + if (isUnitDiag == 0) + { + blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + fp_blis_strsm_microkernel = blis_strsm_microkernel; + } + else + { + blis_strsm_microkernel_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + 
fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; + } + } + + //gemm update + for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT + { + Ga.buffer = (void*)(L + j + i*lda); + Gc.buffer = (void*)(B + j); + + bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb + } + + //trsm of remaining blocks + for (i = blk_size; i < m; i += blk_size) + { + Gb.buffer = (void*)(B + i); + + fp_blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + + for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT + { + Ga.buffer = (void*)(L + j + i*lda); + Gc.buffer = (void*)(B + j); + + bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb + } + + } // End of for loop - i + + return BLIS_SUCCESS; +} + +void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) +{ + int i, j, k, l; + float inv_l; + + inv_l = 1.0 / *ptr_l; + + for (j = 0; j < numCols_b; j += blk_width) + { + for (l = j; l < (j+blk_width); l++) + { + ptr_b[l*cs_b] = ptr_b[l*cs_b] * inv_l; + } + + for (i = 1; i < blk_height; i++) + { + for (l = j; l < (j+blk_width); l++) + { + for (k = 0; k < i; k++) + { + ptr_b[i*rs_b + l*cs_b] -= (ptr_b[k*rs_b + l*cs_b] * ptr_l[i*rs_l + k*cs_l]); + } + ptr_b[i*rs_b + l*cs_b] = ptr_b[i*rs_b + l*cs_b] / ptr_l[i*rs_l + i*cs_l]; + } + } + } +} + + +/* + * XA' = Alpha*B, Double precision, A:lower triangular + * This kernel implementation supports matrices A and B such that + * m and n are multiples of 4 and n less than or equal to BLI_XAltB_N_DP + */ + +static err_t bli_dtrsm_small_XAltB( + side_t side, + obj_t* AlphaObj, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ + + int m = bli_obj_length(a); // number of rows of matrix B + int n = bli_obj_length(b); // number of columns of matrix B + + int lda = bli_obj_col_stride(a); // column stride of A + int ldb = bli_obj_col_stride(b); // column stride of B + + int rsa = bli_obj_row_stride(a); // row stride of A + int rsb = bli_obj_row_stride(b); // row stride of B + + int i = 0; + int isUnitDiag = bli_obj_has_unit_diag(a); + + double alphaVal; + double *L = a->buffer; + double *B = b->buffer; + + if ((m&3) != 0 || (n&3) != 0) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + if ( n > BLI_XAltB_N_DP || (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + alphaVal = *((double *)AlphaObj->buffer); + if (alphaVal != 1) + { + if (isUnitDiag == 0) + { + dtrsm_XAtB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + else + { + dtrsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + } + else + { + if (isUnitDiag == 0) + { + dtrsm_XAtB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + else + { + dtrsm_XAtB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + } + return BLIS_SUCCESS; + +} + + +/* + * XA' = Alpha*B, Single precision, A: lower triangular + * This kernel implementation supports matrices A and B such that + * m and n are multiples of 8 and n is less than or equal to BLI_XAltB_N_SP + */ +static err_t bli_strsm_small_XAltB( + side_t side, + obj_t* AlphaObj, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ + int m = bli_obj_length(a); // number of rows of 
matrix B + int n = bli_obj_length(b); // number of columns of matrix B + + int lda = bli_obj_col_stride(a); // column stride of A + int ldb = bli_obj_col_stride(b); // column stride of B + + int rsa = bli_obj_row_stride(a); // row stride of A + int rsb = bli_obj_row_stride(b); // row stride of B + + int i = 0; + int isUnitDiag = bli_obj_has_unit_diag(a); + + float alphaVal; + float *L = a->buffer; + float *B = b->buffer; + + if ((m&7) != 0 || (n&7) != 0) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + if ( n > BLI_XAltB_N_SP || (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + + alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); + + if (alphaVal != 1) + { + if (isUnitDiag == 0) + { + trsm_XAtB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + else + { + trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + } + else + { + if (isUnitDiag == 0) + { + trsm_XAtB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + else + { + trsm_XAtB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + } + return BLIS_SUCCESS; +} + +/* + * A'X = Alpha*B, Single precision, A: upper triangular + * This kernel implementation supports matrices A and B such that + * m and n are multiples of 8, m is less than or equal to BLI_AutXB_M_SP and n is less than or equal to BLI_AutXB_N_SP + */ +static err_t bli_strsm_small_AutXB( + side_t side, + obj_t* AlphaObj, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) +{ + int m = bli_obj_width(a); // number of rows of matrix A (since At, so width is taken) + int n = bli_obj_width(b); // number of columns of matrix B + + int lda = bli_obj_col_stride(a); // column stride of A + int ldb = bli_obj_col_stride(b); // column stride of B + + int rsa = bli_obj_row_stride(a); // row stride of A + int rsb = bli_obj_row_stride(b); // row stride of B + + int i = 0; + int isUnitDiag = bli_obj_has_unit_diag(a); + + float alphaVal; + float *L = a->buffer; + float *B = b->buffer; + + if ((m&7) != 0 || (n&7) != 0) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + if ( m > BLI_AutXB_M_SP || n > BLI_AutXB_N_SP || (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) + { + return BLIS_NOT_YET_IMPLEMENTED; + } + + alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); + + if (alphaVal != 1) + { + if (isUnitDiag == 0) + { + trsm_AutXB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + else + { + trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); + } + } + else + { + if (isUnitDiag == 0) + { + trsm_AutXB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + else + { + trsm_AutXB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); + } + } + return BLIS_SUCCESS; +} +/* +* AX=B A=LOWER TRIANGULAR, NO TRANSPOSE, NON-UNITDIAGONAL +* ALPHA != 1; +*/ +static void blis_dtrsm_microkernel_alpha(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal + ) +{ + double ones = 1.0; + int j; + int cs_b_offset[2]; + double *ptr_b_dup; + + __m256d mat_b_col[4]; + __m256d mat_b_rearr[4]; + __m256d mat_a_cols[4]; + __m256d mat_a_cols_rearr[10]; + __m256d mat_a_diag_inv[4]; + __m256d 
reciprocal_diags; + __m256d alphaReg; + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + + reciprocal_diags = _mm256_broadcast_sd((double const *)&ones); + alphaReg = _mm256_broadcast_sd((double const *)&alphaVal); + + //read first set of 4x4 block of B into registers + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_sd((double const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_sd((double const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_sd((double const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_sd((double const *)(ptr_l+3)); + + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_sd((double const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //4th col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + numCols_b -= 4; // blk_width = 4 + + //compute reciprocals of L(i,i) and broadcast in registers + mat_a_diag_inv[0] = _mm256_unpacklo_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); + mat_a_diag_inv[1] = _mm256_unpacklo_pd(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); + + mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C); + reciprocal_diags = _mm256_div_pd(reciprocal_diags, mat_a_diag_inv[0]); + + for(j = 0;j < numCols_b; j += 4) + { + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], alphaReg); + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], alphaReg); + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], alphaReg); + //extract a00 + mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], mat_a_diag_inv[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags, 0x03); + mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA 
operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags, 0x0C); + mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], mat_a_diag_inv[3]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Read next set of B columns + ptr_b += (cs_b+cs_b_offset[1]); + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + + //Store the computed B columns + _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + + } + //Last block trsm processing + + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = 
_mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], alphaReg); + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], alphaReg); + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], alphaReg); + //extract a00 + mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], mat_a_diag_inv[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags, 0x03); + mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags, 0x0C); + mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], mat_a_diag_inv[3]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = 
_mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Store the computed B columns + _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + +} +/* +*AX=B A=LOWER TRIANGULAR, NO TRANSPOSE, UNITDIAGONAL +*ALPHA != 1; +*/ +static void blis_dtrsm_microkernel_alpha_unitDiag(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alphaVal + ) +{ + + int j; + int cs_b_offset[2]; + double *ptr_b_dup; + + __m256d mat_b_col[4]; + __m256d mat_b_rearr[4]; + __m256d mat_a_cols[4]; + __m256d mat_a_cols_rearr[10]; + __m256d alphaReg; + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + + alphaReg = _mm256_broadcast_sd((double const *)&alphaVal); + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_sd((double const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_sd((double const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_sd((double const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_sd((double const *)(ptr_l+3)); + + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_sd((double const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //4th col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + numCols_b -= 4; // blk_width = 4 + + for(j = 0;j < numCols_b; j += 4) + { + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], alphaReg); + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], alphaReg); + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], 
alphaReg); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Read next set of B columns + ptr_b += (cs_b+cs_b_offset[1]); + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + + //Store the computed B columns + _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + + } + //Last block trsm processing + + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], alphaReg); + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], alphaReg); + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], alphaReg); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], 
mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Store the computed B columns + _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + +} +/* +*AX = B A= LOWERTRIANGULAR, NO TRANSPOSE, NON-UNITDIAGONAL +*ALPHA = 1 +*/ +static void blis_dtrsm_microkernel(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b + ) +{ + double ones = 1.0; + int j; + int cs_b_offset[2]; + double *ptr_b_dup; + + __m256d mat_b_col[4]; + __m256d mat_b_rearr[4]; + __m256d mat_a_cols[4]; + __m256d mat_a_cols_rearr[10]; + __m256d mat_a_diag_inv[4]; + __m256d reciprocal_diags; + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + + reciprocal_diags = _mm256_broadcast_sd((double const *)&ones); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); + //row2 = (cs_l << 1); + //row4 = (cs_l << 2); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_sd((double const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_sd((double const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_sd((double const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_sd((double const *)(ptr_l+3)); + + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_sd((double const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = 
_mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //4th col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + numCols_b -= 4; // blk_width = 4 + + //compute reciprocals of L(i,i) and broadcast in registers + mat_a_diag_inv[0] = _mm256_unpacklo_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); + mat_a_diag_inv[1] = _mm256_unpacklo_pd(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); + + mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C); + reciprocal_diags = _mm256_div_pd(reciprocal_diags, mat_a_diag_inv[0]); + + for(j = 0;j < numCols_b; j += 4) + { + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + //extract a00 + mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], mat_a_diag_inv[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags, 0x03); + mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags, 0x0C); + mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], 
mat_a_diag_inv[3], 0x11); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], mat_a_diag_inv[3]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Read next set of B columns + ptr_b += (cs_b+cs_b_offset[1]); + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1])); + + //Store the computed B columns + _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + + } + //Last block trsm processing + + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + //extract a00 + mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_pd(mat_b_rearr[0], mat_a_diag_inv[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags, 0x03); + mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //Perform mul 
+    //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+    mat_b_rearr[1] = _mm256_mul_pd(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+    //extract diag a22 from a
+    mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags, 0x00);
+    mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11);
+
+    //(Row2): FMA operations of b2 with elements L(2,1) and L(3,1)
+    mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+    mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+
+    //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+    mat_b_rearr[2] = _mm256_mul_pd(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+    //extract diag a33 from a
+    mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags, 0x0C);
+    mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11);
+
+    //(Row3): FMA operation of b3 with element L(3,2)
+    mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+
+    //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+    mat_b_rearr[3] = _mm256_mul_pd(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+    //--> Transpose and store results of columns of B block <--//
+    ////unpacklow////
+    mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]);
+    mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]);
+
+    //rearrange low elements
+    mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20);
+    mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31);
+
+    ////unpackhigh////
+    mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]);
+    mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]);
+
+    //rearrange high elements
+    mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20);
+    mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31);
+
+    //Store the computed B columns
+    _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+}
+/*
+* AX = B, A = lower triangular, no transpose, unit diagonal
+* alpha = 1
+*/
+static void blis_dtrsm_microkernel_unitDiag(double *ptr_l,
+                    double *ptr_b,
+                    int numRows_lb,
+                    int numCols_b,
+                    int rs_l,
+                    int rs_b,
+                    int cs_l,
+                    int cs_b
+                    )
+{
+    //double ones = 1.0;
+    int j;
+    int cs_b_offset[2];
+    double *ptr_b_dup;
+
+    __m256d mat_b_col[4];
+    __m256d mat_b_rearr[4];
+    __m256d mat_a_cols[4];
+    __m256d mat_a_cols_rearr[10];
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+
+    // ---> assumes the matrix size is a multiple of 4 rows and 4 cols <--- //
+
+    //read first 4x4 block of B into registers, where 4 is the blk_height and 4 is the blk_width for B
+    mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b);
+    //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+    //row2 = (cs_l << 1);
+    //row4 = (cs_l << 2);
+    mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + (cs_b)));
+    //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+    mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0]));
+    //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+    mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b +
cs_b_offset[1])); + + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_sd((double const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_sd((double const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_sd((double const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_sd((double const *)(ptr_l+3)); + + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_sd((double const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_sd((double const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + //4th col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_sd((double const *)(ptr_l + 3)); + + numCols_b -= 4; // blk_width = 4 + + + for(j = 0;j < numCols_b; j += 4) + { + ptr_b_dup = ptr_b; + /*Shuffle to rearrange/transpose 8x4 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange low elements + mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20); + mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31); + + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]); + + //rearrange high elements + mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20); + mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31); + + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange low elements + mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20); + mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]); + + //rearrange high elements + mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20); + mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31); + + //Read next set of B columns + ptr_b += (cs_b+cs_b_offset[1]); + mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b); + mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b 
+ (cs_b)));
+        mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0]));
+        mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1]));
+
+        //Store the computed B columns
+        _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+    }
+    //Last block trsm processing
+
+    ptr_b_dup = ptr_b;
+    /*Shuffle to rearrange/transpose the 4x4 block of B into contiguous row-wise registers*/
+
+    ////unpacklow////
+    mat_b_rearr[1] = _mm256_unpacklo_pd(mat_b_col[0], mat_b_col[1]);
+    mat_b_rearr[3] = _mm256_unpacklo_pd(mat_b_col[2], mat_b_col[3]);
+
+    //rearrange low elements
+    mat_b_rearr[0] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x20);
+    mat_b_rearr[2] = _mm256_permute2f128_pd(mat_b_rearr[1],mat_b_rearr[3],0x31);
+
+    ////unpackhigh////
+    mat_b_col[0] = _mm256_unpackhi_pd(mat_b_col[0], mat_b_col[1]);
+    mat_b_col[1] = _mm256_unpackhi_pd(mat_b_col[2], mat_b_col[3]);
+
+    //rearrange high elements
+    mat_b_rearr[1] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x20);
+    mat_b_rearr[3] = _mm256_permute2f128_pd(mat_b_col[0],mat_b_col[1],0x31);
+
+    //(Row1): FMA operations of b1 with elements L(1,0) to L(3,0)
+    mat_b_rearr[1] = _mm256_fnmadd_pd(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+    mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+    mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+
+    //(Row2): FMA operations of b2 with elements L(2,1) and L(3,1)
+    mat_b_rearr[2] = _mm256_fnmadd_pd(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+    mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+
+    //(Row3): FMA operation of b3 with element L(3,2)
+    mat_b_rearr[3] = _mm256_fnmadd_pd(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+
+    //--> Transpose and store results of columns of B block <--//
+    ////unpacklow////
+    mat_a_cols[1] = _mm256_unpacklo_pd(mat_b_rearr[0], mat_b_rearr[1]);
+    mat_a_cols[3] = _mm256_unpacklo_pd(mat_b_rearr[2], mat_b_rearr[3]);
+
+    //rearrange low elements
+    mat_a_cols[0] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x20);
+    mat_a_cols[2] = _mm256_permute2f128_pd(mat_a_cols[1],mat_a_cols[3],0x31);
+
+    ////unpackhigh////
+    mat_b_rearr[0] = _mm256_unpackhi_pd(mat_b_rearr[0], mat_b_rearr[1]);
+    mat_b_rearr[1] = _mm256_unpackhi_pd(mat_b_rearr[2], mat_b_rearr[3]);
+
+    //rearrange high elements
+    mat_a_cols[1] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x20);
+    mat_a_cols[3] = _mm256_permute2f128_pd(mat_b_rearr[0],mat_b_rearr[1],0x31);
+
+    //Store the computed B columns
+    _mm256_storeu_pd((double *)ptr_b_dup, mat_a_cols[0]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+    _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+}
+///////////////////////////// AX=B ///////////////////////////////
+/*
+* AX = B, A = lower triangular, no transpose, non-unit diagonal
+* alpha != 1 (single-precision variant)
+*/
+static void blis_strsm_microkernel_alpha(float *ptr_l,
+                    float *ptr_b,
+                    int numRows_lb,
+                    int numCols_b,
+                    int rs_l,
+                    int rs_b,
+                    int cs_l,
+                    int cs_b,
+                    float alphaVal)
+{
+    float ones = 1.0;
+    int j;
+    int cs_b_offset[6];
+    //int row2, row4,
row6; + float *ptr_b_dup; + + //70 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_cols[8]; + __m256 mat_a_cols_rearr[36]; + __m256 mat_a_diag_inv[8]; + __m256 reciprocal_diags; + __m256 alphaReg; + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + + //reciprocal_diags = _mm256_loadu_ps((float const *)ones); + reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); + alphaReg = _mm256_broadcast_ss((float const *)&alphaVal); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); + //row2 = (cs_l << 1); + //row4 = (cs_l << 2); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); + //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); + //row6 = row2 + row4; + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); + + //reciprocal_diags = _mm256_loadu_ps((float const *)ones); + + //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L + /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ + + //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers + //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. 
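+    //Note on data layout: mat_a_cols_rearr[] holds the 8x8 lower triangle of L
+    //packed row-wise, with element L(i,j) broadcast into slot i*(i+1)/2 + j
+    //(e.g. L(3,1) lands in slot 7). Each slot is a full ymm broadcast, so one
+    //FMA applies L(i,j) to all eight right-hand-side columns at once. The loop
+    //below then performs forward substitution row by row:
+    //  X(i,:) = ( alpha*B(i,:) - sum_{j<i} L(i,j)*X(j,:) ) * (1/L(i,i))
+    //with the reciprocals 1/L(i,i) computed once into reciprocal_diags.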
+ //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); + mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); + mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); + mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); + mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //4rth col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //5th col + ptr_l += cs_l; + mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //6th col + ptr_l += cs_l; + mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //7th col + ptr_l += cs_l; + mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //7th col + ptr_l += cs_l; + mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + numCols_b -= 8; // blk_width = 8 + + //compute reciprocals of L(i,i) and broadcast in registers + mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); + mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); + mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); + mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); + + //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); + //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], 
mat_a_diag_inv[1], 0xCC);
+    mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+    mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+    //reciprocal of diagonal elements
+    reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+    //Loop over the columns of B, processing blk_width (8) columns per iteration
+    for (j = 0; j < numCols_b; j += 8)
+    {
+        ptr_b_dup = ptr_b;
+
+        /*Shuffle to rearrange/transpose the 8x8 block of B into contiguous row-wise registers*/
+
+        ////unpacklow////
+        mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+        mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+        mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+        mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+        //Rearrange low elements
+#if REARRANGE_SHFL == 1
+        mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+        mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+        mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+        mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+        mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+        mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+        mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+        mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+        mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+        mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+        //Merge rearranged low elements into complete rows
+        mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+        mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+        mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+        mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+        mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+        mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+        mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+        mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+
+        ////unpackhigh////
+        mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+        mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+        mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+        mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+        //Rearrange high elements
+#if REARRANGE_SHFL == 1
+        mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+        mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+        mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+        mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+        mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+        mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+        mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+        mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+        mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+        mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+        //extract diag a00 from a
+        mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+        mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0)
element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = 
_mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Read next set of B columns + ptr_b += (cs_b + cs_b_offset[5]); + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + 
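+        //the next tile's loads were issued above, ahead of these stores, so
+        //the loads can start while the solved 8x8 tile is written back in
+        //place over B (eight unaligned column stores at stride cs_b)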
_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + + //end loop of cols + } + + //Last block trsm processing + ptr_b_dup = ptr_b; + + /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = 
_mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], 
mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + 
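+    //shuffle imm 0x44 keeps elements {a0,a1,b0,b1} of each 128-bit half and
+    //0xEE keeps {a2,a3,b2,b3}; the _mm256_permute2f128_ps pair that follows
+    //(0x20 = low halves, 0x31 = high halves) stitches them into full
+    //transposed rows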
mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + + //end loop of cols +} + +static void blis_strsm_microkernel_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal) +{ + //float ones = 1.0; + int j; + int cs_b_offset[6]; + //int row2, row4, row6; + float *ptr_b_dup; + + //70 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_cols[8]; + 
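+    //unit-diagonal variant: L(i,i) == 1, so the diagonal-reciprocal registers
+    //(mat_a_diag_inv, reciprocal_diags) declared just below are not needed and
+    //stay commented out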
+static void blis_strsm_microkernel_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal)
+{
+ //float ones = 1.0;
+ int j;
+ int cs_b_offset[6];
+ //int row2, row4, row6;
+ float *ptr_b_dup;
+
+ //70 ymm (256-bit) registers used
+ __m256 mat_b_col[8];
+ __m256 mat_b_rearr[8];
+ __m256 mat_a_cols[8];
+ __m256 mat_a_cols_rearr[36];
+ //__m256 mat_a_diag_inv[8];
+ //__m256 reciprocal_diags;
+ __m256 alphaReg;
+
+ cs_b_offset[0] = (cs_b << 1);
+ cs_b_offset[1] = cs_b + cs_b_offset[0];
+ cs_b_offset[2] = (cs_b << 2);
+ cs_b_offset[3] = cs_b + cs_b_offset[2];
+ cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+ cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+ //reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+ //reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+ alphaReg = _mm256_broadcast_ss((float const *)&alphaVal);
+
+ // ---> assumes the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+ //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+ mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+ //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+ //row2 = (cs_l << 1);
+ //row4 = (cs_l << 2);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+ //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+ //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+ //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+ //row6 = row2 + row4;
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+ //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+ //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+ //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+ //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+ //reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+ //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L
+ /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+ //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+ //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
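[Editorial note, not part of the patch] mat_a_cols_rearr[] packs the lower triangle of the 8x8 L block row by row, so element (r, c) with c <= r lands at index r*(r+1)/2 + c. That is why the broadcasts below touch indices 0, 1, 3, 6, 10, 15, 21, 28 for column 0, then 2, 4, 7, 11, 16, 22, 29 for column 1, and so on. As a helper (illustrative only):

    /* Packed lower-triangle index: idx(r, c) = r*(r+1)/2 + c,
       e.g. idx(4, 1) == 11 and idx(7, 0) == 28. */
    static inline int tri_idx(int r, int c) { return r * (r + 1) / 2 + c; }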
+ //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+ //1st col
+ mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+ mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+ mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+ mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+ mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+ mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+ mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+ mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+ //2nd col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+ mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+ mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //3rd col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+ mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //4th col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //5th col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //6th col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //7th col
+ ptr_l += cs_l;
+ mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+ //8th col
+ //ptr_l += cs_l;
+ //mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+ numCols_b -= 8; // blk_width = 8
+
+ //compute reciprocals of L(i,i) and broadcast in registers
+ //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+ //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+ //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+ //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+ //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+ //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+ //mat_a_diag_inv[0] =
_mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); + //mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); + + //reciprocal of diagnol elements + //reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); + + //Start loop for cols of B to be processed in size of blk_width + for (j = 0; j < numCols_b; j += 8) + { + ptr_b_dup = ptr_b; + + /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): 
Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); + + //extract diag a11 from a + //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], 
mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], 
mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Read next set of B columns + ptr_b += (cs_b + cs_b_offset[5]); + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + 
_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + + //end loop of cols + } + + //Last block trsm processing + ptr_b_dup = ptr_b; + + /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal 
of L(0,0) element with 1st row elements of B + //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); + + //extract diag a11 from a + //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) 
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if 
REARRANGE_SHFL == 1
+ mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+ mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+ mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+ mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+ mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+ mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+ mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+ mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+ mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+ mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+ mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+ mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+ mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+
+ //end loop of cols
+}
+
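[Editorial note, not part of the patch] All three microkernels use the same in-register 8x8 transpose: _mm256_unpacklo_ps/_mm256_unpackhi_ps interleave column pairs, _mm256_shuffle_ps with masks 0x44/0xEE gathers groups of four, and _mm256_permute2f128_ps with 0x20/0x31 stitches the 128-bit lanes into full rows (the #else path swaps the shuffles for shuffle-plus-blend). The kernels interleave these steps with the FMAs; isolated into one routine, the REARRANGE_SHFL == 1 sequence looks like this sketch:

    #include <immintrin.h>

    /* Transpose 8 rows of 8 floats in registers; r[] holds rows on entry
       and columns on exit. */
    static void transpose8x8_ps(__m256 r[8])
    {
        __m256 t0 = _mm256_unpacklo_ps(r[0], r[1]), t1 = _mm256_unpackhi_ps(r[0], r[1]);
        __m256 t2 = _mm256_unpacklo_ps(r[2], r[3]), t3 = _mm256_unpackhi_ps(r[2], r[3]);
        __m256 t4 = _mm256_unpacklo_ps(r[4], r[5]), t5 = _mm256_unpackhi_ps(r[4], r[5]);
        __m256 t6 = _mm256_unpacklo_ps(r[6], r[7]), t7 = _mm256_unpackhi_ps(r[6], r[7]);
        __m256 s0 = _mm256_shuffle_ps(t0, t2, 0x44), s1 = _mm256_shuffle_ps(t0, t2, 0xEE);
        __m256 s2 = _mm256_shuffle_ps(t1, t3, 0x44), s3 = _mm256_shuffle_ps(t1, t3, 0xEE);
        __m256 s4 = _mm256_shuffle_ps(t4, t6, 0x44), s5 = _mm256_shuffle_ps(t4, t6, 0xEE);
        __m256 s6 = _mm256_shuffle_ps(t5, t7, 0x44), s7 = _mm256_shuffle_ps(t5, t7, 0xEE);
        r[0] = _mm256_permute2f128_ps(s0, s4, 0x20);   /* low lanes  */
        r[1] = _mm256_permute2f128_ps(s1, s5, 0x20);
        r[2] = _mm256_permute2f128_ps(s2, s6, 0x20);
        r[3] = _mm256_permute2f128_ps(s3, s7, 0x20);
        r[4] = _mm256_permute2f128_ps(s0, s4, 0x31);   /* high lanes */
        r[5] = _mm256_permute2f128_ps(s1, s5, 0x31);
        r[6] = _mm256_permute2f128_ps(s2, s6, 0x31);
        r[7] = _mm256_permute2f128_ps(s3, s7, 0x31);
    }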
+static void blis_strsm_microkernel_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+ //float ones = 1.0;
+ int j;
+ int cs_b_offset[6];
+ //int row2, row4, row6;
+ float *ptr_b_dup;
+
+ //70 ymm (256-bit) registers used
+ __m256 mat_b_col[8];
+ __m256 mat_b_rearr[8];
+ __m256 mat_a_cols[8];
+ __m256 mat_a_cols_rearr[36];
+ //__m256 mat_a_diag_inv[8];
+ //__m256 reciprocal_diags;
+
+ cs_b_offset[0] = (cs_b << 1);
+ cs_b_offset[1] = cs_b + cs_b_offset[0];
+ cs_b_offset[2] = (cs_b << 2);
+ cs_b_offset[3] = cs_b + cs_b_offset[2];
+ cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+ cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+ //reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+ //reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+
+ // ---> assumes the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+ //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+ mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+ //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+ //row2 = (cs_l << 1);
+ //row4 = (cs_l << 2);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+ //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+ //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+ //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+ //row6 = row2 + row4;
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+ //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+ //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+ //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+ //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+ //reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+ //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L
+ /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+ ptr_l += cs_l;
+ mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+ //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+ //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
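[Editorial note, not part of the patch] The cs_b_offset[] table built above precomputes column strides 2 through 7 of B with shifts and adds, so none of the eight loads or stores per panel needs a multiply, and the advance ptr_b += (cs_b + cs_b_offset[5]) used in the column loop steps exactly one 8-column panel. The identities, written as checks (needs <assert.h>):

    assert(cs_b_offset[0] == 2 * cs_b);          /* cs_b << 1       */
    assert(cs_b_offset[1] == 3 * cs_b);
    assert(cs_b_offset[2] == 4 * cs_b);          /* cs_b << 2       */
    assert(cs_b_offset[3] == 5 * cs_b);
    assert(cs_b_offset[4] == 6 * cs_b);
    assert(cs_b_offset[5] == 7 * cs_b);
    assert(cs_b + cs_b_offset[5] == 8 * cs_b);   /* one full panel  */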
+ //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); + //1st col + mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); + mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); + mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); + mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); + mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); + mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); + mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); + mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); + //2nd col + ptr_l += cs_l; + mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //3rd col + ptr_l += cs_l; + mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //4rth col + ptr_l += cs_l; + mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //5th col + ptr_l += cs_l; + mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //6th col + ptr_l += cs_l; + mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //7th col + ptr_l += cs_l; + mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + //8th col + //ptr_l += cs_l; + //mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + numCols_b -= 8; // blk_width = 8 + + //compute reciprocals of L(i,i) and broadcast in registers + //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); + + //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); + //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); + //mat_a_diag_inv[0] = 
_mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); + //mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); + + //reciprocal of diagnol elements + //reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); + + //Start loop for cols of B to be processed in size of blk_width + for (j = 0; j < numCols_b; j += 8) + { + ptr_b_dup = ptr_b; + + /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = 
_mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + //extract diag a11 from a + //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + //mat_a_diag_inv[4] = 
_mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = 
_mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Read next set of B columns + ptr_b += (cs_b + cs_b_offset[5]); + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + //end loop of cols + } + + //Last block trsm processing + ptr_b_dup = ptr_b; + + /*Shuffle to 
rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + //extract diag a11 from a + //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with 
elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], 
mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = 
_mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + //end loop of cols +} + +static void blis_strsm_microkernel(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) +{ + float ones = 1.0; + int j; + int cs_b_offset[6]; + //int row2, row4, row6; + float *ptr_b_dup; + + //70 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_cols[8]; + __m256 mat_a_cols_rearr[36]; + __m256 mat_a_diag_inv[8]; + __m256 reciprocal_diags; + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + + //reciprocal_diags = _mm256_loadu_ps((float const *)ones); + reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); + //row2 = (cs_l << 1); + //row4 = (cs_l << 2); + mat_b_col[1] = _mm256_loadu_ps((float 
const *)(ptr_b + (cs_b))); + //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); + //row6 = row2 + row4; + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); + + //reciprocal_diags = _mm256_loadu_ps((float const *)ones); + + //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L + /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); + ptr_l += cs_l; + mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ + + //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers + //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. 
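+
+    /*
+       Indexing note for the broadcasts below: the lower triangle of the
+       8x8 block of L is kept in a packed-triangular register array, with
+       element L(i,j) broadcast to all lanes of
+       mat_a_cols_rearr[i*(i+1)/2 + j].  Column 0 therefore lands in slots
+       0, 1, 3, 6, 10, 15, 21, 28, and the last diagonal L(7,7) in slot 35.
+       A scalar sketch of the same packing (illustration only, not part of
+       the kernel):
+
+           for (i = 0; i < 8; i++)
+               for (j = 0; j <= i; j++)
+                   packed[i*(i+1)/2 + j] = L[i + j*cs_l];
+    */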
+    //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+    //1st col
+    mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+    mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+    mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+    mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+    mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+    mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+    mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+    mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+    //2nd col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+    mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+    mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+    mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //3rd col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+    mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+    mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //4th col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+    mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //5th col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //6th col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //7th col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+    //8th col
+    ptr_l += cs_l;
+    mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+    numCols_b -= 8; // blk_width = 8
+
+    //compute reciprocals of L(i,i) and broadcast in registers
+    mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+    mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+    mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+    mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+    //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+    //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);
+    mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+    mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+    //reciprocal of diagonal elements
+    reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+    //Start loop for cols of B to be processed in size of blk_width
+    for (j = 0; j < numCols_b; j += 8)
+    {
+        ptr_b_dup = ptr_b;
+
+        /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+        ////unpacklow////
+        mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+        mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+        mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+        mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+        //Rearrange low elements
+#if REARRANGE_SHFL == 1
+        mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+        mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+        mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+        mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+        mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+        mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+        mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+        mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+        mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+        mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+        //Merge rearranged low elements into complete rows
+        mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+        mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+        mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+        mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+        ////unpackhigh////
+        mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+        mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+        mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+        mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+        //Rearrange high elements
+#if REARRANGE_SHFL == 1
+        mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+        mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+        mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+        mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+        mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+        mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+        mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+        mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+        mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+        mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+        //extract diag a00 from a
+        mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+        mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+        mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+        //Merge rearranged high elements into complete rows
+        mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
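+
+        /*
+           The 0x20/0x31 controls used with _mm256_permute2f128_ps select
+           the low 128-bit lanes of both sources (0x20) or the high lanes
+           of both sources (0x31):
+
+               r = _mm256_permute2f128_ps(a, b, 0x20); // r = { a[0..3], b[0..3] }
+               r = _mm256_permute2f128_ps(a, b, 0x31); // r = { a[4..7], b[4..7] }
+
+           Together with the unpacklo/unpackhi and shuffle steps, these
+           merges complete the 8x8 in-register transpose of the B panel.
+        */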
mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[4] = 
_mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], 
mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Read next set of B columns + ptr_b += (cs_b + cs_b_offset[5]); + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + //end loop of cols + } + + //Last block trsm processing + ptr_b_dup = ptr_b; + + /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ + + ////unpacklow//// + 
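+
+    /*
+       This peeled copy of the loop body handles the final panel: inside
+       the loop above, the next set of B columns is loaded before the
+       current results are stored (presumably to overlap load latency with
+       the solve), so one panel is always left pending when the loop exits.
+       REARRANGE_SHFL selects between two rearrangement sequences that
+       appear to be equivalent: a shuffle-only path and a shuffle+blend
+       path, likely a trade-off in shuffle-port pressure.
+    */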
mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], 
mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + 
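+
+    /*
+       Scalar form of the substitution these mul/FMA steps implement for
+       each column b of the transposed B panel (a sketch, not the exact
+       code path):
+
+           for (i = 0; i < 8; i++) {
+               for (k = 0; k < i; k++)
+                   b[i] -= L(i,k) * b[k];   // _mm256_fnmadd_ps: c - (a*b)
+               b[i] *= inv_diag[i];         // reciprocal computed once per block
+           }
+    */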
+ //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //--> Transpose and store results of columns of B block <--// + ////unpacklow//// + mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); + mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); +#else + mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); + mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); + mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); + mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); + mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); + mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); + mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); + mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); + mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], 
mat_a_cols[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); + //end loop of cols +} + +///////////////////////////////////// XA'=B functions //////////////////////////////// +static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha(double *ptr_l, + double *ptr_b, + int numRows_lb, + int numCols_b, + int rs_l, + int rs_b, + int cs_l, + int cs_b, + double alpha + ) + +{ + + double ones = 1.0; + int i,i1,i2,i3,i4,j,k,l; + int cs_b_offset[3]; + int cs_l_offset[3]; + double *ptr_b_dup; + + __m256d mat_b_col[4]; + __m256d mat_b_rearr[16][4]; + __m256d mat_a_cols_rearr[4]; + __m256d mat_a_blk_elems[16]; + __m256d mat_a_diag_inv[4]; + __m256d reciprocal_diags[2]; + __m256d alphaReg; + reciprocal_diags[0] = _mm256_broadcast_sd((double const *)(&ones)); + alphaReg = _mm256_broadcast_sd((double const *)&alpha); + + // ---> considering that the matrix size is multiple of 4 rows and 4 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + + //read diag elems of L 4x4 block + mat_a_cols_rearr[0] = _mm256_loadu_pd((double const *)ptr_l); + mat_a_cols_rearr[1] = _mm256_loadu_pd((double const *)ptr_l + cs_l); + mat_a_cols_rearr[2] = _mm256_loadu_pd((double const *)ptr_l + cs_l_offset[0]); + mat_a_cols_rearr[3] = _mm256_loadu_pd((double const *)ptr_l + cs_l_offset[1]); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + + 
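+
+    /*
+       Math behind the XA'=B kernels (assuming the triangular factor held
+       in ptr_l is lower triangular, which the A10/A21/A32 reads below
+       imply): X * L' = B gives, column block by column block,
+
+           X(:,j) = ( B(:,j) - sum_{l<j} X(:,l) * L(j,l) ) / L(j,j)
+
+       The code walks this recurrence in 4x4 blocks of doubles (four
+       elements per ymm register): alpha is folded in when each block of
+       B is first loaded, the running sum is accumulated with fnmadd, and
+       the divide becomes a multiply by the reciprocals packed below.
+    */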
+    reciprocal_diags[1] = reciprocal_diags[0];
+
+    //pack first 4 diags together
+    mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0x0A);//diag 0,1
+    mat_a_diag_inv[1] = _mm256_blend_pd(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0x0A);//diag 2,3
+
+    mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C);//diag 0,1,2,3
+
+    //reciprocal of diagonal elements 0,1,2,3
+    reciprocal_diags[0] = _mm256_div_pd(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+    //Broadcast A10 to A30 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + 3));
+
+    //Broadcast A21 to A31 to registers
+    mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 3));
+
+    //Broadcast A32 to register
+    mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + 3));
+
+    //extract diag a00 from a
+    mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+    //extract diag a11 from a
+    mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags[0], 0x03);
+    mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+    //extract diag a22 from a
+    mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11);
+
+    //extract diag a33 from a
+    mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags[0], 0x0C);
+    mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11);
+
+    /***************** first set of 4 cols of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 4)
+    {
+        /////////////////// Complete Upper 4x4 block trsm of B :- upper 4x4 block of B with upper 4x4 block of A
+        //read 4x4 block of B into registers
+
+        mat_b_rearr[0][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+        mat_b_rearr[1][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+        mat_b_rearr[2][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_rearr[3][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+        mat_b_rearr[0][0] = _mm256_mul_pd(mat_b_rearr[0][0], alphaReg);
+        mat_b_rearr[1][0] = _mm256_mul_pd(mat_b_rearr[1][0], alphaReg);
+        mat_b_rearr[2][0] = _mm256_mul_pd(mat_b_rearr[2][0], alphaReg);
+        mat_b_rearr[3][0] = _mm256_mul_pd(mat_b_rearr[3][0], alphaReg);
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+        mat_b_col[0] = _mm256_mul_pd(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+        mat_b_rearr[1][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+        mat_b_col[1] = _mm256_mul_pd(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (3, 0)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+        mat_b_col[2] = _mm256_mul_pd(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+        //(Row3): FMA operations of b3 with element of index (3, 0)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+        mat_b_col[3] = _mm256_mul_pd(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+        //Store the computed B columns
+        _mm256_storeu_pd((double *)ptr_b_dup, mat_b_col[0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+
+        i += 4;
+        ptr_b_dup += 4;
+
+    }
+
+    /***************** first set of 4 cols of B processing done *****************/
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in size of blk_width
+
+    for (j = 4; j < numRows_lb; j += 4)//m :- 4x4 block row
+    {
+        ptr_l += 4;
+        ptr_b_dup += cs_b_offset[2];
+        i1 += cs_b_offset[2];
+        //printf("i1 = i3 = %g\n",*(ptr_l+i1));
+        //Read next 4x4 block of A to get diag elements
+        i3 += cs_l_offset[2];
+        mat_a_cols_rearr[0] = _mm256_loadu_pd((double const *)ptr_l + i3);
+        mat_a_cols_rearr[1] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l);
+        mat_a_cols_rearr[2] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l_offset[0]);
+        mat_a_cols_rearr[3] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l_offset[1]);
+
+        //pack 4 diags of A together
+        reciprocal_diags[0] = reciprocal_diags[1];
+        mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0x0A);//diag 0,1
+        mat_a_diag_inv[1] = _mm256_blend_pd(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0x0A);//diag 2,3
+
+        mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C);//diag 0,1,2,3
+
+        //reciprocal of diagonal elements of A :- 0,1,2,3
+        reciprocal_diags[0] = _mm256_div_pd(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+        i = 0;
+        i2 = 0;
+        for (k = 0; k < numCols_b; k += 4)
+        {
+            i = i1 + k;
+            mat_b_rearr[i2][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+            mat_b_rearr[i2][1] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+            mat_b_rearr[i2][2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[i2][3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+            mat_b_rearr[i2][0] = _mm256_mul_pd(mat_b_rearr[i2][0], alphaReg);
+            mat_b_rearr[i2][1] = _mm256_mul_pd(mat_b_rearr[i2][1], alphaReg);
+            mat_b_rearr[i2][2] = _mm256_mul_pd(mat_b_rearr[i2][2], alphaReg);
+            mat_b_rearr[i2][3] = _mm256_mul_pd(mat_b_rearr[i2][3], alphaReg);
+            i2++;
+        }
+
+        i = 0;
+        i2 = 0;
+        for (l = 0; l < j; l += 4) // move across m
+        {
+            //Broadcast A4,0 to A7,0 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i));
+            mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+            mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+            mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+
+            //Broadcast A4,1 to A7,1 to registers
+            mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i));
+            mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 1));
+            mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 2));
+            mat_a_blk_elems[7] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 3));
+
+            //Broadcast A4,2 to A7,2 to registers
+            mat_a_blk_elems[8] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i));
+            mat_a_blk_elems[9] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 1));
+            mat_a_blk_elems[10] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 2));
+            mat_a_blk_elems[11] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 3));
+
+            //Broadcast A4,3 to A7,3 to registers
+            mat_a_blk_elems[12] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i));
+            mat_a_blk_elems[13] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 1));
+            mat_a_blk_elems[14] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 2));
+            mat_a_blk_elems[15] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 3));
+
+            i += cs_l_offset[2];
+
+            for (k = 0; k < numCols_b; k += 4) // move across n for the same value of l (index of m)
+            {
+                /////////////////// Partial lower 4x4 block trsm of B
+
+                i4 = i2 + k;
+                //Read current 4 cols of B from the current 4x4 block of B
+                mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b + i4);
+                mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b));
+                mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[0]));
+                mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[1]));
+
+                i4 = k >> 2;
+
+                //FMA operations of b0 with the four pending columns of B
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+                //FMA operations of b1 with the four pending columns of B
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //FMA operations of b2 with the four pending columns of B
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[8], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[9], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[10], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[11], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+                //FMA operations of b3 with the four pending columns of B
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[12], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[13], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2]
= _mm256_fnmadd_pd(mat_a_blk_elems[14], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[15], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) + //end loop of cols + + } + i2 += cs_b_offset[2]; + + } + + //Broadcast A10 to A30 to registers + mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3)); + i += cs_l; + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + + //Broadcast A21 to A31 to registers + mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2)); + mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3)); + i += cs_l; + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags[0], 0x03); + mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3)); + i += cs_l; + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags[0], 0x00); + mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags[0], 0x0C); + mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11); + + k = 0; + for (i = 0; i < numCols_b; i+=4) + { + + + + /////////////////// Complete Lower 4x4 block trsm of B :- lower 4x4 block of B with lower right 4x4 block of A + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[k][0] = _mm256_mul_pd(mat_b_rearr[k][0], mat_a_diag_inv[0]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) + mat_b_rearr[k][1] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) + mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) + mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[k][1] = _mm256_mul_pd(mat_b_rearr[k][1], mat_a_diag_inv[1]); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) + mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[k][2] = _mm256_mul_pd(mat_b_rearr[k][2], mat_a_diag_inv[2]); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_rearr[k][3] = _mm256_mul_pd(mat_b_rearr[k][3], mat_a_diag_inv[3]); + + //Store the computed B columns + + _mm256_storeu_pd((double *)(ptr_b_dup + i), mat_b_rearr[k][0]); + _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b) + i), 
+        //Broadcast A10 to A30 to registers
+        mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a00 from a
+        mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+        //Broadcast A21 to A31 to registers
+        mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a11 from a
+        mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags[0], 0x03);
+        mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+        //Broadcast A32 to register
+        mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a22 from a
+        mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11);
+
+        //extract diag a33 from a
+        mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags[0], 0x0C);
+        mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11);
+
+        k = 0;
+        for (i = 0; i < numCols_b; i+=4)
+        {
+            /////////////////// Complete Lower 4x4 block trsm of B :- lower 4x4 block of B with lower right 4x4 block of A
+
+            //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+            mat_b_rearr[k][0] = _mm256_mul_pd(mat_b_rearr[k][0], mat_a_diag_inv[0]);
+
+            //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+            mat_b_rearr[k][1] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+            mat_b_rearr[k][1] = _mm256_mul_pd(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+            //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+            mat_b_rearr[k][2] = _mm256_mul_pd(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+            //(Row3): FMA operations of b3 with element of index (3, 2)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+            mat_b_rearr[k][3] = _mm256_mul_pd(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+            //Store the computed B columns
+            _mm256_storeu_pd((double *)(ptr_b_dup + i), mat_b_rearr[k][0]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+
+            k++;
+        }
+    }
+
+}
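+
+/*
+    Variant of the kernel above for the alpha != 1, unit-diagonal case:
+    B is scaled by alpha as it is first read, and because the diagonal of
+    L is implicitly 1 the reciprocal-diagonal multiplies are skipped.
+*/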
+static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(double *ptr_l,
+                    double *ptr_b,
+                    int numRows_lb,
+                    int numCols_b,
+                    int rs_l,
+                    int rs_b,
+                    int cs_l,
+                    int cs_b,
+                    double alpha
+                    )
+{
+    int i,i1,i2,i3,i4,j,k,l;
+    int cs_b_offset[3];
+    int cs_l_offset[3];
+    double *ptr_b_dup;
+
+    __m256d mat_b_col[4];
+    __m256d mat_b_rearr[16][4];
+    __m256d mat_a_blk_elems[16];
+    __m256d alphaReg;
+    alphaReg = _mm256_broadcast_sd((double const *)&alpha);
+
+    // ---> considering that the matrix size is a multiple of 4 rows and 4 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+
+    //Broadcast A10 to A30 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + 3));
+
+    //Broadcast A21 to A31 to registers
+    mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 3));
+
+    //Broadcast A32 to register
+    mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + 3));
+
+    /***************** first set of 4 cols of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 4)
+    {
+        /////////////////// Complete Upper 4x4 block trsm of B :- upper 4x4 block of B with upper 4x4 block of A
+        //read 4x4 block of B into registers
+        mat_b_rearr[0][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+        mat_b_rearr[1][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+        mat_b_rearr[2][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_rearr[3][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+        mat_b_rearr[0][0] = _mm256_mul_pd(mat_b_rearr[0][0], alphaReg);
+        mat_b_rearr[1][0] = _mm256_mul_pd(mat_b_rearr[1][0], alphaReg);
+        mat_b_rearr[2][0] = _mm256_mul_pd(mat_b_rearr[2][0], alphaReg);
+        mat_b_rearr[3][0] = _mm256_mul_pd(mat_b_rearr[3][0], alphaReg);
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+        mat_b_rearr[1][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[0][0], mat_b_rearr[1][0]);//d = c - (a*b)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[0][0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[0][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[1][0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[1][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //(Row3): FMA operations of b3 with element of index (3, 2)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_rearr[2][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Store the computed B columns
+        _mm256_storeu_pd((double *)ptr_b_dup, mat_b_rearr[0][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_b_rearr[1][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3][0]);
+
+        i += 4;
+        ptr_b_dup += 4;
+    }
+
+    /***************** first set of 4 cols of B processing done *****************/
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in chunks of blk_width
+    for (j = 4; j < numRows_lb; j += 4)//m :- 4x4 block row
+    {
+        ptr_l += 4;
+        ptr_b_dup += cs_b_offset[2];
+        i1 += cs_b_offset[2];
+        i3 += cs_l_offset[2];
+        i = 0;
+        i2 = 0;
+        for (k = 0; k < numCols_b; k += 4)
+        {
+            i = i1 + k;
+            mat_b_rearr[i2][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+            mat_b_rearr[i2][1] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+            mat_b_rearr[i2][2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[i2][3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+            mat_b_rearr[i2][0] = _mm256_mul_pd(mat_b_rearr[i2][0], alphaReg);
+            mat_b_rearr[i2][1] = _mm256_mul_pd(mat_b_rearr[i2][1], alphaReg);
+            mat_b_rearr[i2][2] = _mm256_mul_pd(mat_b_rearr[i2][2], alphaReg);
+            mat_b_rearr[i2][3] = _mm256_mul_pd(mat_b_rearr[i2][3], alphaReg);
+            i2++;
+        }
+
+        i = 0;
+        i2 = 0;
+        for (l = 0; l < j; l += 4) // move across m
+        {
+            //Broadcast A4,0 to A7,0 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i));
+            mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+            mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+            mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+
+            //Broadcast A4,1 to A7,1 to registers
+            mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i));
+            mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 1));
+            mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 2));
+            mat_a_blk_elems[7] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 3));
+
+            //Broadcast A4,2 to A7,2 to registers
+            mat_a_blk_elems[8] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i));
+            mat_a_blk_elems[9] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 1));
+            mat_a_blk_elems[10] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 2));
+            mat_a_blk_elems[11] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 3));
+
+            //Broadcast A4,3 to A7,3 to registers
+            mat_a_blk_elems[12] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i));
+            mat_a_blk_elems[13] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 1));
+            mat_a_blk_elems[14] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 2));
+            mat_a_blk_elems[15] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 3));
+
+            i += cs_l_offset[2];
+
+            for (k = 0; k < numCols_b; k += 4) // move across n for the same value of l (index of m)
+            {
+                /////////////////// Partial Lower 4x4 block trsm of B
+
+                i4 = i2 + k;
+                //Read current 4 cols of B from the current 4x4 block of B
+                mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b + i4);
+                mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b));
+                mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[0]));
+                mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[1]));
+
+                i4 = k >> 2;
+
+                //(Row4): FMA operations of b1 with elements of indices from (4, 0) up to (7, 0)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row5): FMA operations of b2 with elements of indices from (4, 1) up to (7, 1)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row6): FMA operations of b3 with elements of indices from (4, 2) up to (7, 2)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[8], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[9], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[10], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[11], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row7): FMA operations of b4 with elements of indices from (4, 3) up to (7, 3)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[12], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[13], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[14], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[15], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+                //end loop of cols
+            }
+            i2 += cs_b_offset[2];
+        }
+
+        //Broadcast A10 to A30 to registers
+        mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //Broadcast A21 to A31 to registers
+        mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //Broadcast A32 to register
+        mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        k = 0;
+        for (i = 0; i < numCols_b; i+=4)
+        {
+            /////////////////// Complete Lower 4x4 block trsm of B :- lower 4x4 block of B with lower right 4x4 block of A
+
+            //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+            mat_b_rearr[k][1] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //(Row3): FMA operations of b3 with element of index (3, 2)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Store the computed B columns
+            _mm256_storeu_pd((double *)(ptr_b_dup + i), mat_b_rearr[k][0]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+
+            k++;
+        }
+
+    }
+
+}
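+
+/*
+    Same unit-diagonal kernel, but without the alpha scaling: B is used
+    exactly as loaded, which serves the alpha == 1 case.
+*/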
+static void dtrsm_XAtB_block_allSmallSizedMatrices_unitDiag(double *ptr_l,
+                    double *ptr_b,
+                    int numRows_lb,
+                    int numCols_b,
+                    int rs_l,
+                    int rs_b,
+                    int cs_l,
+                    int cs_b
+                    )
+{
+    int i,i1,i2,i3,i4,j,k,l;
+    int cs_b_offset[3];
+    int cs_l_offset[3];
+    double *ptr_b_dup;
+
+    __m256d mat_b_col[4];
+    __m256d mat_b_rearr[16][4];
+    __m256d mat_a_blk_elems[16];
+
+    // ---> considering that the matrix size is a multiple of 4 rows and 4 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+
+    //Broadcast A10 to A30 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + 3));
+
+    //Broadcast A21 to A31 to registers
+    mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 3));
+
+    //Broadcast A32 to register
+    mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + 3));
+
+    /***************** first set of 4 cols of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 4)
+    {
+        /////////////////// Complete Upper 4x4 block trsm of B :- upper 4x4 block of B with upper 4x4 block of A
+        //read 4x4 block of B into registers
+        mat_b_rearr[0][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+        mat_b_rearr[1][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+        mat_b_rearr[2][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_rearr[3][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+        mat_b_rearr[1][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[0][0], mat_b_rearr[1][0]);//d = c - (a*b)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[0][0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[0][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[1][0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[1][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //(Row3): FMA operations of b3 with element of index (3, 2)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_rearr[2][0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Store the computed B columns
+        _mm256_storeu_pd((double *)ptr_b_dup, mat_b_rearr[0][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_b_rearr[1][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2][0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3][0]);
+
+        i += 4;
+        ptr_b_dup += 4;
+    }
+
+    /***************** first set of 4 cols of B processing done *****************/
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in chunks of blk_width
+    for (j = 4; j < numRows_lb; j += 4)//m :- 4x4 block row
+    {
+        ptr_l += 4;
+        ptr_b_dup += cs_b_offset[2];
+        i1 += cs_b_offset[2];
+        i3 += cs_l_offset[2];
+        i = 0;
+        i2 = 0;
+        for (k = 0; k < numCols_b; k += 4)
+        {
+            i = i1 + k;
+            mat_b_rearr[i2][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+            mat_b_rearr[i2][1] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+            mat_b_rearr[i2][2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[i2][3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+            i2++;
+        }
+
+        i = 0;
+        i2 = 0;
+        for (l = 0; l < j; l += 4) // move across m
+        {
+            //Broadcast A4,0 to A7,0 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i));
+            mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+            mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+            mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+
+            //Broadcast A4,1 to A7,1 to registers
+            mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i));
+            mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 1));
+            mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 2));
+            mat_a_blk_elems[7] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 3));
+
+            //Broadcast A4,2 to A7,2 to registers
+            mat_a_blk_elems[8] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i));
+            mat_a_blk_elems[9] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 1));
+            mat_a_blk_elems[10] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 2));
+            mat_a_blk_elems[11] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 3));
+
+            //Broadcast A4,3 to A7,3 to registers
+            mat_a_blk_elems[12] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i));
+            mat_a_blk_elems[13] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 1));
+            mat_a_blk_elems[14] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 2));
+            mat_a_blk_elems[15] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 3));
+
+            i += cs_l_offset[2];
+
+            for (k = 0; k < numCols_b; k += 4) // move across n for the same value of l (index of m)
+            {
+                /////////////////// Partial Lower 4x4 block trsm of B
+
+                i4 = i2 + k;
+                //Read current 4 cols of B from the current 4x4 block of B
+                mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b + i4);
+                mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b));
+                mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[0]));
+                mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[1]));
+
+                i4 = k >> 2;
+
+                //(Row4): FMA operations of b1 with elements of indices from (4, 0) up to (7, 0)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row5): FMA operations of b2 with elements of indices from (4, 1) up to (7, 1)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row6): FMA operations of b3 with elements of indices from (4, 2) up to (7, 2)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[8], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[9], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[10], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[11], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row7): FMA operations of b4 with elements of indices from (4, 3) up to (7, 3)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[12], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[13], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[14], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[15], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+                //end loop of cols
+            }
+            i2 += cs_b_offset[2];
+        }
+
+        //Broadcast A10 to A30 to registers
+        mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //Broadcast A21 to A31 to registers
+        mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //Broadcast A32 to register
+        mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        k = 0;
+        for (i = 0; i < numCols_b; i+=4)
+        {
+            /////////////////// Complete Lower 4x4 block trsm of B :- lower 4x4 block of B with lower right 4x4 block of A
+
+            //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+            mat_b_rearr[k][1] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //(Row3): FMA operations of b3 with element of index (3, 2)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Store the computed B columns
+            _mm256_storeu_pd((double *)(ptr_b_dup + i), mat_b_rearr[k][0]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+
+            k++;
+        }
+
+    }
+
+}
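+
+/*
+    General (non-unit diagonal) kernel: the four diagonal elements of each
+    4x4 block of L are packed together and inverted with a single vector
+    divide, and each solved row of B is scaled by the matching reciprocal.
+*/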
+static void dtrsm_XAtB_block_allSmallSizedMatrices(double *ptr_l,
+                    double *ptr_b,
+                    int numRows_lb,
+                    int numCols_b,
+                    int rs_l,
+                    int rs_b,
+                    int cs_l,
+                    int cs_b
+                    )
+{
+    double ones = 1.0;
+    int i,i1,i2,i3,i4,j,k,l;
+    int cs_b_offset[3];
+    int cs_l_offset[3];
+    double *ptr_b_dup;
+
+    __m256d mat_b_col[4];
+    __m256d mat_b_rearr[16][4];
+    __m256d mat_a_cols_rearr[4];
+    __m256d mat_a_blk_elems[16];
+    __m256d mat_a_diag_inv[4];
+    __m256d reciprocal_diags[2];
+
+    reciprocal_diags[0] = _mm256_broadcast_sd((double const *)(&ones));
+
+    // ---> considering that the matrix size is a multiple of 4 rows and 4 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+
+    //read diag elems of L 4x4 block
+    mat_a_cols_rearr[0] = _mm256_loadu_pd((double const *)ptr_l);
+    mat_a_cols_rearr[1] = _mm256_loadu_pd((double const *)ptr_l + cs_l);
+    mat_a_cols_rearr[2] = _mm256_loadu_pd((double const *)ptr_l + cs_l_offset[0]);
+    mat_a_cols_rearr[3] = _mm256_loadu_pd((double const *)ptr_l + cs_l_offset[1]);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+
+    reciprocal_diags[1] = reciprocal_diags[0];
+
+    //pack first 4 diags together
+    mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0x0A);//diag 0,1
+    mat_a_diag_inv[1] = _mm256_blend_pd(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0x0A);//diag 2,3
+    mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C);//diag 0,1,2,3
+
+    //reciprocal of diagonal elements 0,1,2,3
+    reciprocal_diags[0] = _mm256_div_pd(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+    //Broadcast A10 to A30 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + 3));
+
+    //Broadcast A21 to A31 to registers
+    mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + 3));
+
+    //Broadcast A32 to register
+    mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + 3));
+
+    //extract diag a00 from a
+    mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+    //extract diag a11 from a
+    mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags[0], 0x03);
+    mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+    //extract diag a22 from a
+    mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11);
+
+    //extract diag a33 from a
+    mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags[0], 0x0C);
+    mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11);
+
+    /***************** first set of 4 cols of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 4)
+    {
+        /////////////////// Complete Upper 4x4 block trsm of B :- upper 4x4 block of B with upper 4x4 block of A
+        //read 4x4 block of B into registers
+        mat_b_rearr[0][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+        mat_b_rearr[1][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+        mat_b_rearr[2][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_rearr[3][0] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+        mat_b_col[0] = _mm256_mul_pd(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+        mat_b_rearr[1][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+        mat_b_col[1] = _mm256_mul_pd(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+        mat_b_rearr[2][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+        mat_b_col[2] = _mm256_mul_pd(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+        //(Row3): FMA operations of b3 with element of index (3, 2)
+        mat_b_rearr[3][0] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+        mat_b_col[3] = _mm256_mul_pd(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+        //Store the computed B columns
+        _mm256_storeu_pd((double *)ptr_b_dup, mat_b_col[0]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+        _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+
+        i += 4;
+        ptr_b_dup += 4;
+    }
+
+    /***************** first set of 4 cols of B processing done *****************/
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in chunks of blk_width
+    for (j = 4; j < numRows_lb; j += 4)//m :- 4x4 block row
+    {
+        ptr_l += 4;
+        ptr_b_dup += cs_b_offset[2];
+        i1 += cs_b_offset[2];
+        //printf("i1 = i3 = %g\n",*(ptr_l+i1));
+        //Read next 4x4 block of A to get diag elements
+        i3 += cs_l_offset[2];
+        mat_a_cols_rearr[0] = _mm256_loadu_pd((double const *)ptr_l + i3);
+        mat_a_cols_rearr[1] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l);
+        mat_a_cols_rearr[2] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l_offset[0]);
+        mat_a_cols_rearr[3] = _mm256_loadu_pd((double const *)ptr_l + i3 + cs_l_offset[1]);
+
+        //pack 4 diags of A together
+        reciprocal_diags[0] = reciprocal_diags[1];
+        mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0x0A);//diag 0,1
+        mat_a_diag_inv[1] = _mm256_blend_pd(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0x0A);//diag 2,3
+        mat_a_diag_inv[0] = _mm256_blend_pd(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x0C);//diag 0,1,2,3
+
+        //reciprocal of diagonal elements of A :- 0,1,2,3
+        reciprocal_diags[0] = _mm256_div_pd(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+        i = 0;
+        i2 = 0;
+        for (k = 0; k < numCols_b; k += 4)
+        {
+            i = i1 + k;
+            mat_b_rearr[i2][0] = _mm256_loadu_pd((double const *)ptr_b + i);
+            mat_b_rearr[i2][1] = _mm256_loadu_pd((double const *)(ptr_b + cs_b + i));
+            mat_b_rearr[i2][2] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[i2][3] = _mm256_loadu_pd((double const *)(ptr_b + cs_b_offset[1] + i));
+
+            i2++;
+        }
+
+        i = 0;
+        i2 = 0;
+        for (l = 0; l < j; l += 4) // move across m
+        {
+            //Broadcast A4,0 to A7,0 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i));
+            mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+            mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+            mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+
+            //Broadcast A4,1 to A7,1 to registers
+            mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i));
+            mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 1));
+            mat_a_blk_elems[6] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 2));
+            mat_a_blk_elems[7] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l + i + 3));
+
+            //Broadcast A4,2 to A7,2 to registers
+            mat_a_blk_elems[8] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i));
+            mat_a_blk_elems[9] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 1));
+            mat_a_blk_elems[10] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 2));
+            mat_a_blk_elems[11] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[0] + i + 3));
+
+            //Broadcast A4,3 to A7,3 to registers
+            mat_a_blk_elems[12] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i));
+            mat_a_blk_elems[13] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 1));
+            mat_a_blk_elems[14] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 2));
+            mat_a_blk_elems[15] = _mm256_broadcast_sd((double const *)(ptr_l + cs_l_offset[1] + i + 3));
+
+            i += cs_l_offset[2];
+
+            for (k = 0; k < numCols_b; k += 4) // move across n for the same value of l (index of m)
+            {
+                /////////////////// Partial Lower 4x4 block trsm of B
+
+                i4 = i2 + k;
+                //Read current 4 cols of B from the current 4x4 block of B
+                mat_b_col[0] = _mm256_loadu_pd((double const *)ptr_b + i4);
+                mat_b_col[1] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b));
+                mat_b_col[2] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[0]));
+                mat_b_col[3] = _mm256_loadu_pd((double const *)(ptr_b + i4 + cs_b_offset[1]));
+
+                i4 = k >> 2;
+
+                //(Row4): FMA operations of b1 with elements of indices from (4, 0) up to (7, 0)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row5): FMA operations of b2 with elements of indices from (4, 1) up to (7, 1)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row6): FMA operations of b3 with elements of indices from (4, 2) up to (7, 2)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[8], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[9], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[10], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[11], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+
+                //(Row7): FMA operations of b4 with elements of indices from (4, 3) up to (7, 3)
+                mat_b_rearr[i4][0] = _mm256_fnmadd_pd(mat_a_blk_elems[12], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+                mat_b_rearr[i4][1] = _mm256_fnmadd_pd(mat_a_blk_elems[13], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+                mat_b_rearr[i4][2] = _mm256_fnmadd_pd(mat_a_blk_elems[14], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+                mat_b_rearr[i4][3] = _mm256_fnmadd_pd(mat_a_blk_elems[15], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+                //end loop of cols
+            }
+            i2 += cs_b_offset[2];
+        }
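+        /* The GEMM-like update above has folded every block to the left of
+           this band into mat_b_rearr; the band is finished with a triangular
+           solve against its diagonal 4x4 block below. */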
+        //Broadcast A10 to A30 to registers
+        mat_a_blk_elems[0] = _mm256_broadcast_sd((double const *)(ptr_l + i + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a00 from a
+        mat_a_diag_inv[0] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[0] = _mm256_permute2f128_pd(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+        //Broadcast A21 to A31 to registers
+        mat_a_blk_elems[3] = _mm256_broadcast_sd((double const *)(ptr_l + i + 2));
+        mat_a_blk_elems[4] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a11 from a
+        mat_a_diag_inv[1] = _mm256_permute_pd(reciprocal_diags[0], 0x03);
+        mat_a_diag_inv[1] = _mm256_permute2f128_pd(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+        //Broadcast A32 to register
+        mat_a_blk_elems[5] = _mm256_broadcast_sd((double const *)(ptr_l + i + 3));
+        i += cs_l;
+
+        //extract diag a22 from a
+        mat_a_diag_inv[2] = _mm256_permute_pd(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[2] = _mm256_permute2f128_pd(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x11);
+
+        //extract diag a33 from a
+        mat_a_diag_inv[3] = _mm256_permute_pd(reciprocal_diags[0], 0x0C);
+        mat_a_diag_inv[3] = _mm256_permute2f128_pd(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x11);
+
+        k = 0;
+        for (i = 0; i < numCols_b; i+=4)
+        {
+            /////////////////// Complete Lower 4x4 block trsm of B :- lower 4x4 block of B with lower right 4x4 block of A
+
+            //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+            mat_b_rearr[k][0] = _mm256_mul_pd(mat_b_rearr[k][0], mat_a_diag_inv[0]);
+
+            //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (3, 0)
+            mat_b_rearr[k][1] = _mm256_fnmadd_pd(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+            mat_b_rearr[k][1] = _mm256_mul_pd(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+            //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (3, 1)
+            mat_b_rearr[k][2] = _mm256_fnmadd_pd(mat_a_blk_elems[3], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[4], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+            mat_b_rearr[k][2] = _mm256_mul_pd(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+            //(Row3): FMA operations of b3 with element of index (3, 2)
+            mat_b_rearr[k][3] = _mm256_fnmadd_pd(mat_a_blk_elems[5], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+            mat_b_rearr[k][3] = _mm256_mul_pd(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+            //Store the computed B columns
+            _mm256_storeu_pd((double *)(ptr_b_dup + i), mat_b_rearr[k][0]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+            _mm256_storeu_pd((double *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+
+            k++;
+        }
+
+    }
+
+}
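+
+/*
+    Single-precision counterpart guarded by OPT_CACHE_BLOCKING_L1: it works
+    on 8x8 blocks (eight floats per ymm register), streams the GEMM-like
+    update through an r loop stepped by GEMM_BLK_V1, and uses GEMM_ACCUM_A
+    to choose between accumulating directly into B (fnmadd) and building
+    the products separately (mul/fmadd).
+*/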
+#if OPT_CACHE_BLOCKING_L1 //new intrinsic kernels
+static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+    float ones = 1.0;
+    int i, i1, i2, i3, i4, j, k, l, r;
+    int cs_b_offset[7];
+    int cs_l_offset[7];
+    float *ptr_b_dup, *ptr_l_dup;
+
+    //57 ymm (256-bit) registers used
+    __m256 mat_b_col[8];
+    __m256 mat_b_rearr[8];
+    __m256 mat_a_blk_elems[8];
+    __m256 mat_a_diag_inv[8];
+    __m256 reciprocal_diags[2];
+
+    reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+
+    // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+    cs_l_offset[3] = cs_l + cs_l_offset[2];
+    cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+    cs_l_offset[5] = cs_l + cs_l_offset[4];
+    cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+    //read diag elems of L 8x8 block
+    mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l);
+    mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+    mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+    mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+    mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+    mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+    mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+    mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+    cs_b_offset[3] = cs_b + cs_b_offset[2];
+    cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+    cs_b_offset[5] = cs_b + cs_b_offset[4];
+    cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+    reciprocal_diags[1] = reciprocal_diags[0];
+
+    //pack first 8 diags together
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1
+    mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5
+    mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+    //reciprocal of diagonal elements 0,1,2,3,4,5,6,7
+    reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+    //extract diag a00 from a
+    mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+    //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]);
+    //extract diag a11 from a
+    mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+    mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+    //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+    //extract diag a22 from a
+    mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+    mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+    //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+    //extract diag a33 from a
+    mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+    mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+    //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+    //extract diag a44 from a
+    mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+    //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+    //extract diag a55 from a
+    mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+    mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+    //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+    //extract diag a66 from a
+    mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+    mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+    //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+    //extract diag a77 from a
+    mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+    mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+    //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+    /***************** first set of 8 rows of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 8)
+    {
+        /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+        //read 8x8 block of B into registers
+        mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+        mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+        mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+        mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+        mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+        mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+        mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+        mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+        mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+        mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+        mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b)
+        mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+        mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+        mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 1) up to (7, 1)
+        mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+        mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+        //(Row3): FMA operations of b3 with elements of indices from (3, 2) up to (7, 2)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+        mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+        //(Row4): FMA operations of b4 with elements of indices from (4, 3) up to (7, 3)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+        mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+        //(Row5): FMA operations of b5 with elements of indices from (5, 4) up to (7, 4)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+        mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+        //(Row6): FMA operations of b6 with elements of indices from (6, 5) up to (7, 5)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+        mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]);
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+        //(Row7): FMA operations of b7 with element of index (7, 6)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+        mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]);
+
+        ////////////////////////////////////////////////////////////////////////////////
+
+        //Store the computed B columns
+        _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+        //i += cs_b_offset[6];
+        //ptr_b_dup += cs_b_offset[6];
+        i += 8;
+        ptr_b_dup += 8;
+    }
+
+    //c = 0;
+    /***************** first set of 8 rows of B processing done *****************/
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
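+    /* Each remaining 8-row band of B is handled in two steps: a GEMM-like
+       update (the l loop) that eliminates all previously solved 8x8 blocks
+       of L from the band, followed by the triangular solve of the band
+       against its own diagonal 8x8 block. */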
+    //Start loop for cols of B to be processed in chunks of blk_width
+    for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+    {
+        ptr_l += 8;
+        //ptr_b += j;
+        //ptr_b_dup += 8;
+        ptr_b_dup += cs_b_offset[6];
+        i1 += cs_b_offset[6];
+
+        //Read next 8x8 block of A to get diag elements
+        i3 += cs_l_offset[6];
+        mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3);
+        mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+        mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+        mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+        mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+        mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+        mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+        mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+        //pack 8 diags of A together
+        reciprocal_diags[0] = reciprocal_diags[1];
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1
+        mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3
+        mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5
+        mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+        mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+        //reciprocal of diagonal elements of A :- 0,1,2,3,4,5,6,7
+        reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+        //extract diag a00 from a
+        mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+        //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]);
+
+        //extract diag a11 from a
+        mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+        mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+        //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+
+        //extract diag a22 from a
+        mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+        mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+        //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+
+        //extract diag a33 from a
+        mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+        mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+        //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+
+        //extract diag a44 from a
+        mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+        mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+        //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+
+        //extract diag a55 from a
+        mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+        mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+        //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+
+        //extract diag a66 from a
+        mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+        mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+        //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+
+        //extract diag a77 from a
+        mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+        mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+        //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+        for (r = 0; r < numCols_b; r += GEMM_BLK_V1)
+        {
+#if GEMM_ACCUM_A
+            i = i1 + r;
+            //Read 8 cols of B columns of Block-to-be-solved
+            mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+            mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+            mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+            mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+            mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+            mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+            mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+#endif
+            i = 0;
+            i2 = 0;
+            for (l = 0; l < j; l += 8) // move across m
+            {
+                //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+                {
+                    /////////////////// Partial Lower 8x8 block trsm of B
+                    ptr_l_dup = ptr_l;
+                    i4 = i2 + r;
+                    //Read current 8 cols of B from the current 8x8 block of B
+                    mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+                    mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+                    mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+                    mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+                    mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+                    mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+                    mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+                    mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+                    //Broadcast A8,0 to A15,0 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    //i4 = k >> 3;
+                    ptr_l_dup += cs_l;
+
+#if GEMM_ACCUM_A
+                    //(Row8): FMA operations of b1 with elements of indices from (8, 0) up to (15, 0)
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]);
+                    mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]);
+                    mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]);
+                    mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]);
+                    mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]);
+                    mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]);
+                    mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]);
+                    mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]);
+#endif
+                    //Broadcast A8,1 to A15,1 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row9): FMA operations of b2 with elements of indices from (8, 1) up to (15, 1)
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,2 to A15,2 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row10): FMA operations of b3 with elements of indices from (8, 2) up to (15, 2)
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,3 to A15,3 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] =
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + 
mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], 
mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + } + i2 += cs_b_offset[6]; + i += cs_l_offset[6]; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + { + i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); +#endif + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + +#if GEMM_ACCUM_A + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#else + mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#endif + +#if GEMM_ACCUM_A + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], 
mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); + mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); + mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); + mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); + mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); + mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); + mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d 
= c - (a*b)
+            mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+            mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+            mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+            //Broadcast A43 to A73 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+            mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+            mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+            mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+            i += cs_l;
+
+            //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+            mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+            //(Row4): FMA operations of b4 with elements of indices from (4, 3) up to (7, 3)
+            mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+            mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+            mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+            mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+            //Broadcast A54 to A74 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+            mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+            mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+            i += cs_l;
+
+            //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+            mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+            //(Row5): FMA operations of b5 with elements of indices from (5, 4) up to (7, 4)
+            mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+            mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+            mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+            //Broadcast A65 to A75 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+            mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+            i += cs_l;
+
+            //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+            mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+            //(Row6): FMA operations of b6 with elements of indices from (6, 5) up to (7, 5)
+            mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+            mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+            //Broadcast A76 to register
+            mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+            //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+            mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+            //(Row7): FMA operation of b7 with the element of index (7, 6)
+            mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+            //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+            mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+            ////////////////////////////////////////////////////////////////////////////////
+
+            //Store the computed B columns
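+            /* Editorial note (not part of the original patch): the eight stores
+               below write the solved 8x8 panel back into B, one column per store;
+               cs_b_offset[k] was precomputed as (k + 2) * cs_b from shifts and
+               adds, so every address here is just ptr_b_dup + col*cs_b + r with
+               no multiply in the loop. A scalar sketch of the same column-major
+               writeback, with hypothetical names:
+
+                   for (int col = 0; col < 8; col++)
+                       for (int row = 0; row < 8; row++)
+                           b[col * cs_b + r + row] = solved[col][row];
+            */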
+            _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]);
+            _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]);
+            //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+            k++;
+        }
+    }
+    } //numRows of A
+    ///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
+    float ones = 1.0;
+    int i, i1, i2, i3, i4, j, k, l, r;
+    int cs_b_offset[7];
+    int cs_l_offset[7];
+    float *ptr_b_dup, *ptr_l_dup;
+
+    //57 ymm (256-bit) registers used
+    __m256 mat_b_col[8];
+    __m256 mat_b_rearr[8];
+    __m256 mat_a_blk_elems[8];
+    __m256 mat_a_diag_inv[8];
+    __m256 reciprocal_diags[2];
+    __m256 alphaReg;
+
+    reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+    alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+    // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+    cs_l_offset[3] = cs_l + cs_l_offset[2];
+    cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+    cs_l_offset[5] = cs_l + cs_l_offset[4];
+    cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+    //read diag elems of L 16x16 block
+    mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l);
+    mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+    mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+    mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+    mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+    mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+    mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+    mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+    cs_b_offset[3] = cs_b + cs_b_offset[2];
+    cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+    cs_b_offset[5] = cs_b + cs_b_offset[4];
+    cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+    reciprocal_diags[1] = reciprocal_diags[0];
+
+    //pack first 8 diags together
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1
+    mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5
+    mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+    //reciprocal of diagonal elements 0,1,2,3,4,5,6,7
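+    /* Editorial note (not part of the original patch): the blend tree above packs
+       the eight diagonal entries L(0,0)..L(7,7) into one ymm register so that the
+       single _mm256_div_ps below produces all eight reciprocals at once; the solve
+       then uses multiplies instead of eight per-row divisions, and each reciprocal
+       is later splat across a full register with _mm256_permute_ps followed by
+       _mm256_permute2f128_ps. A scalar equivalent, assuming column-major L:
+
+           float inv_diag[8];
+           for (int n = 0; n < 8; n++)
+               inv_diag[n] = 1.0f / l[n * cs_l + n];
+
+       Note that the #if 0 block that follows is disabled leftover code: it indexes
+       mat_a_blk_elems up to [27], past the [8] declared in this function, so it
+       must remain disabled as written.
+    */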
reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); +#if 0 + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); +#endif + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + //extract diag a33 from a + mat_a_diag_inv[3] = 
_mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], 
mat_b_col[0], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //(Row4): FMA operations 
of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); + + //i += cs_b_offset[6]; + //ptr_b_dup += cs_b_offset[6]; + i += 8; + ptr_b_dup += 8; + } + + //c = 0; + /***************** first set of 8 cols of B processing done *****************/ + ptr_b_dup = ptr_b; + i3 = 0; + i1 = 0; + //Start loop for cols of B to be processed in size of blk_width + for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row + { + ptr_l += 8; + //ptr_b += j; + //ptr_b_dup += 8; + ptr_b_dup += cs_b_offset[6]; + i1 += 
cs_b_offset[6]; + + //Read next 8x8 block of A to get diag elements + i3 += cs_l_offset[6]; + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); + mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); + + //pack 8 diags of A together + reciprocal_diags[0] = reciprocal_diags[1]; + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 + mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 + mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 + + //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 + reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + + //extract diag a77 from 
a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + for (r = 0; r < numCols_b; r += GEMM_BLK_V1) + { +#if GEMM_ACCUM_A + i = i1 + r; + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); +#endif + i = 0; + i2 = 0; + for (l = 0; l < j; l += 8) // move across m + { + //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) + { + /////////////////// Partial Lower 8x8 block trsm of B + ptr_l_dup = ptr_l; + i4 = i2 + r; + //Read current 8 cols of B columns from specified 8x8 current-block of B + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + i4 = k >> 3; + ptr_l_dup += cs_l; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - 
(a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]);
+                    mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]);
+                    mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]);
+                    mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]);
+                    mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]);
+                    mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]);
+                    mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]);
+                    mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]);
+#endif
+                    //Broadcast A8,1 to A15,1 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row9): FMA operations of b1 with elements of indices from (8, 1) up to (15, 1)
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,2 to A15,2 to registers
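+                    /* Editorial note (not part of the original patch): each
+                       broadcast/FMA group in this inner loop is one step of the
+                       trailing update for the X * A^T = alpha * B solve (per the
+                       function name): one entry of L is splat across a ymm register
+                       with _mm256_broadcast_ss and combined with an 8-row slice of
+                       an already solved column of B. With GEMM_ACCUM_A defined,
+                       _mm256_fnmadd_ps folds the subtraction into the accumulator
+                       (d = c - a*b); in the #else path the partial products are
+                       summed with _mm256_fmadd_ps (d = c + a*b, despite the copied
+                       "c - (a*b)" comments elsewhere) and subtracted from B in one
+                       _mm256_sub_ps later. A scalar sketch of one group, with
+                       hypothetical names (k is the already solved column, j + n the
+                       column being updated):
+
+                           for (int n = 0; n < 8; n++)      // 8 columns in flight
+                               for (int r = 0; r < 8; r++)  // 8 rows per ymm
+                                   acc[n][r] -= L[j + n][k] * x[k][r];
+                    */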
mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + 
mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + 
mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c 
- (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + } + i2 += cs_b_offset[6]; + i += cs_l_offset[6]; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + { + i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); + + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); +#endif + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + +#if GEMM_ACCUM_A + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#else + mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#endif + +#if GEMM_ACCUM_A + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], 
mat_b_rearr[1]); + mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); + mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); + mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); + mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); + mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); + mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], 
mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) until (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) until (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) until (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A76 to register + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + + //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + + _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]); + _mm256_storeu_ps((float 
*)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + } + } + } //numRows of A + ///////////////////loop ends ///////////////////// +} + +static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) +{ + //float ones = 1.0; + int i, i1, i2, i3, i4, j, k, l, r; + int cs_b_offset[7]; + int cs_l_offset[7]; + float *ptr_b_dup, *ptr_l_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_blk_elems[8]; + //__m256 mat_a_diag_inv[8]; + //__m256 reciprocal_diags[2]; + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + //(Row0) + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) 
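+ /* Editorial sketch: with a unit-diagonal L this 8x8 stage is pure forward
+    substitution across columns of the XAtB solve -- no reciprocal multiply of
+    the diagonal is needed, and each finished column of the solution is only
+    folded into the columns after it. A scalar equivalent of the eight FMA
+    stages in this loop (hypothetical names; X[k] stands for the 8-element
+    strip of solution column k held in mat_b_col[k]):
+        for (k = 0; k < 8; k++)
+            for (n = k + 1; n < 8; n++)
+                X[n] -= L[n + k*cs_l] * X[k];
+ */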
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); + + //i += cs_b_offset[6]; + //ptr_b_dup += cs_b_offset[6]; + i += 8; + ptr_b_dup += 8; + } + + //c = 0; + /***************** first set of 8 cols of B processing done *****************/ + ptr_b_dup = ptr_b; + i3 = 0; + i1 = 0; + //Start loop for cols of B to be processed in size of blk_width + for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row + { + ptr_l += 8; + //ptr_b += j; + //ptr_b_dup += 8; + ptr_b_dup += cs_b_offset[6]; + i1 += cs_b_offset[6]; + i3 += cs_l_offset[6]; + + i = 0; + i2 = 0; + for (r = 0; r < numCols_b; r += GEMM_BLK_V1) + { +#if GEMM_ACCUM_A + i = i1 + r; + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); +#endif + i = 0; + i2 = 0; + for (l = 0; l < j; l += 8) // move across m + { + //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) + { + /////////////////// Partial Lower 8x8 block trsm of B + ptr_l_dup = ptr_l; + i4 = i2 + r; + //Read 
the current 8 columns from the specified 8x8 block of B + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b0 with elements of indices from (8, 0) until (15, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); + mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); + mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); + mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); + mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); + mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); + mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); + mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); +#endif + //Broadcast A8,1 to A15,1 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + 
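+ /* Editorial note: the GEMM_ACCUM_A paths above and below accumulate the same
+    partial product of the sub-diagonal A panel with the already-solved B
+    columns, only with opposite sign conventions. When GEMM_ACCUM_A is set,
+    mat_b_rearr already holds the loaded B block and is updated in place with
+    fnmadd (d = c - a*b); when it is not, the products are summed positively
+    with mul/fmadd (d = c + a*b, despite the copied "d = c - (a*b)" comments
+    on those lines) and the sum is subtracted from B once, just before the
+    final 8x8 solve. */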
ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], 
mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], 
mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); + mat_a_blk_elems[2] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); + ptr_l_dup += cs_l; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + } + i2 += cs_b_offset[6]; + i += cs_l_offset[6]; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + { + i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); +#endif + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[4] = 
_mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + +#if GEMM_ACCUM_A + //(Row0): already done +#else + mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); +#endif + +#if GEMM_ACCUM_A + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); + mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); + mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); + mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); + mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); + mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); + mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], 
mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A76 to register + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup 
+ r, mat_b_rearr[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + } + } + } //numRows of A + ///////////////////loop ends ///////////////////// +} + +static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) +{ + //float ones = 1.0; + int i, i1, i2, i3, i4, j, k, l, r; + int cs_b_offset[7]; + int cs_l_offset[7]; + float *ptr_b_dup, *ptr_l_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_blk_elems[8]; + //__m256 mat_a_diag_inv[8]; + //__m256 reciprocal_diags[2]; + __m256 alphaReg; + alphaReg = _mm256_broadcast_ss((float const *)&alpha); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + +#if 0 + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + 
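+ /* Editorial note: this #if 0 block pre-broadcasts all 28 sub-diagonal
+    entries of the 8x8 triangle at once, but it indexes mat_a_blk_elems up to
+    [27] while the array is declared as __m256 mat_a_blk_elems[8]; it must
+    stay disabled unless that array is enlarged. */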
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
+    //float ones = 1.0;
+    int i, i1, i2, i3, i4, j, k, l, r;
+    int cs_b_offset[7];
+    int cs_l_offset[7];
+    float *ptr_b_dup, *ptr_l_dup;
+
+    //57 ymm (256-bit) registers used
+    __m256 mat_b_col[8];
+    __m256 mat_b_rearr[8];
+    __m256 mat_a_blk_elems[8];
+    //__m256 mat_a_diag_inv[8];
+    //__m256 reciprocal_diags[2];
+    __m256 alphaReg;
+    alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+    // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+    cs_l_offset[3] = cs_l + cs_l_offset[2];
+    cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+    cs_l_offset[5] = cs_l + cs_l_offset[4];
+    cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+    cs_b_offset[3] = cs_b + cs_b_offset[2];
+    cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+    cs_b_offset[5] = cs_b + cs_b_offset[4];
+    cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+#if 0
+    //Broadcast A10 to A70 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+    //Broadcast A21 to A71 to registers
+    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+    mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+    mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+    mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+    mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+    //Broadcast A32 to A72 to registers
+    mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+    mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+    mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+    mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+    mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+    //Broadcast A43 to A73 to registers
+    mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+    mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+    mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+    mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+    //Broadcast A54 to A74 to registers
+    mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+    mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+    mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+    //Broadcast A65 to A75 to registers
+    mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+    mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+    //Broadcast A76 to register
+    mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+#endif
+
+    /***************** first set of 8 rows of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 8)
+    {
+        /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+        //read 8x8 block of B into registers
+        mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+        mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+        mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+        mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+        mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+        mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+        mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+        mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg);
+        mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg);
+        mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg);
+        mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg);
+        mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg);
+        mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg);
+        mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg);
+        mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg);
+
+        //(Row0)
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+        mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+        mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) until (7, 0)
+        mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b)
+        mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+        mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 0) until (7, 0)
+        mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+        mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+        //(Row3): FMA operations of b3 with elements of indices from (3, 0) until (7, 0)
+        mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+        mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+        //(Row4): FMA operations of b4 with elements of indices from (4, 0) until (7, 0)
+        mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+        mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+        //(Row5): FMA operations of b5 with elements of indices from (5, 0) until (7, 0)
+        mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+        mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+        //(Row6): FMA operations of b6 with elements of indices from (6, 0) until (7, 0)
+        mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b)
+
+        mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+        //(Row7): FMA operations of b7 with elements of index (7, 0)
+        mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b)
+
+        ////////////////////////////////////////////////////////////////////////////////
+
+        //Store the computed B columns
+        _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+        //i += cs_b_offset[6];
+        //ptr_b_dup += cs_b_offset[6];
+        i += 8;
+        ptr_b_dup += 8;
+    }
+
+    //c = 0;
+    /***************** first set of 8 cols of B processing done *****************/
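The loop above is plain forward substitution with an implicit unit diagonal, vectorized so that each ymm register carries the same row element from eight B columns. A scalar sketch of the same recurrence for a single column b of B (illustrative only; L is column-major with stride cs_l):

    /* Each _mm256_fnmadd_ps above performs this inner update for eight
       B columns simultaneously; no divide is needed because the diagonal
       is assumed to be one. */
    static void trsm_lower_unitdiag_8x8_sketch(const float *L, int cs_l,
                                               float *b, float alpha)
    {
        for (int i = 0; i < 8; i++)
            b[i] *= alpha;                      /* the alphaReg multiplies */
        for (int i = 0; i < 8; i++)             /* forward substitution */
            for (int j = i + 1; j < 8; j++)
                b[j] -= L[j + i * cs_l] * b[i]; /* d = c - (a*b) */
    }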
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in size of blk_width
+    for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+    {
+        ptr_l += 8;
+        //ptr_b += j;
+        //ptr_b_dup += 8;
+        ptr_b_dup += cs_b_offset[6];
+        i1 += cs_b_offset[6];
+        i3 += cs_l_offset[6];
+
+        i = 0;
+        i2 = 0;
+        for (r = 0; r < numCols_b; r += GEMM_BLK_V1)
+        {
+#if GEMM_ACCUM_A
+            i = i1 + r;
+            //Read 8 cols of B columns of Block-to-be-solved
+            mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+            mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+            mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+            mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+            mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+            mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+            mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+            mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+            mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+            mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+            mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+            mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+            mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+            mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+            mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+#endif
+            i = 0;
+            i2 = 0;
+            for (l = 0; l < j; l += 8) // move across m
+            {
+                //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+                {
+                    /////////////////// Partial Lower 8x8 block trsm of B
+                    ptr_l_dup = ptr_l;
+                    i4 = i2 + r;
+                    //Read current 8 cols of B columns from specified 8x8 current-block of B
+                    mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+                    mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+                    mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+                    mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+                    mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+                    mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+                    mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+                    mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+                    //Broadcast A8,0 to A15,0 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    i4 = k >> 3;
+                    ptr_l_dup += cs_l;
+
+#if GEMM_ACCUM_A
+                    //(Row8): FMA operations of b0 with elements A8,0 to A15,0
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]);
+                    mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]);
+                    mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]);
+                    mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]);
+                    mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]);
+                    mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]);
+                    mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]);
+                    mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]);
+#endif
+                    //Broadcast A8,1 to A15,1 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row9): FMA operations of b1 with elements A8,1 to A15,1
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,2 to A15,2 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row10): FMA operations of b2 with elements A8,2 to A15,2
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,3 to A15,3 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row11): FMA operations of b3 with elements A8,3 to A15,3
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,4 to A15,4 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row12): FMA operations of b4 with elements A8,4 to A15,4
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,5 to A15,5 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row13): FMA operations of b5 with elements A8,5 to A15,5
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,6 to A15,6 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row14): FMA operations of b6 with elements A8,6 to A15,6
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //Broadcast A8,7 to A15,7 to registers
+                    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i));
+                    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1));
+                    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2));
+                    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3));
+                    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4));
+                    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5));
+                    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6));
+                    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7));
+                    ptr_l_dup += cs_l;
+#if GEMM_ACCUM_A
+                    //(Row15): FMA operations of b7 with elements A8,7 to A15,7
+                    mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b)
+                    mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b)
+                    mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b)
+                    mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b)
+                    mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b)
+                    mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b)
+                    mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b)
+                    mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                    mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c + (a*b)
+                    mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c + (a*b)
+                    mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c + (a*b)
+                    mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c + (a*b)
+                    mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c + (a*b)
+                    mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c + (a*b)
+                    mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c + (a*b)
+                    mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c + (a*b)
+#endif
+                    //end loop of cols
+                }
+                i2 += cs_b_offset[6];
+                i += cs_l_offset[6];
+            }
+            //trsm solve
+
+            k = 0;
+            //for (i2 = 0; i2 < numCols_b; i2 += 8)
+            {
+                i2 = i1 + r;
+                /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+#if !GEMM_ACCUM_A
+                //Read 8 cols of B columns of Block-to-be-solved
+                mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2);
+                mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2));
+                mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2));
+                mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2));
+                mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2));
+                mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2));
+                mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2));
+                mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2));
+
+                mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg);
+                mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg);
+                mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg);
+                mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg);
+                mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg);
+                mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg);
+                mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg);
+                mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg);
+#endif
+                //Broadcast A10 to A70 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+                mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+#if GEMM_ACCUM_A
+                //(Row0): already done
+
+#else
+                mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]);
+#endif
+
+#if GEMM_ACCUM_A
+                mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]);
+                mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]);
+                mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]);
+                mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]);
+                mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]);
+                mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]);
+                mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]);
+
+                //(Row1): FMA operations of b1 with elements of indices from (1, 0) until (7, 0)
+                mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#endif
+                //Broadcast A21 to A71 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+                //(Row2): FMA operations of b2 with elements of indices from (2, 0) until (7, 0)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A32 to A72 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+                //(Row3): FMA operations of b3 with elements of indices from (3, 0) until (7, 0)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A43 to A73 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+                //(Row4): FMA operations of b4 with elements of indices from (4, 0) until (7, 0)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A54 to A74 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+                //(Row5): FMA operations of b5 with elements of indices from (5, 0) until (7, 0)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A65 to A75 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+                i += cs_l;
+
+                //(Row6): FMA operations of b6 with elements of indices from (6, 0) until (7, 0)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A76 to register
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+                //(Row7): FMA operations of b7 with elements of index (7, 0)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+                ////////////////////////////////////////////////////////////////////////////////
+
+                //Store the computed B columns
+                _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + r), mat_b_rearr[1]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]);
+                //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+                k++;
+            }
+        }
+    } //numRows of A
+    ///////////////////loop ends /////////////////////
+}
+#else //rel 1.0 intrinsic kernels (NOT OPT_CACHE_BLOCKING_L1)
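The fallback kernel below handles a true (non-unit) diagonal, so before any solve it gathers the eight diagonal entries of the current 8x8 L block into one register and takes their reciprocals, replacing eight divisions per panel with one vector division up front. A self-contained sketch of the blend/divide idiom it uses (assumed helper name, not part of the patch; requires AVX; ck is column k of the block, so lane k of ck holds the diagonal element a_kk):

    #include <immintrin.h>

    /* Lane k of the result is 1.0f / ck[k]. Blend masks 0xAA, 0xCC, 0xF0
       take lanes 1,3,5,7 / 2,3,6,7 / 4..7 from the second operand, so
       three blend levels assemble all eight diagonal elements. */
    static __m256 pack_recip_diag(__m256 c0, __m256 c1, __m256 c2, __m256 c3,
                                  __m256 c4, __m256 c5, __m256 c6, __m256 c7)
    {
        __m256 d01 = _mm256_blend_ps(c0, c1, 0xAA);     /* lanes 0,1 valid */
        __m256 d23 = _mm256_blend_ps(c2, c3, 0xAA);     /* lanes 2,3 valid */
        __m256 d45 = _mm256_blend_ps(c4, c5, 0xAA);     /* lanes 4,5 valid */
        __m256 d67 = _mm256_blend_ps(c6, c7, 0xAA);     /* lanes 6,7 valid */
        __m256 d0123 = _mm256_blend_ps(d01, d23, 0xCC); /* lanes 0..3 valid */
        __m256 d4567 = _mm256_blend_ps(d45, d67, 0xCC); /* lanes 4..7 valid */
        __m256 diag  = _mm256_blend_ps(d0123, d4567, 0xF0);
        return _mm256_div_ps(_mm256_set1_ps(1.0f), diag);
    }

The kernel then re-broadcasts individual lanes of this packed register with _mm256_permute_ps plus _mm256_permute2f128_ps (the "extract diag aKK" blocks below) whenever a row of B must be scaled by 1/a_kk: the permute immediate (0x00/0x55/0xAA/0xFF) selects the element within each 128-bit half, and the permute2f128 selector (0x00 or 0x11) picks the low or high half.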
+static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+    float ones = 1.0;
+    int i, i1, i2, i3, i4, j, k, l;
+    int cs_b_offset[7];
+    int cs_l_offset[7];
+    float *ptr_b_dup;
+
+    //57 ymm (256-bit) registers used
+    __m256 mat_b_col[8];
+    __m256 mat_b_rearr[16][8];
+    __m256 mat_a_cols_rearr[8];
+    __m256 mat_a_blk_elems[64];
+    __m256 mat_a_diag_inv[8];
+    __m256 reciprocal_diags[2];
+
+    reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+
+    // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+    //L matrix offsets
+    cs_l_offset[0] = (cs_l << 1);
+    cs_l_offset[1] = cs_l + cs_l_offset[0];
+    cs_l_offset[2] = (cs_l << 2);
+    cs_l_offset[3] = cs_l + cs_l_offset[2];
+    cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+    cs_l_offset[5] = cs_l + cs_l_offset[4];
+    cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+    //read diag elems of first 8x8 block of L
+    mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l);
+    mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+    mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+    mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+    mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+    mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+    mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+    mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+    cs_b_offset[0] = (cs_b << 1);
+    cs_b_offset[1] = cs_b + cs_b_offset[0];
+    cs_b_offset[2] = (cs_b << 2);
+    cs_b_offset[3] = cs_b + cs_b_offset[2];
+    cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+    cs_b_offset[5] = cs_b + cs_b_offset[4];
+    cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+    reciprocal_diags[1] = reciprocal_diags[0];
+
+    //pack first 8 diags together
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1
+    mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5
+    mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+    mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+    mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+    //reciprocal of diagonal elements 0,1,2,3,4,5,6,7
+    reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+    //Broadcast A10 to A70 to registers
+    mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+    mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+    mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+    mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+    mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+    mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+    mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+    //Broadcast A21 to A71 to registers
+    mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+    mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+    mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+    mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+    mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+    mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+    //Broadcast A32 to A72 to registers
+    mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+    mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+    mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+    mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+    mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+    //Broadcast A43 to A73 to registers
+    mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+    mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+    mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+    mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+    //Broadcast A54 to A74 to registers
+    mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+    mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+    mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+    //Broadcast A65 to A75 to registers
+    mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+    mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+    //Broadcast A76 to register
+    mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+    //extract diag a00 from a
+    mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+    //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]);
+    //extract diag a11 from a
+    mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+    mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+    //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+    //extract diag a22 from a
+    mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+    mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+    //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+    //extract diag a33 from a
+    mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+    mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+    //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+    //extract diag a44 from a
+    mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+    mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+    //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+    //extract diag a55 from a
+    mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+    mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+    //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+    //extract diag a66 from a
+    mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+    mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+    //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+    //extract diag a77 from a
+    mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+    mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+    //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+    /***************** first set of 8 rows of B processing starts *****************/
+    ptr_b_dup = ptr_b;
+    i = 0;
+    for (j = 0; j < numCols_b; j += 8)
+    {
+        /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+        //read 8x8 block of B into registers
+        mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+        mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+        mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+        mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+        mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+        mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+        mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+        mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+        //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+        mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+        //(Row1): FMA operations of b1 with elements of indices from (1, 0) until (7, 0)
+        mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+        mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+        mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+        mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+        mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+        //(Row2): FMA operations of b2 with elements of indices from (2, 0) until (7, 0)
+        mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+        mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+        mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+        mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+        mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+        //(Row3): FMA operations of b3 with elements of indices from (3, 0) until (7, 0)
+        mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+        mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+        mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+        mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+        //(Row4): FMA operations of b4 with elements of indices from (4, 0) until (7, 0)
+        mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+        mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+        mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]);
+
+        //(Row5): FMA operations of b5 with elements of indices from (5, 0) until (7, 0)
+        mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+        mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]);
+
+        //(Row6): FMA operations of b6 with elements of indices from (6, 0) until (7, 0)
+        mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+        mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]);
+
+        //(Row7): FMA operations of b7 with elements of index (7, 0)
+        mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+        //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+        mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]);
+
+        ////////////////////////////////////////////////////////////////////////////////
+
+        //Store the computed B columns
+        _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+        _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+        //i += cs_b_offset[6];
+        //ptr_b_dup += cs_b_offset[6];
+        i += 8;
+        ptr_b_dup += 8;
+    }
+
+    //c = 0;
+    /***************** first set of 8 cols of B processing done *****************/
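Before each subsequent 8x8 diagonal block is solved, the loop that follows first subtracts the contributions of all previously solved 8-row blocks from the rows of B being processed, effectively a small GEMM accumulated with fnmadd. A scalar sketch of that update (hypothetical names, not part of the patch; column-major L with stride cs_l):

    /* b_cur holds the 8 not-yet-solved rows of one B column for block j;
       b_solved[l] holds the 8 rows solved in earlier block iterations.
       Each group of eight _mm256_fnmadd_ps below applies one (row, col)
       term across eight B columns at once, using 64 broadcast registers. */
    static void block_update_sketch(const float *L, int cs_l, int j,
                                    float b_cur[8], const float b_solved[][8])
    {
        for (int l = 0; l < j; l++)               /* solved 8-row blocks */
            for (int col = 0; col < 8; col++)     /* column within block l */
                for (int row = 0; row < 8; row++) /* row of current block */
                    b_cur[row] -= L[(j * 8 + row) + (l * 8 + col) * cs_l]
                                  * b_solved[l][col];
    }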
+    ptr_b_dup = ptr_b;
+    i3 = 0;
+    i1 = 0;
+    //Start loop for cols of B to be processed in size of blk_width
+    for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+    {
+        ptr_l += 8;
+        //ptr_b += j;
+        //ptr_b_dup += 8;
+        ptr_b_dup += cs_b_offset[6];
+        i1 += cs_b_offset[6];
+
+        //Read next 8x8 block of A to get diag elements
+        i3 += cs_l_offset[6];
+        mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l + i3);
+        mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+        mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+        mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+        mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+        mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+        mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+        mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+        //pack 8 diags of A together
+        reciprocal_diags[0] = reciprocal_diags[1];
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1
+        mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3
+        mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5
+        mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+        mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+        mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+        //reciprocal of diagonal elements of A :- 0,1,2,3,4,5,6,7
+        reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+        i = 0;
+        i2 = 0;
+        for (k = 0; k < numCols_b; k += 8)
+        {
+            i = i1 + k;
+            //Read 8 cols of B columns of Block-to-be-solved
+            mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+            mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+            mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+            mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+            mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+            mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+            mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+            mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+            i2++;
+        }
+
+        i = 0;
+        i2 = 0;
+        for (l = 0; l < j; l += 8) // move across m
+        {
+            //Broadcast A8,0 to A15,0 to registers
+            mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+            mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+            mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+            mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+            mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+            mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+            mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+            mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+            //Broadcast A8,1 to A15,1 to registers
+            mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+            mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+            mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+            mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+            mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+            mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+            mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+            mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+
+            //Broadcast A8,2 to A15,2 to registers
+            mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+            mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+            mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+            mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+            mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+            mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+            mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+            mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+
+            //Broadcast A8,3 to A15,3 to registers
+            mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+            mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+            mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+            mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+            mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+            mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+            mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+            mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+
+            // _mm256_permute2f128_ps()
+
+            //Broadcast A8,4 to A15,4 to registers
+            mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+            mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+            mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+            mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+            mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+            mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+            mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+            mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+
+            //Broadcast A8,5 to A15,5 to registers
+            mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+            mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+            mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+            mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+            mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+            mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+            mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+            mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+
+            //Broadcast A8,6 to A15,6 to registers
+            mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+            mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+            mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+            mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+            mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+            mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+            mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+            mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+
+            //Broadcast A8,7 to A15,7 to registers
+            mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+            mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+            mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+            mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+            mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+            mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+            mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+            mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+
+            i += cs_l_offset[6];
+
+            for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+            {
+                /////////////////// Partial Lower 8x8 block trsm of B
+
+                i4 = i2 + k;
+                //Read current 8 cols of B columns from specified 8x8 current-block of B
+                mat_b_col[0] =
_mm256_loadu_ps((float const *)ptr_b + i4);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+ i4 = k >> 3;
+
+ //(Row8): FMA operations of b8 to b15 with elements A(8, 0) till A(15, 0)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row9): FMA operations of b8 to b15 with elements A(8, 1) till A(15, 1)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row10): FMA operations of b8 to b15 with elements A(8, 2) till A(15, 2)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row11): FMA operations of b8 to b15 with elements A(8, 3) till A(15, 3)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row12): FMA operations of b8 to b15 with elements A(8, 4) till A(15, 4)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row13): FMA operations of b8 to b15 with elements A(8, 5) till A(15, 5)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row14): FMA operations of b8 to b15 with elements A(8, 6) till A(15, 6)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+
mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) + + //end loop of cols + } + i2 += cs_b_offset[6]; + } + + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = 
_mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + k = 0; + for (i = 0; i < numCols_b; i+=8) + { + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) + mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) + mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) + mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) + 
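+ //Note: _mm256_fnmadd_ps(a, b, c) evaluates c - (a*b) in one fused
+ //multiply-add; each FMA group in this loop subtracts the contribution
+ //of an already-solved row before the diagonal-reciprocal multiply,
+ //i.e. plain forward substitution on the lower-triangular 8x8 block.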
mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+ mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) till (7, 0)
+ mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+ mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) till (7, 0)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+ mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) till (7, 0)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+ mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]);
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) till (7, 0)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+ mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]);
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) till (7, 0)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] =
_mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B + mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B + mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]); + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + + _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + } + + + } + ///////////////////loop ends ///////////////////// +} + +static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) +{ + float ones = 1.0; + int i, i1, i2, i3, i4, j, k, l; + int cs_b_offset[7]; + int cs_l_offset[7]; + float *ptr_b_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[16][8]; + __m256 mat_a_cols_rearr[8]; + __m256 mat_a_blk_elems[64]; + __m256 mat_a_diag_inv[8]; + __m256 reciprocal_diags[2]; + __m256 alphaReg; + + reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); + alphaReg = _mm256_broadcast_ss((float const *)&alpha); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + //read diag elems of L 16x16 block + mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l); + mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); + mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); + mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); + mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); + mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); + mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); + mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + + reciprocal_diags[1] = reciprocal_diags[0]; + + //pack first 8 
diags together + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 + mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 + mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 + + //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 + reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); + + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] 
= _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg); + mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg); + mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg); + mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg); + mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg); + mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg); + mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg); + mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg); + + //(Row0): Perform mul operation of reciprocal 
of L(0,0) element with 1st row elements of B
+ mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) till (7, 0)
+ mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+ mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+ mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) till (7, 0)
+ mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+ mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) till (7, 0)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+ mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) till (7, 0)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+ mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]);
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) till (7, 0)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+ mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]);
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) till (7, 0)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+ mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]);
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+ mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]);
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+ //i += cs_b_offset[6];
+ //ptr_b_dup += cs_b_offset[6];
+ i += 8;
+ ptr_b_dup += 8;
+ }
+
+ //c = 0;
+ /***************** first set of 8 cols of B processing done *****************/
+ ptr_b_dup = ptr_b;
+ i3 = 0;
+ i1 = 0;
+ //Start loop for cols of B to be processed in size of blk_width
+ for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+ {
+ ptr_l += 8;
+ //ptr_b += j;
+ //ptr_b_dup += 8;
+ ptr_b_dup += cs_b_offset[6];
+ i1 += cs_b_offset[6];
+
+ //Read next 8x8 block of A to get diag elements
+ i3 += cs_l_offset[6];
+ mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l + i3);
+ mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+ mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+ mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+ mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+ mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+ mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+ mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+ //pack 8 diags of A together
+ reciprocal_diags[0] = reciprocal_diags[1];
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1
+ mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3
+ mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5
+ mat_a_diag_inv[3] =
_mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 + + //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 + reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); + + i = 0; + i2 = 0; + for (k = 0; k < numCols_b; k += 8) + { + i = i1 + k; + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg); + mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg); + mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg); + mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg); + mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg); + mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg); + mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg); + mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg); + + i2++; + } + + i = 0; + i2 = 0; + for (l = 0; l < j; l += 8) // move across m + { + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); + + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); + mat_a_blk_elems[18] = _mm256_broadcast_ss((float 
const *)(ptr_l + cs_l_offset[0] + i + 2)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); + + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); + mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); + mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); + mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); + mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); + + // _mm256_permute2f128_ps() + + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); + mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); + mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); + mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); + mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); + mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); + mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); + mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); + + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); + mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); + mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); + mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); + mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); + mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); + mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); + mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); + + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); + mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); + mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); + mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); + mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); + mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); + mat_a_blk_elems[54] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); + mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); + + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); + mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); + mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); + mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); + mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); + mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); + mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); + mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); + + i += cs_l_offset[6]; + + + for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) + { + /////////////////// Partial Lower 8x8 block trsm of B + + i4 = i2 + k; + //Read current 8 cols of B columns from specified 8x8 current-block of B + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + i4 = k >> 3; + + //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], 
mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], 
mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) + + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) + mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) + mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) + mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) + mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) + mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) + mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) + mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) + + //end loop of cols + } + i2 += cs_b_offset[6]; + } + + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = 
_mm256_broadcast_ss((float const *)(ptr_l + i + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); 
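+ //Note: the permute above fills each 128-bit lane with that lane's
+ //element 3 (low lane: 1/a33, high lane: 1/a77); the permute2f128 below
+ //with imm8 0x11 then copies the high lane into both lanes, leaving
+ //1/a77 broadcast across all 8 floats.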
+ mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+ //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+ k = 0;
+ for (i = 0; i < numCols_b; i+=8)
+ {
+ /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+
+ //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+ mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]);
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+ mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+ mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+ mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+ mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+ mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]);
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+ mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]);
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+ mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]);
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+ mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]);
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //Store the computed B columns
+
+ _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+ k++;
+ }
+
+
+ }
+ ///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+ //float ones = 1.0;
+ int i, i1, i2, i3, i4, j, k, l;
+ int cs_b_offset[7];
+ int cs_l_offset[7];
+ float *ptr_b_dup;
+
+ //uses 57 ymm (256-bit) registers
+ __m256 mat_b_col[8];
+ __m256 mat_b_rearr[16][8];
+ //__m256 mat_a_cols_rearr[8];
+ __m256 mat_a_blk_elems[64];
+ //__m256 mat_a_diag_inv[8];
+ //__m256 reciprocal_diags[2];
+
+ // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+ //L matrix offsets
+ cs_l_offset[0] = (cs_l << 1);
+ cs_l_offset[1] = cs_l + cs_l_offset[0];
+ cs_l_offset[2] = (cs_l << 2);
+ cs_l_offset[3] = cs_l + cs_l_offset[2];
+ cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+ cs_l_offset[5] = cs_l + cs_l_offset[4];
+ cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+ cs_b_offset[0] = (cs_b << 1);
+ cs_b_offset[1] = cs_b + cs_b_offset[0];
+ cs_b_offset[2] = (cs_b << 2);
+ cs_b_offset[3] = cs_b + cs_b_offset[2];
+ cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+ cs_b_offset[5] = cs_b + cs_b_offset[4];
+ cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+ //Broadcast A65 to A75 to registers
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+ //Broadcast A76 to register
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+
+ /***************** first set of 8 rows of B processing starts *****************/
+ ptr_b_dup = ptr_b;
+ i = 0;
+ for (j = 0; j < numCols_b; j += 8)
+ {
+ /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+ //read 8x8 block of B into registers
+ mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ //(Row0)
+ mat_b_col[0] = mat_b_rearr[0][0];
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+ mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+ //i += cs_b_offset[6];
+ //ptr_b_dup += cs_b_offset[6];
+ i += 8;
+ ptr_b_dup += 8;
+ }
+
+ //c = 0;
+ /***************** first set of 8 rows of B processing done *****************/
+ ptr_b_dup = ptr_b;
+ i3 = 0;
+ i1 = 0;
+ //Start loop for cols of B to be processed in size of blk_width
+ for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+ {
+ ptr_l += 8;
+ //ptr_b += j;
+ //ptr_b_dup += 8;
+ ptr_b_dup += cs_b_offset[6];
+ i1 += cs_b_offset[6];
+ i3 += cs_l_offset[6];
+
+ i = 0;
+ i2 = 0;
+ for (k = 0; k < numCols_b; k += 8)
+ {
+ i = i1 + k;
+ //Read 8 cols of B of the block-to-be-solved
+ mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+ i2++;
+ }
+
+ i = 0;
+ i2 = 0;
+ for (l = 0; l < j; l += 8) // move across m
+ {
+ //Broadcast A8,0 to A15,0 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+ //Broadcast A8,1 to A15,1 to registers
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+
+ //Broadcast A8,2 to A15,2 to registers
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+
+ //Broadcast A8,3 to A15,3 to registers
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+ mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+ mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+ mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+ mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+
+ // _mm256_permute2f128_ps()
+
+ //Broadcast A8,4 to A15,4 to registers
+ mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+ mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+ mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+ mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+ mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+ mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+ mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+ mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+
+ //Broadcast A8,5 to A15,5 to registers
+ mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+ mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+ mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+ mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+ mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+ mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+ mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+ mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+
+ //Broadcast A8,6 to A15,6 to registers
+ mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+ mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+ mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+ mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+ mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+ mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+ mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+ mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+
+ //Broadcast A8,7 to A15,7 to registers
+ mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+ mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+ mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+ mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+ mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+ mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+ mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+ mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+
+ i += cs_l_offset[6];
+
+ for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+ {
+ /////////////////// Partial Lower 8x8 block trsm of B
+
+ i4 = i2 + k;
+ //Read the current 8 cols of B from the current 8x8 block of B
+ mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+ i4 = k >> 3;
+
+ //(Row8): FMA operations of b0 with elements of indices from (8, 0) up to (15, 0)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row9): FMA operations of b1 with elements of indices from (8, 1) up to (15, 1)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row10): FMA operations of b2 with elements of indices from (8, 2) up to (15, 2)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row11): FMA operations of b3 with elements of indices from (8, 3) up to (15, 3)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row12): FMA operations of b4 with elements of indices from (8, 4) up to (15, 4)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row13): FMA operations of b5 with elements of indices from (8, 5) up to (15, 5)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row14): FMA operations of b6 with elements of indices from (8, 6) up to (15, 6)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row15): FMA operations of b7 with elements of indices from (8, 7) up to (15, 7)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //end loop of cols
+ }
+ i2 += cs_b_offset[6];
+ }
+
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A65 to A75 to registers
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A76 to register
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+ k = 0;
+ for (i = 0; i < numCols_b; i+=8)
+ {
+ /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+
+ //(Row0): already done
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+ mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //Store the computed B columns
+
+ _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+ //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+ k++;
+ }
+
+
+ }
+ ///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
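+ /* Note (added commentary, not taken from the original patch): this routine
+    mirrors trsm_XAtB_block_allSmallSizedMatrices_unitDiag above; the only
+    difference is that every 8x8 block of B is scaled by alphaReg (alpha
+    broadcast across a ymm register) immediately after it is loaded, before
+    the unit-diagonal forward-substitution steps, so no reciprocal-diagonal
+    work is required. */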
+ //float ones = 1.0;
+ int i, i1, i2, i3, i4, j, k, l;
+ int cs_b_offset[7];
+ int cs_l_offset[7];
+ float *ptr_b_dup;
+
+ //uses 57 ymm (256-bit) registers
+ __m256 mat_b_col[8];
+ __m256 mat_b_rearr[16][8];
+ //__m256 mat_a_cols_rearr[8];
+ __m256 mat_a_blk_elems[64];
+ //__m256 mat_a_diag_inv[8];
+ //__m256 reciprocal_diags[2];
+ __m256 alphaReg;
+ alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+ // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+ //L matrix offsets
+ cs_l_offset[0] = (cs_l << 1);
+ cs_l_offset[1] = cs_l + cs_l_offset[0];
+ cs_l_offset[2] = (cs_l << 2);
+ cs_l_offset[3] = cs_l + cs_l_offset[2];
+ cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+ cs_l_offset[5] = cs_l + cs_l_offset[4];
+ cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+ cs_b_offset[0] = (cs_b << 1);
+ cs_b_offset[1] = cs_b + cs_b_offset[0];
+ cs_b_offset[2] = (cs_b << 2);
+ cs_b_offset[3] = cs_b + cs_b_offset[2];
+ cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+ cs_b_offset[5] = cs_b + cs_b_offset[4];
+ cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+ //Broadcast A65 to A75 to registers
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+ //Broadcast A76 to register
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+
+ /***************** first set of 8 rows of B processing starts *****************/
+ ptr_b_dup = ptr_b;
+ i = 0;
+ for (j = 0; j < numCols_b; j += 8)
+ {
+ /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+ //read 8x8 block of B into registers
+ mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg);
+ mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg);
+ mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg);
+ mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg);
+ mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg);
+ mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg);
+ mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg);
+ mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg);
+
+ //(Row0)
+ mat_b_col[0] = mat_b_rearr[0][0];
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+ mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+ mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+ mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+ mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+ mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+ mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+ //i += cs_b_offset[6];
+ //ptr_b_dup += cs_b_offset[6];
+ i += 8;
+ ptr_b_dup += 8;
+ }
+
+ //c = 0;
+ /***************** first set of 8 rows of B processing done *****************/
+ ptr_b_dup = ptr_b;
+ i3 = 0;
+ i1 = 0;
+ //Start loop for cols of B to be processed in size of blk_width
+ for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+ {
+ ptr_l += 8;
+ //ptr_b += j;
+ //ptr_b_dup += 8;
+ ptr_b_dup += cs_b_offset[6];
+ i1 += cs_b_offset[6];
+ i3 += cs_l_offset[6];
+
+ i = 0;
+ i2 = 0;
+ for (k = 0; k < numCols_b; k += 8)
+ {
+ i = i1 + k;
+ //Read 8 cols of B of the block-to-be-solved
+ mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg);
+ mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg);
+ mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg);
+ mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg);
+ mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg);
+ mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg);
+ mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg);
+ mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg);
+
+ i2++;
+ }
+
+ i = 0;
+ i2 = 0;
+ for (l = 0; l < j; l += 8) // move across m
+ {
+ //Broadcast A8,0 to A15,0 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+
+ //Broadcast A8,1 to A15,1 to registers
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+
+ //Broadcast A8,2 to A15,2 to registers
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+
+ //Broadcast A8,3 to A15,3 to registers
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+ mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+ mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+ mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+ mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+
+ // _mm256_permute2f128_ps()
+
+ //Broadcast A8,4 to A15,4 to registers
+ mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+ mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+ mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+ mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+ mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+ mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+ mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+ mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+
+ //Broadcast A8,5 to A15,5 to registers
+ mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+ mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+ mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+ mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+ mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+ mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+ mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+ mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+
+ //Broadcast A8,6 to A15,6 to registers
+ mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+ mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+ mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+ mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+ mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+ mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+ mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+ mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+
+ //Broadcast A8,7 to A15,7 to registers
+ mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+ mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+ mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+ mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+ mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+ mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+ mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+ mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+
+ i += cs_l_offset[6];
+
+ for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+ {
+ /////////////////// Partial Lower 8x8 block trsm of B
+
+ i4 = i2 + k;
+ //Read the current 8 cols of B from the current 8x8 block of B
+ mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+ i4 = k >> 3;
+
+ //(Row8): FMA operations of b0 with elements of indices from (8, 0) up to (15, 0)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row9): FMA operations of b1 with elements of indices from (8, 1) up to (15, 1)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row10): FMA operations of b2 with elements of indices from (8, 2) up to (15, 2)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row11): FMA operations of b3 with elements of indices from (8, 3) up to (15, 3)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row12): FMA operations of b4 with elements of indices from (8, 4) up to (15, 4)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row13): FMA operations of b5 with elements of indices from (8, 5) up to (15, 5)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row14): FMA operations of b6 with elements of indices from (8, 6) up to (15, 6)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //(Row15): FMA operations of b7 with elements of indices from (8, 7) up to (15, 7)
+ mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+ mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+ mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+ mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+ mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+ mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+ mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+ mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+ //end loop of cols
+ }
+ i2 += cs_b_offset[6];
+ }
+
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+ i += cs_l;
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float
const *)(ptr_l + i + 7)); + i += cs_l; + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + i += cs_l; + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); + + k = 0; + for (i = 0; i < numCols_b; i+=8) + { + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A + + //(Row0): already done + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) + mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) + mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) + mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) + mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) + mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) + mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], 
mat_b_rearr[k][5]);//d = c - (a*b) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) + + //////////////////////////////////////////////////////////////////////////////// + + //Store the computed B columns + + _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + } + + + } + ///////////////////loop ends ///////////////////// +} +#endif //OPT_CACHE_BLOCKING_L1 + +//////////////////////////// AutX=B /////////////////////// +static void trsm_AutXB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) +{ + float ones = 1.0; + int i, i1, i2, i3, i4, j, k, l, r; + int cs_b_offset[7]; + int cs_l_offset[7]; + float *ptr_b_dup, *ptr_l_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_blk_elems[8]; + __m256 mat_a_diag_inv[8]; + __m256 reciprocal_diags[2]; + + reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + //read diag elems of L 16x16 block + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); + mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = 
cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + + reciprocal_diags[1] = reciprocal_diags[0]; + + //pack first 8 diags together + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 + mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 + mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 + + //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 + reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); +#if 0 + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l_offset[4] + 7)); +#endif + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + 
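+ /*
+ Illustrative note (a sketch, not part of the kernel): the unpacklo/
+ unpackhi + shuffle + permute2f128 sequence here is the standard 8x8
+ in-register float transpose. B is stored column-major, but the solve
+ below wants one ymm register per row of the 8x8 block, so the block
+ is transposed on load and transposed back before the store. The
+ REARRANGE_SHFL == 1 path pairs 64-bit lanes with _mm256_shuffle_ps
+ (0x44 keeps the low pairs, 0xEE the high pairs); the #else path gets
+ the same result from one shuffle (0x4E swaps the pairs) plus blends,
+ which can spread the work across more execution ports. A scalar model
+ of the whole transpose, assuming a hypothetical row-major tmp[8][8]:
+
+    for (int r = 0; r < 8; r++)
+        for (int c = 0; c < 8; c++)
+            tmp[r][c] = blk[c][r];
+ */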
mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ + + + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) + 
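+ /*
+ Sketch of the solve pattern in Row0..Row7 (illustrative scalar model
+ with hypothetical names, not part of the kernel): each row of the 8x8
+ B block is scaled by the precomputed reciprocal of its diagonal and is
+ then eliminated from all later rows with fnmadd (d = c - a*b), i.e.
+ forward substitution over the transposed block:
+
+    for (r = 0; r < 8; r++) {
+        b[r] *= inv_diag[r];                 // row r is now solved
+        for (s = r + 1; s < 8; s++)
+            b[s] -= a_tri[s][r] * b[r];      // eliminate from later rows
+    }
+
+ One vector div builds all eight reciprocals up front, so the serial
+ dependency chain of the solve contains only mul/fma, never div, and
+ each vector fnmadd retires this update for 8 columns of B at once.
+ */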
mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); + + //////////////////////////////////////////////////////////////////////////////// + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = 
_mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); + + i += cs_b_offset[6]; + ptr_b_dup += cs_b_offset[6]; + //i += 8; + //ptr_b_dup += 8; + } + + //c = 0; + /***************** first set of 8 cols of B processing done *****************/ + ptr_b_dup = ptr_b; + i3 = 0; + i1 = 0; + //Start loop for cols of B to be processed in size of blk_width + for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row + { + ptr_l += cs_l_offset[6]; + + //Read next 8x8 block of A to get diag elements + i3 += 8; + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); + mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + 
cs_l_offset[4]); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); + + //pack 8 diags of A together + reciprocal_diags[0] = reciprocal_diags[1]; + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 + mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 + mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 + mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 + mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 + + //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 + reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); + + //ptr_b += j; + //ptr_b_dup += 8; + ptr_b_dup += 8; + i1 += 8; + i = i1; + i2 = 0; + + //extract diag a00 from a + mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); + //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); + + //extract diag a11 from a + mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); + //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); + + //extract diag a22 from a + mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); + //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); + + //extract diag a33 from a + mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); + //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); + + //extract diag a44 from a + mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); + mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); + //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); + + //extract diag a55 from a + mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); + mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); + //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); + + //extract diag a66 from a + mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); + mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); + //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); + + //extract diag a77 from a + mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); + mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); + //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); + + for (r = 0; r < numCols_b; r += GEMM_BLK_V1) + { +#if GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b 
+ cs_b_offset[0] + i)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ +#endif + //i = 0; + ptr_l_dup = ptr_l; + i4 = i2; + for (l = 0; l < j; l += 8) // move across m + { + //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l 
(index of m) + //{ + /////////////////// Partial Lower 8x8 block trsm of B + //Read current 8 cols of B columns from specified 8x8 current-block of B + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); + mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); +#else + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); + mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); + mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); + mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); + mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], 
mat_a_blk_elems[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); + /* transpose steps end */ + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i4 = k >> 3; + ptr_l_dup++; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b1 with elements of indices from (8, 0) up to (15, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); + mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); + mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); + mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); + mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); + mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); + mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); + mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); +#endif + //Broadcast A8,1 to A15,1 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row9): FMA operations of b2 with elements of indices from (8, 1) up to (15, 1) + 
mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c + (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c + (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c + (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c + (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c + (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c + (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c + (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c + (a*b) +#endif + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b3 with elements of indices from (8, 2) up to (15, 2) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c + (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c + (a*b) + mat_b_rearr[2] = 
_mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c + (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c + (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c + (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c + (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c + (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c + (a*b) +#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b4 with elements of indices from (8, 3) up to (15, 3) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c + (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c + (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c + (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c + (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c + (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c + (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c + (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c + (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + 
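+ /*
+ Note on the two compile paths above (illustrative summary): with
+ GEMM_ACCUM_A the partial products are folded straight into the
+ transposed B block via fnmadd (d = c - a*b), so mat_b_rearr already
+ holds B minus the contribution of the previously solved rows. In the
+ #else path the first row uses mul and the rest use fmadd
+ (d = c + a*b), so mat_b_rearr instead accumulates the plain product,
+ which presumably gets subtracted from B further down the loop. Rough
+ scalar model of one column step c, with hypothetical names:
+
+    #if GEMM_ACCUM_A
+        acc[r] = acc[r] - A[r][c] * x[c];   // accumulate into B directly
+    #else
+        acc[r] = acc[r] + A[r][c] * x[c];   // product only; combine later
+    #endif
+ */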
mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b5 with elements of indices from (8, 4) up to (15, 4) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c + (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c + (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c + (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c + (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c + (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c + (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c + (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c + (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b6 with elements of indices from (8, 5) up to (15, 5) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], 
mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + 
mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + //} + //i2 += cs_b_offset[6]; + i4 += 8; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + //{ + //i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = 
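+ /*
+ GEMM_ACCUM_A controls how the block-GEMM update above is accumulated:
+ when enabled, the products are subtracted directly from the in-flight B
+ values with fnmadd (d = c - a*b); when disabled, the loop only gathers
+ a*b partial sums with fmadd, and that accumulated sum is subtracted from
+ the freshly loaded B block just before the triangular solve below.
+ */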
+ //trsm solve
+
+ k = 0;
+ //for (i2 = 0; i2 < numCols_b; i2 += 8)
+ //{
+ //i2 = i1 + r;
+ /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+#if !GEMM_ACCUM_A
+ //Read 8 columns of B of the block to be solved
+ mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+ /* transpose steps end */
+#endif
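+ /*
+ The unpacklo/unpackhi + shuffle (or shuffle+blend) + permute2f128
+ sequence above is an in-register 8x8 float transpose: unpack interleaves
+ pairs of column vectors, the shuffles rearrange 64-bit lane pairs within
+ each 128-bit half, and permute2f128 recombines the halves into full
+ rows. B is addressed by column stride, so transposing lets the solve
+ below operate on rows of the 8x8 block with full-width registers.
+ */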
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3]));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4]));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5]));
+ //i += cs_l;
+
+#if GEMM_ACCUM_A
+ //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+ mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+#else
+ mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]);
+ mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+#endif
+
+#if GEMM_ACCUM_A
+ mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+ mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+ mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+ mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+ mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]);
+ mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]);
+ mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]);
+ mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]);
+ mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]);
+ mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]);
+ mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]);
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+ mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+ mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+ mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#endif
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4]));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5]));
+ //i += cs_l;
+
+ //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+ mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+ mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+ mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5]));
+ //i += cs_l;
+
+ //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+ mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+ mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5]));
+ //i += cs_l;
+
+ //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+ mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5]));
+ //i += cs_l;
+
+ //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+ mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
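+ /*
+ The pattern above and below is plain forward substitution on the 8x8
+ diagonal block: row k of B is scaled by the precomputed reciprocal of
+ L(k,k) (a multiply instead of a divide), then eliminated from rows
+ k+1..7 with fnmadd using the broadcast sub-diagonal elements of L.
+ */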
+ //Broadcast A65 to A75 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5]));
+ //i += cs_l;
+
+ //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+ mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+ //Broadcast A76 to register
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5]));
+
+ //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+ mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+ mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+ /* transpose steps end */
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]);
+ //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+ k++;
+ //}
+ i += cs_b_offset[6];
+ i2 += cs_b_offset[6];
+ }
+ } //numRows of A
+ ///////////////////loop ends /////////////////////
+}
+
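+/*
+ The routine below appears to mirror the non-alpha variant of this
+ small-matrix TRSM kernel, with one difference: the right-hand side is
+ scaled by alpha (via alphaReg) as each 8x8 block of B is loaded, so
+ B := alpha * B is folded into the solve instead of being a separate pass.
+*/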
+static void trsm_AutXB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
+ float ones = 1.0;
+ int i, i1, i2, i3, i4, j, k, l, r;
+ int cs_b_offset[7];
+ int cs_l_offset[7];
+ float *ptr_b_dup, *ptr_l_dup;
+
+ //57 ymm (256-bit) registers used
+ __m256 mat_b_col[8];
+ __m256 mat_b_rearr[8];
+ __m256 mat_a_blk_elems[8];
+ __m256 mat_a_diag_inv[8];
+ __m256 reciprocal_diags[2];
+ __m256 alphaReg;
+
+ reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+ alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+ // ---> considering that the matrix size is a multiple of 16 rows and 8 cols <--- //
+
+ //L matrix offsets
+ cs_l_offset[0] = (cs_l << 1);
+ cs_l_offset[1] = cs_l + cs_l_offset[0];
+ cs_l_offset[2] = (cs_l << 2);
+ cs_l_offset[3] = cs_l + cs_l_offset[2];
+ cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+ cs_l_offset[5] = cs_l + cs_l_offset[4];
+ cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+ //read diag elems of L 16x16 block
+ mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l);
+ mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+ mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+ mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+ mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+ mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+ mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+ mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+ cs_b_offset[0] = (cs_b << 1);
+ cs_b_offset[1] = cs_b + cs_b_offset[0];
+ cs_b_offset[2] = (cs_b << 2);
+ cs_b_offset[3] = cs_b + cs_b_offset[2];
+ cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+ cs_b_offset[5] = cs_b + cs_b_offset[4];
+ cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+ reciprocal_diags[1] = reciprocal_diags[0];
+
+ //pack first 8 diags together
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1
+ mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3
+ mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5
+ mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+ mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+ //reciprocal of diagonal elements 0,1,2,3,4,5,6,7
+ reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
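+ /*
+ The blends above gather the eight diagonal elements of L into a single
+ ymm register (0xAA merges alternating lanes, 0xCC merges lane pairs,
+ 0xF0 merges the 128-bit halves), so one vector division produces all
+ eight reciprocals at once. The permute/permute2f128 pairs further below
+ splat each reciprocal across a full register for use in the solve.
+ */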
+#if 0
+ //Broadcast A10 to A70 to registers
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+ //Broadcast A21 to A71 to registers
+ mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+ mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+ mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+ mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+ mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+ mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+ //Broadcast A32 to A72 to registers
+ mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+ mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+ mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+ mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+ mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+ //Broadcast A43 to A73 to registers
+ mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+ mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+ mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+ mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+ //Broadcast A54 to A74 to registers
+ mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+ mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+ mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+ //Broadcast A65 to A75 to registers
+ mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+ mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+ //Broadcast A76 to register
+ mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+#endif
+ //extract diag a00 from a
+ mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+ mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+ //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]);
+ //extract diag a11 from a
+ mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+ mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+ //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+ //extract diag a22 from a
+ mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+ mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+ //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+ //extract diag a33 from a
+ mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+ mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+ //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+ //extract diag a44 from a
+ mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+ mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+ //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+ //extract diag a55 from a
+ mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+ mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+ //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+ //extract diag a66 from a
+ mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+ mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+ //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+ //extract diag a77 from a
+ mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+ mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+ //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+
+ /***************** first set of 8 rows of B processing starts *****************/
+ ptr_b_dup = ptr_b;
+ i = 0;
+ for (j = 0; j < numCols_b; j += 8)
+ {
+ /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+ //read 8x8 block of B into registers
+ mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+ mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+ mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+ mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+ /* transpose steps end */
+
+ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg);
+ mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg);
+ mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg);
+ mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg);
+ mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg);
+ mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg);
+ mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg);
+ mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg);
+
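+ /*
+ Alpha is applied up front: every transposed row of the 8x8 B block is
+ scaled by alphaReg before the solve, which is the key difference from
+ the unscaled variant of this kernel.
+ */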
+ //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3]));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4]));
+ mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5]));
+
+ //(Row1): FMA operations of b1 with elements of indices from (1, 0) up to (7, 0)
+ mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b)
+ mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b)
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+ mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4]));
+ mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5]));
+
+ //(Row2): FMA operations of b2 with elements of indices from (2, 0) up to (7, 0)
+ mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b)
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+ mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4]));
+ mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5]));
+
+ //(Row3): FMA operations of b3 with elements of indices from (3, 0) up to (7, 0)
+ mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B
+ mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4]));
+ mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5]));
+
+ //(Row4): FMA operations of b4 with elements of indices from (4, 0) up to (7, 0)
+ mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B
+ mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4]));
+ mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5]));
+
+ //(Row5): FMA operations of b5 with elements of indices from (5, 0) up to (7, 0)
+ mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B
+ mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4]));
+ mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5]));
+
+ //(Row6): FMA operations of b6 with elements of indices from (6, 0) up to (7, 0)
+ mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B
+ mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]);
+
+ mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5]));
+
+ //(Row7): FMA operations of b7 with elements of index (7, 0)
+ mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b)
+
+ //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B
+ mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]);
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+ mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+ mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+ mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+ mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+ mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+ mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+ /* transpose steps end */
+
+ //Store the computed B columns
+ _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]);
+ _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]);
+
+ i += cs_b_offset[6];
+ ptr_b_dup += cs_b_offset[6];
+ //i += 8;
+ //ptr_b_dup += 8;
+ }
+
+ //c = 0;
+ /***************** first set of 8 rows of B processing done *****************/
+ ptr_b_dup = ptr_b;
+ i3 = 0;
+ i1 = 0;
+ //Start loop for cols of B to be processed in size of blk_width
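+ /*
+ Second phase: each subsequent 8-row panel of B is handled in two steps
+ inside the loops below - a GEMM-like update that subtracts the
+ contribution of all previously solved panels (the "Partial Lower 8x8
+ block trsm"), followed by a forward-substitution solve of the panel
+ against its own 8x8 diagonal block of A, as in the first function above.
+ */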
+ for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+ {
+ ptr_l += cs_l_offset[6];
+
+ //Read next 8x8 block of A to get diag elements
+ i3 += 8;
+ mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3);
+ mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+ mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+ mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+ mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+ mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+ mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+ mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+ //pack 8 diags of A together
+ reciprocal_diags[0] = reciprocal_diags[1];
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1
+ mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3
+ mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5
+ mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+ mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+ mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+ //reciprocal of diagonal elements of A :- 0,1,2,3,4,5,6,7
+ reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+ //ptr_b += j;
+ //ptr_b_dup += 8;
+ ptr_b_dup += 8;
+ i1 += 8;
+ i = i1;
+ i2 = 0;
+
+ //extract diag a00 from a
+ mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+ mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+ //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]);
+
+ //extract diag a11 from a
+ mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+ mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+ //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+
+ //extract diag a22 from a
+ mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+ mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+ //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+
+ //extract diag a33 from a
+ mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+ mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+ //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+
+ //extract diag a44 from a
+ mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+ mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+ //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+
+ //extract diag a55 from a
+ mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+ mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+ //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+
+ //extract diag a66 from a
+ mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+ mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+ //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+
+ //extract diag a77 from a
+ mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+ mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+ //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+ for (r = 0; r < numCols_b; r += GEMM_BLK_V1)
+ {
+#if GEMM_ACCUM_A
+ //Read 8 columns of B of the block to be solved
+ mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+ mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+ mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+ mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+ mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+ mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+ mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+ mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+ mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+ mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+ mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+ mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+ mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+ mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+ mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+ mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+ mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+ mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+ mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+ mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+ mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+ ////unpackhigh////
+ mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+ mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+ mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+ mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+ //Rearrange high elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+ //Merge rearranged high elements into complete rows
+ mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+ /* transpose steps end */
+
+ mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+ mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+ mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+ mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+ mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+ mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+ mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+ mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+#endif
+
+ //i = 0;
+ ptr_l_dup = ptr_l;
+ i4 = i2;
+ for (l = 0; l < j; l += 8) // move across m
+ {
+ //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+ //{
+ /////////////////// Partial Lower 8x8 block trsm of B
+ //Read the current 8 columns of B from the specified 8x8 block of B
+ mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+ mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+ mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+ mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+ mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+ mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+ mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+ mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+ /* transpose steps start */
+ ////unpacklow////
+ mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]);
+ mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]);
+ mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]);
+ mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]);
+
+ //Rearrange low elements
+#if REARRANGE_SHFL == 1
+ mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+ mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+ mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+ mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+ mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+ mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+ mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+ mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+ //Merge rearranged low elements into complete rows
+ mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+ mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+ mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+ mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+ ////unpackhigh////
+ mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]);
+ mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]);
+ mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]);
+ mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]);
+
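+ /*
+ REARRANGE_SHFL selects between two equivalent lane-rearrangement
+ schemes for the transpose: a pure-shuffle path (REARRANGE_SHFL == 1)
+ and a shuffle+blend path; both yield the same row layout, presumably
+ as a trade-off in shuffle-port pressure on the target core.
+ */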
mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); + mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); +#else + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); + mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); + mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); + mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); + mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); + /* transpose steps end */ + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i4 = k >> 3; + ptr_l_dup++; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); + mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); + mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); + mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); + mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); + mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); + mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); + mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); 
+#endif + //Broadcast A8,1 to A15,1 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row9): FMA operations of b1 with elements of indices from (8, 1) up to (15, 1) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c + (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c + (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c + (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c + (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c + (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c + (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c + (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c + (a*b) +#endif + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b2 with elements of indices from (8, 2) up to (15, 2) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], 
mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 
cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + 
mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + //} + //i2 += cs_b_offset[6]; + i4 += 8; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + //{ + //i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[0] 
= _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* 
transpose steps end */ + + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); +#endif + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i += cs_l; + +#if GEMM_ACCUM_A + //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#else + mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); +#endif + +#if GEMM_ACCUM_A + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); + mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); + mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); + mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); + mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); + mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); + mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A21 to A71 to registers + 
mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); + //i += cs_l; + + //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); + + //(Row2): FMA operations of b1 with elements of indices from (2, 1) up to (7, 1) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5])); + //i += cs_l; + + //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); + + //(Row3): FMA operations of b2 with elements of indices from (3, 2) up to (7, 2) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); + //i += cs_l; + + //Perform mul operation of reciprocal of L(3, 3) element with 4th row elements of B + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); + + //(Row4): FMA operations of b3 with elements of indices from (4, 3) up to (7, 3) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); + //i += cs_l; + + //Perform mul operation of reciprocal of L(4, 4) element with 5th row elements of B + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); + + //(Row5): FMA operations of b4 with elements of indices from (5, 4) up to (7, 4) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); + //i += cs_l; + + //Perform mul operation of reciprocal of L(5, 5) element with 6th row elements of B + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); + + //(Row6): FMA operations of b5 with elements of indices from (6, 5) up to (7, 5) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A76 to register + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); + + //Perform mul operation of reciprocal of L(6, 6) element with 7th row elements of B + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); + + //(Row7): FMA operations of b6 with element of index (7, 6) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + //Perform mul operation of reciprocal of L(7, 7) element with 8th row elements of B + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); + + //////////////////////////////////////////////////////////////////////////////// + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + 
mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + //} + i += cs_b_offset[6]; + i2 += cs_b_offset[6]; + } + } //numRows of A + ///////////////////loop ends ///////////////////// +} + +static void trsm_AutXB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) +{ + //float ones = 1.0; + int i, i1, i2, i4, j, k, l, r; + int cs_b_offset[7]; + int cs_l_offset[7]; + float *ptr_b_dup, *ptr_l_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_blk_elems[8]; + //__m256 mat_a_diag_inv[8]; + //__m256 reciprocal_diags[2]; + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + 
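+ /* The cs_l_offset[k] values work out to (k + 2) * cs_l (and cs_b_offset[k] below to (k + 2) * cs_b): precomputed strides to columns 2 through 8 of the current block, so the inner loops index with additions only. */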
cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + +#if 0 + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //Broadcast A76 to register + mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); +#endif + + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + 
i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ + + + //(Row0) + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l_offset[0])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); + 
mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) + + + + //////////////////////////////////////////////////////////////////////////////// + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], 
mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); + + i += cs_b_offset[6]; + ptr_b_dup += cs_b_offset[6]; + //i += 8; + //ptr_b_dup += 8; + } + + //c = 0; + /***************** first set of 8 cols of B processing done *****************/ + ptr_b_dup = ptr_b; + i1 = 0; + //Start loop for cols of B to be processed in size of blk_width + for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row + { + ptr_l += cs_l_offset[6]; + + + //ptr_b += j; + //ptr_b_dup += 8; + ptr_b_dup += 8; + i1 += 8; + i = i1; + i2 = 0; + + for (r = 0; r < numCols_b; r += GEMM_BLK_V1) + { +#if GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = 
_mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ +#endif + + //i = 0; + ptr_l_dup = ptr_l; + i4 = i2; + for (l = 0; l < j; l += 8) // move across m + { + //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) + //{ + /////////////////// Partial Lower 8x8 block trsm of B + //Read current 8 cols of B columns from specified 8x8 current-block of B + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + 
mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); + mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); +#else + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); + mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); + mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); + mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); + mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); + /* transpose steps end */ + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float 
const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i4 = k >> 3; + ptr_l_dup++; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); + mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); + mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); + mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); + mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); + mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); + mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); + mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], 
mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) 
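+
+ /*
+    Note on the two GEMM_ACCUM_A paths: with GEMM_ACCUM_A enabled, each
+    partial product is subtracted straight from the alpha-scaled
+    right-hand side with _mm256_fnmadd_ps, which computes d = c - (a*b).
+    With it disabled, the products are instead summed into mat_b_rearr
+    with _mm256_mul_ps/_mm256_fmadd_ps (fmadd computes d = c + (a*b);
+    the inherited "d = c - (a*b)" comments on those fmadd lines are
+    stale) and the running sum is subtracted from the loaded RHS once,
+    via _mm256_sub_ps, just before the diagonal block is solved.
+    Scalar view of one update per 8-wide RHS vector, illustrative names:
+
+        accumulate path:  B[r] -= L(r,c) * X[c];              (fnmadd)
+        deferred path:    S[r] += L(r,c) * X[c];  B[r] -= S[r];  (fmadd, then sub)
+ */
+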
+#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + 
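+
+ /*
+    Note: each "(Row8)".."(Row15)" group in this l loop applies one
+    already-solved column vector of an earlier 8x8 block
+    (mat_b_col[0..7]) against a broadcast column of L; the "b1"/"b2"
+    row references in the inherited comments are stale copy-paste from
+    the 8x8 diagonal-block kernel.
+ */
+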
mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = 
_mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + //} + //i2 += cs_b_offset[6]; + i4 += 8; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + //{ + //i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A +#if !GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = 
_mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ +#endif + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i += cs_l; + +#if GEMM_ACCUM_A + //(Row0): already done + +#else + mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); +#endif + +#if GEMM_ACCUM_A + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) 
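+
+ /*
+    The Row1..Row7 cascade below is forward substitution for a
+    unit-diagonal lower-triangular 8x8 block, vectorized so that each
+    __m256 register carries the same row of eight right-hand-side
+    columns. A scalar reference for one RHS column is sketched below
+    under #if 0; L and b are illustrative stand-in names (L is the 8x8
+    triangular block, b holds the RHS and, on exit, the solution).
+ */
+ #if 0
+ for ( int r = 0; r < 8; r++ )
+ {
+     float x = b[r];
+     for ( int c = 0; c < r; c++ )
+         x -= L[r][c] * b[c];   /* the same d = c - (a*b) update as fnmadd */
+     b[r] = x;                  /* unit diagonal: no divide by L[r][r] needed */
+ }
+ #endif
+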
+ mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); + mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); + mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); + mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); + mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); + mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); + mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); + //i += cs_l; + + + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5])); + //i += cs_l; + + + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); + //i += cs_l; + + + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); + //i += cs_l; + + + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); + //i += cs_l; + + + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) + + //Broadcast A76 to register + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); + + + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) + + + + //////////////////////////////////////////////////////////////////////////////// + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = 
_mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); + //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); + k++; + //} + i += cs_b_offset[6]; + i2 += cs_b_offset[6]; + } + } //numRows of A + ///////////////////loop ends ///////////////////// +} + +static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) +{ + //float ones = 1.0; + int i, i1, i2, i4, j, k, l, r; + int cs_b_offset[7]; + int 
cs_l_offset[7]; + float *ptr_b_dup, *ptr_l_dup; + + //57 number of ymm(256 bits) registers used + __m256 mat_b_col[8]; + __m256 mat_b_rearr[8]; + __m256 mat_a_blk_elems[8]; + //__m256 mat_a_diag_inv[8]; + //__m256 reciprocal_diags[2]; + __m256 alphaReg; + alphaReg = _mm256_broadcast_ss((float const *)&alpha); + + // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // + + //L matrix offsets + cs_l_offset[0] = (cs_l << 1); + cs_l_offset[1] = cs_l + cs_l_offset[0]; + cs_l_offset[2] = (cs_l << 2); + cs_l_offset[3] = cs_l + cs_l_offset[2]; + cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; + cs_l_offset[5] = cs_l + cs_l_offset[4]; + cs_l_offset[6] = (cs_l_offset[5] + cs_l); + + cs_b_offset[0] = (cs_b << 1); + cs_b_offset[1] = cs_b + cs_b_offset[0]; + cs_b_offset[2] = (cs_b << 2); + cs_b_offset[3] = cs_b + cs_b_offset[2]; + cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; + cs_b_offset[5] = cs_b + cs_b_offset[4]; + cs_b_offset[6] = (cs_b_offset[5] + cs_b); + +#if 0 + //Broadcast A10 to A70 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); + + //Broadcast A21 to A71 to registers + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); + mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); + mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); + mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); + mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); + mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); + + //Broadcast A32 to A72 to registers + mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); + mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); + mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); + mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); + mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); + + //Broadcast A43 to A73 to registers + mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); + mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); + mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); + mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); + + //Broadcast A54 to A74 to registers + mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); + mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); + mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); + + //Broadcast A65 to A75 to registers + mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); + mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); + + //Broadcast A76 to register + mat_a_blk_elems[27] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); +#endif + + + /***************** first set of 8 rows of B processing starts *****************/ + ptr_b_dup = ptr_b; + i = 0; + for (j = 0; j < numCols_b; j += 8) + { + /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A + //read 8x8 block of B into registers + mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); + mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); + mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); + mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 
0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + /* transpose steps end */ + + mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); + mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); + mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); + mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); + mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); + mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); + mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); + mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); + + //(Row0) + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); + + //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); + + //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); + 
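+
+ /*
+    Addressing note: L is traversed column-major with unit row stride as
+    used here, and cs_l_offset[k] pre-computes (k+2)*cs_l with shifts
+    and adds, so every L(i,j) access below needs only additions. Each
+    scalar L(i,j) is splat across all 8 lanes so a single FMA applies it
+    to 8 right-hand-side columns at once. Illustrative one-element
+    sketch under #if 0 (lij, x_i, x_j are stand-in names):
+ */
+ #if 0
+ __m256 lij = _mm256_broadcast_ss( (float const *)(ptr_l + i + j * cs_l) );
+ x_i = _mm256_fnmadd_ps( lij, x_j, x_i );   /* x_i -= L(i,j) * x_j, 8 columns at once */
+ #endif
+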
mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); + + //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) + mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); + + //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) + mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); + + //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) + mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); + + //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) + mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) + + + + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); + + //(Row7): FMA operations of b7 with elements of index (7, 0) + mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) + + + + //////////////////////////////////////////////////////////////////////////////// + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], 
mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ + + //Store the computed B columns + _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); + _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); + _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); + + i += cs_b_offset[6]; + ptr_b_dup += cs_b_offset[6]; + //i += 8; + //ptr_b_dup += 8; + } + + //c = 0; + /***************** first set of 8 cols of B processing done *****************/ + ptr_b_dup = ptr_b; + i1 = 0; + //Start loop for cols of B to be processed in size of blk_width + for (j = 8; 
j < numRows_lb; j += 8)//m :- 8x8 block row + { + ptr_l += cs_l_offset[6]; + + + //ptr_b += j; + //ptr_b_dup += 8; + ptr_b_dup += 8; + i1 += 8; + i = i1; + i2 = 0; + + for (r = 0; r < numCols_b; r += GEMM_BLK_V1) + { +#if GEMM_ACCUM_A + //Read 8 cols of B columns of Block-to-be-solved + mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); + mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); + mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); + mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); + mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); + mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); + mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); + mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); + + /* transpose steps start */ + ////unpacklow//// + mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); + mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); + mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); + mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); + mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); +#else + mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); + mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); + mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); + mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); + mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); + mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); + mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); + mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); + mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); + + ////unpackhigh//// + mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); + mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); + mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); + mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_rearr[2] = 
_mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + /* transpose steps end */ + + mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); + mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); + mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); + mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); + mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); + mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); + mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); + mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); +#endif + + //i = 0; + ptr_l_dup = ptr_l; + i4 = i2; + for (l = 0; l < j; l += 8) // move across m + { + //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) + //{ + /////////////////// Partial Lower 8x8 block trsm of B + //Read current 8 cols of B columns from specified 8x8 current-block of B + mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); + mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); + mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); + mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); + mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); + mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); + mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); + mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); + + /* transpose steps start */ + ////unpacklow//// + mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange low elements +#if REARRANGE_SHFL == 1 + mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); + mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); +#else + mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); + mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); + mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); + mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); + mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); + mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); +#endif + //Merge rearranged low elements into complete rows + mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); + mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); + mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); + mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); + + ////unpackhigh//// + mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); + mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); + mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); + 
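+
+ /*
+    Loop-nest note: for each new 8-row block of B (outer j loop), the r
+    loop walks the B columns in strips of GEMM_BLK_V1; each strip is
+    loaded, transposed, and scaled by alpha once, then this inner l loop
+    subtracts the contributions of every previously solved 8-row block
+    (a small GEMM update, with i4 stepping 8 rows per block), and only
+    then is the diagonal 8x8 block solved by forward substitution and
+    the result transposed back and stored.
+ */
+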
mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); + + //Rearrange high elements +#if REARRANGE_SHFL == 1 + mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); + mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); +#else + mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); + mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); + mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); + mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); + mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); + mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); +#endif + + //Merge rearranged high elements into complete rows + mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); + mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); + mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); + mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); + /* transpose steps end */ + + //Broadcast A8,0 to A15,0 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + //i4 = k >> 3; + ptr_l_dup++; + +#if GEMM_ACCUM_A + //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); + mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); + mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); + mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); + mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); + mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); + 
mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); + mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); +#endif + //Broadcast A21 to A71 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,2 to A15,2 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d 
= c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,3 to A15,3 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = 
_mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,4 to A15,4 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,5 to A15,5 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,6 to A15,6 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], 
mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) +#endif + //Broadcast A8,7 to A15,7 to registers + mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); + mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); + mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); + mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); + mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); + mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); + mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); + mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); + ptr_l_dup++; +#if GEMM_ACCUM_A + //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) + mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#else + mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) + mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) + mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) + mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) + mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) + mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) + mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) + mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) +#endif + //end loop of cols + //} + //i2 += cs_b_offset[6]; + i4 += 8; + } + //trsm solve + + k = 0; + //for (i2 = 0; i2 < numCols_b; i2 += 8) + //{ + //i2 = i1 + r; + /////////////////// Complete Lower 8x8 block trsm of B 
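// For reference (not part of the patch): the l-loop above is the GEMM-like
// update that folds the already-solved block rows into the current 8 rows;
// the code that follows solves the diagonal 8x8 block by forward
// substitution. A scalar model of that substitution, with a hypothetical
// helper name; it assumes the unit-diagonal lower-triangular case this code
// path appears to handle (only sub-diagonal elements are broadcast, and no
// division ever occurs):

// After each row k of B is final, eliminate it from rows k+1..7.
static void trsm_lunit_8x8( const float a[8][8], float b[8][8] )
{
    for (int k = 0; k < 8; ++k)              // solved row (unit diagonal: no divide)
        for (int i = k + 1; i < 8; ++i)      // rows still to be solved
            for (int n = 0; n < 8; ++n)      // eight right-hand sides, one SIMD register wide
                b[i][n] -= a[i][k] * b[k][n];   // d = c - (a*b), cf. the fnmadd lines
}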
+            //trsm solve
+
+            k = 0;
+            //for (i2 = 0; i2 < numCols_b; i2 += 8)
+            //{
+                //i2 = i1 + r;
+                /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+#if !GEMM_ACCUM_A
+                //Read 8 columns of the block of B to be solved
+                mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i);
+                mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+                mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+                mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+                mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+                mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+                mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+                mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+                /* transpose steps start */
+                ////unpacklow////
+                mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+                mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+                mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+                mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+                //Rearrange low elements
+#if REARRANGE_SHFL == 1
+                mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+                mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+                mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+                mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+                mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+                mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+                mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+                mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+                mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+                mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+                //Merge rearranged low elements into complete rows
+                mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+                mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+                mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+                mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+                ////unpackhigh////
+                mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+                mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+                mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+                mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+                //Rearrange high elements
+#if REARRANGE_SHFL == 1
+                mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+                mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+                mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+                mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+                mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+                mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+                mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+                mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+                mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+                mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+                //Merge rearranged high elements into complete rows
+                mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+                mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+                mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+                mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+                /* transpose steps end */
+
+                mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg);
+                mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg);
+                mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg);
+                mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg);
+                mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg);
+                mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg);
+                mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg);
+                mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg);
+#endif
+                //Broadcast A10 to A70 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0]));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1]));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2]));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3]));
+                mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4]));
+                mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5]));
+                //i += cs_l;
+
+#if GEMM_ACCUM_A
+                //(Row0): already done
+
+#else
+                mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]);
+#endif
+
+#if GEMM_ACCUM_A
+                //(Row1): FNMA operations of solved row b0 with elements A(1,0) to A(7,0)
+                mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#else
+                mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]);
+                mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]);
+                mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]);
+                mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]);
+                mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]);
+                mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]);
+                mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]);
+
+                //(Row1): FNMA operations of solved row b0 with elements A(1,0) to A(7,0)
+                mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+#endif
+                //Broadcast A21 to A71 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0]));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1]));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2]));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3]));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4]));
+                mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5]));
+                //i += cs_l;
+
+                //(Row2): FNMA operations of solved row b1 with elements A(2,1) to A(7,1)
+                mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A32 to A72 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1]));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2]));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3]));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4]));
+                mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5]));
+                //i += cs_l;
+
+                //(Row3): FNMA operations of solved row b2 with elements A(3,2) to A(7,2)
+                mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A43 to A73 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2]));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3]));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4]));
+                mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5]));
+                //i += cs_l;
+
+                //(Row4): FNMA operations of solved row b3 with elements A(4,3) to A(7,3)
+                mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A54 to A74 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3]));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4]));
+                mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5]));
+                //i += cs_l;
+
+                //(Row5): FNMA operations of solved row b4 with elements A(5,4) to A(7,4)
+                mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A65 to A75 to registers
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4]));
+                mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5]));
+                //i += cs_l;
+
+                //(Row6): FNMA operations of solved row b5 with elements A(6,5) to A(7,5)
+                mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+                //Broadcast A76 to register
+                mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5]));
+
+                //(Row7): FNMA operation of solved row b6 with element A(7,6)
+                mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+                ////////////////////////////////////////////////////////////////////////////////
+
+                /* transpose steps start */
+                ////unpacklow////
+                mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+                mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+                mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+                mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+                //Rearrange low elements
+#if REARRANGE_SHFL == 1
+                mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+                mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+                mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+                mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+                mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+                mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+                mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+                mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+                mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+                mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+                //Merge rearranged low elements into complete rows
+                mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+                mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+                mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+                mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+                ////unpackhigh////
+                mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+                mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+                mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+                mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+                //Rearrange high elements
+#if REARRANGE_SHFL == 1
+                mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+                mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+                mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+                mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+                mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+                mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+                mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+                mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+                mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+                mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+                //Merge rearranged high elements into complete rows
+                mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+                mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+                mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+                mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+                /* transpose steps end */
+
+                //Store the computed B columns
+                _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b + i2), mat_b_col[1]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]);
+                _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]);
+                //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+                k++;
+            //}
+            i += cs_b_offset[6];
+            i2 += cs_b_offset[6];
+        }
+    } //numRows of A
+    ///////////////////loop ends /////////////////////
+}
+#endif
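A note on the GEMM_ACCUM_A compile-time switch used throughout the kernel
above: when it is set, B is scaled by alpha up front and each partial product
is subtracted as it is produced (fnmadd); when it is clear, plain products are
accumulated (the first with mul_ps, the rest with fmadd_ps) and subtracted
once from the alpha-scaled B at solve time (the sub_ps lines). A minimal
sketch of the equivalence the two paths rely on, with hypothetical helper
names that are not part of this patch:

    #include <immintrin.h>

    // Both paths compute alpha*b - sum_k a[k]*x[k] for one register of B.
    static inline __m256 gemm_accum_path1( __m256 b, __m256 alpha,
                                           const __m256 *a, const __m256 *x, int n )
    {
        __m256 acc = _mm256_mul_ps(b, alpha);            // alpha*B up front
        for (int k = 0; k < n; ++k)
            acc = _mm256_fnmadd_ps(a[k], x[k], acc);     // acc = acc - a[k]*x[k]
        return acc;
    }

    static inline __m256 gemm_accum_path2( __m256 b, __m256 alpha,
                                           const __m256 *a, const __m256 *x, int n )
    {
        __m256 acc = _mm256_setzero_ps();
        for (int k = 0; k < n; ++k)
            acc = _mm256_fmadd_ps(a[k], x[k], acc);      // acc = acc + a[k]*x[k]
        return _mm256_sub_ps(_mm256_mul_ps(b, alpha), acc); // one deferred subtraction
    }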
diff --git a/test/Makefile b/test/Makefile
index 799900b58..27d3eaa2c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,7 +5,7 @@
 # libraries.
 #
 # Copyright (C) 2014, The University of Texas at Austin
-# Copyright (C) 2017, Advanced Micro Devices, Inc.
+# Copyright (C) 2017 - 2018, Advanced Micro Devices, Inc.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are