From d56ca14589413fd4b14987262ab9de4533dbbbe8 Mon Sep 17 00:00:00 2001
From: sraut <Biplab.Raut@amd.com>
Date: Wed, 6 Jun 2018 11:24:33 +0530
Subject: [PATCH] small matrix trsm intrinsics optimization code for AX=B and
 XA'=B

Change-Id: I90123c4d9adbd314c867995cd19dc975150b448c
---
 frame/3/trsm/bli_trsm_front.c  |    8 +-
 kernels/zen/3/bli_trsm_small.c | 5423 +++++++++++++++++++++++++++++++-
 2 files changed, 5418 insertions(+), 13 deletions(-)

diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index a69f4b76c..0b6db8a5a 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -34,7 +34,7 @@
 */
 
 #include "blis.h"
-
+//#define PRINT_SMALL_TRSM_INFO
 void bli_trsm_front
      (
        side_t  side,
@@ -47,11 +47,15 @@ void bli_trsm_front
      )
 {
 	bli_init_once();
-
+	int i, j;
 	obj_t   a_local;
 	obj_t   b_local;
 	obj_t   c_local;
  
+int m = bli_obj_length(*b);
+int n = bli_obj_width(*b); 
+float *L =  a->buffer;
+  float *B =  b->buffer;
 
 #ifdef PRINT_SMALL_TRSM_INFO
         printf("Side:: %c\n", side ? 'R' : 'L');
diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c
index af84d0588..979c26dea 100644
--- a/kernels/zen/3/bli_trsm_small.c
+++ b/kernels/zen/3/bli_trsm_small.c
@@ -4,7 +4,11 @@ BLIS
 An object-based framework for developing high-performance BLAS-like
 libraries.
 
+<<<<<<< HEAD
 Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+=======
+Copyright (C) 2018, Advanced Micro Devices, Inc.
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -26,15 +30,24 @@ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+<<<<<<< HEAD
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+=======
+THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
+<<<<<<< HEAD
+=======
+//#define BLIS_ENABLE_SMALL_MATRIX_TRSM
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 
 #include "blis.h"
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
 #include "immintrin.h"
+<<<<<<< HEAD
 #define GEMM_BLK_V1 8            //Block size to perform gemm and apply trsm
 #define GEMM_ACCUM_A 1            //Peform B1=B1-(B0*A0) operation instead of B1'=(B0*A0) and then B1=B1-B1'
 #define OPT_CACHE_BLOCKING_L1 1 //Perform trsm block-wise in blocks of GEMM_BLK_V1 instead of all columns of B together.
@@ -341,6 +354,91 @@ static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l,
                                     int cs_b,
                                     float alpha);
 
+=======
+
+static void (*fp_blis_strsm_microkernel)( float *ptr_l,
+		                            float *ptr_b,
+									int numRows_lb,
+									int numCols_b,
+									int rs_l,
+									int rs_b,
+									int cs_l,
+									int cs_b
+								);
+static void blis_strsm_microkernel( float *ptr_l,
+		                            float *ptr_b,
+									int numRows_lb,
+									int numCols_b,
+									int rs_l,
+									int rs_b,
+									int cs_l,
+									int cs_b
+								);
+static void blis_strsm_microkernel_alpha( float *ptr_l,
+		                            float *ptr_b,
+									int numRows_lb,
+									int numCols_b,
+									int rs_l,
+									int rs_b,
+									int cs_l,
+									int cs_b,
+									float alphaVal
+								);
+static void blis_strsm_microkernel_unitDiag( float *ptr_l,
+		                            float *ptr_b,
+									int numRows_lb,
+									int numCols_b,
+									int rs_l,
+									int rs_b,
+									int cs_l,
+									int cs_b
+								);
+static void blis_strsm_microkernel_alpha_unitDiag( float *ptr_l,
+		                            float *ptr_b,
+									int numRows_lb,
+									int numCols_b,
+									int rs_l,
+									int rs_b,
+									int cs_l,
+									int cs_b,
+									float alphaVal
+								);
+static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, 
+								  float *ptr_b, 
+								  int numRows_lb, 
+								  int numCols_b, 
+								  int rs_l, 
+								  int rs_b,
+								  int cs_l, 
+								  int cs_b);
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, 
+								  float *ptr_b, 
+								  int numRows_lb, 
+								  int numCols_b, 
+								  int rs_l, 
+								  int rs_b,
+								  int cs_l, 
+								  int cs_b,
+								  float alphaVal);
+static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, 
+								  float *ptr_b, 
+								  int numRows_lb, 
+								  int numCols_b, 
+								  int rs_l, 
+								  int rs_b,
+								  int cs_l, 
+								  int cs_b);
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, 
+								  float *ptr_b, 
+								  int numRows_lb, 
+								  int numCols_b, 
+								  int rs_l, 
+								  int rs_b,
+								  int cs_l, 
+								  int cs_b,
+								  float alphaVal);
+								  
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 //AX = B;  A is lower triangular; No transpose; single precision
 static err_t bli_strsm_small_AlXB
      (
@@ -351,6 +449,21 @@ static err_t bli_strsm_small_AlXB
        cntx_t* cntx,
        cntl_t* cntl
      );
+<<<<<<< HEAD
+=======
+
+//AX = B;  A is lower triangular; No transpose; double precision
+static err_t bli_dtrsm_small_AlXB
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
+
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 //A.'X = B;  A is upper triangular; A has to be transposed; single precision
 static err_t bli_strsm_small_AutXB
      (
@@ -362,17 +475,8 @@ static err_t bli_strsm_small_AutXB
        cntl_t* cntl
      );
 
-//XA.' = B;  A is lower triangular; A has to be transposed; single precision
-static err_t bli_strsm_small_XAltB
-     (
-       side_t  side,
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
-
+<<<<<<< HEAD
+=======
 //A.'X = B;  A is upper triangular; A has to be transposed; double precision
 static err_t bli_dtrsm_small_AutXB
      (
@@ -384,6 +488,38 @@ static err_t bli_dtrsm_small_AutXB
        cntl_t* cntl
      );
 
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
+//XA.' = B;  A is lower triangular; A has to be transposed; single precision
+static err_t bli_strsm_small_XAltB
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
+
+<<<<<<< HEAD
+//A.'X = B;  A is upper triangular; A has to be transposed; double precision
+static err_t bli_dtrsm_small_AutXB
+=======
+//XA.' = B;  A is lower triangular; A has to be transposed; double precision
+static err_t bli_dtrsm_small_XAltB
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
+     (
+       side_t  side,
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
+<<<<<<< HEAD
+
+=======
+	 void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 /*
 * The bli_trsm_small implements unpacked version of TRSM 
 * Currently only column-major is supported, A & B are column-major
@@ -407,6 +543,7 @@ err_t bli_trsm_small
     return BLIS_NOT_YET_IMPLEMENTED;
 #endif
 
+<<<<<<< HEAD
     dim_t m = bli_obj_length(b);
     dim_t n = bli_obj_width(b);
 
@@ -414,6 +551,8 @@ err_t bli_trsm_small
         return BLIS_SUCCESS;
 
 
+=======
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
     // If alpha is zero, B matrix will become zero after scaling & hence solution is also zero matrix 
     if (bli_obj_equals(alpha, &BLIS_ZERO))
     {
@@ -422,8 +561,13 @@ err_t bli_trsm_small
     // We have to call matrix scaling if alpha != 1.0
     
     // if row major format return. Check this again.
+<<<<<<< HEAD
     if ((bli_obj_row_stride(a) != 1) ||
         (bli_obj_row_stride(b) != 1))
+=======
+    if ((bli_obj_row_stride(*a) != 1) ||
+        (bli_obj_row_stride(*b) != 1))
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
     {
         return BLIS_INVALID_ROW_STRIDE;
     }
@@ -433,6 +577,7 @@ err_t bli_trsm_small
     // only float and double datatypes are supported as of now.
     if (dt != BLIS_DOUBLE && dt != BLIS_FLOAT)
     {
+<<<<<<< HEAD
     return BLIS_EXPECTED_REAL_DATATYPE;
     }
 
@@ -440,12 +585,22 @@ err_t bli_trsm_small
     if (!bli_obj_is_upper_or_lower (a))
     {
     return BLIS_EXPECTED_TRIANGULAR_OBJECT;
+=======
+	return BLIS_EXPECTED_REAL_DATATYPE;
+    }
+
+    // A is expected to be triangular in trsm
+    if (!bli_obj_is_upper_or_lower (*a))
+    {
+	return BLIS_EXPECTED_TRIANGULAR_OBJECT;
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
     }
 
     // can use other control structs - even can use array of function pointers,
     // indexed by a number with bits formed by f('side', 'uplo', 'transa', dt).
     // In the below implementation, based on the number of finally implemented
     // cases, can move the checks with more cases higher up.
+<<<<<<< HEAD
 
 	if(side == BLIS_LEFT)
 	{
@@ -509,10 +664,45 @@ err_t bli_trsm_small
 
 			}
 
+=======
+    if (side == BLIS_LEFT)
+    {
+	if (bli_obj_has_trans(*a))
+	{
+		if (dt == BLIS_DOUBLE)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				//A.'X = B;  A is upper triangular; A has to be transposed; double precision
+#if 0 // planning to implement this in this iteration
+				return bli_dtrsm_small_AutXB(side, alpha, a, b, cntx, cntl);
+#else
+				return BLIS_NOT_YET_IMPLEMENTED;
+#endif
+			}
+			else
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+		}
+		else if (dt == BLIS_FLOAT)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				//A.'X = B;  A is upper triangular; A has to be transposed; single precision
+				//return bli_strsm_small_AutXB(side, alpha, a, b, cntx, cntl);
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+			else
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B
 		}
 	}
 	else
 	{
+<<<<<<< HEAD
 		if(bli_obj_has_trans(a))
 		{
 			if(dt == BLIS_DOUBLE)
@@ -25099,3 +25289,5214 @@ static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l,
     ///////////////////loop ends /////////////////////
 }
 #endif
+=======
+		if (dt == BLIS_DOUBLE)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+			else
+			{
+				//AX = B;  A is lower triangular; No transpose; double precision
+			  //return bli_dtrsm_small_AlXB(side, alpha, a, b, cntx, cntl);
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+		}
+		else if (dt == BLIS_FLOAT)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+			else
+			{
+				//AX = B;  A is lower triangular; No transpose; single precision
+				return bli_strsm_small_AlXB(side, alpha, a, b, cntx, cntl);
+			}
+		}
+	}
+    }
+    else
+    {
+	if (bli_obj_has_trans(*a))
+	{
+		if (dt == BLIS_DOUBLE)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+			else
+			{
+				//XA.' = B;  A is lower triangular; A has to be transposed; double precision
+#if 0 // planning to implement this in this iteration
+				return bli_dtrsm_small_XAltB(side, alpha, a, b, cntx, cntl);
+#else
+				return BLIS_NOT_YET_IMPLEMENTED;
+#endif
+			}
+		}
+		else if (dt == BLIS_FLOAT)
+		{
+			if (bli_obj_is_upper(*a))
+			{
+				return BLIS_NOT_YET_IMPLEMENTED;
+			}
+			else
+			{
+				//XA.' = B;  A is lower triangular; A has to be transposed; single precision
+				return bli_strsm_small_XAltB(side, alpha, a, b, cntx, cntl);
+			}
+		}
+	}
+	else
+	{
+		return BLIS_NOT_YET_IMPLEMENTED;
+	}
+    }
+
+    return BLIS_NOT_YET_IMPLEMENTED;
+};
+
+
+
+
+/*
+ * AX = alpha*B, Double precision, A: lower triangular
+ */
+static err_t bli_dtrsm_small_AlXB (
+                                    side_t  side,
+                                    obj_t*  alpha,
+                                    obj_t*  a,
+                                    obj_t*  b,
+                                    cntx_t* cntx,
+                                    cntl_t* cntl
+				                  )
+{
+
+  int M = bli_obj_length(*b); // number of rows of matrix B
+  int N = bli_obj_width(*b);  // number of columns of matrix B
+
+  int lda = bli_obj_col_stride(*a); // column stride of A
+  int ldb = bli_obj_col_stride(*b); // column stride of B
+
+  int i;
+  int j;
+  int k;
+
+  double *A =  a->buffer;
+  double *B =  b->buffer;
+
+  // Need to incorporate alpha
+
+  #if 0
+  
+  for (k = 0; k < M; k++)
+    {
+      double lkk_inv = 1.0/A[k+k*lda];
+
+      for (j = 0; j < N; j++)
+	{
+	  B[k + j*ldb] *= lkk_inv;
+	}
+      for (i = k+1; i < M; i++)
+	{
+	  for (j = 0; j < N; j++)
+	    {
+	      B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb];
+	    }
+	}
+    }// k -loop
+  #else
+  for (k = 0; k < M; k++)
+    {
+      double lkk_inv = 1.0/A[k+k*lda];
+
+      for (j = 0; j < N; j++)
+	{
+	  B[k + j*ldb] *= lkk_inv;
+	
+      // for (j = 0; j < N; j++)	
+	  for (i = k+1; i < M; i++)
+	    {
+	      B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb];
+	    }
+	}
+    }// k -loop
+  
+  #endif
+
+  return BLIS_SUCCESS;
+}// end of function
+
+
+
+static void trsm_small_AlXB (
+			      float *A,
+			      float *B,
+			      int M,
+			      int N,
+			      int lda,
+			      int ldb
+			    )			                                  
+{
+  int i;
+  int j;
+  int k;
+
+  // Need to incorporate alpha
+
+  for (k = 0; k < M; k++)
+    {
+      float lkk_inv = 1.0/A[k+k*lda];
+
+      for (j = 0; j < N; j++)
+	{
+	  B[k + j*ldb] *= lkk_inv;
+      
+	  for (i = k+1; i < M; i++)
+	    {
+	      B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb];
+	    }
+	}
+    }// k -loop
+
+}// end of function
+
+
+// Test code:
+void gemm_small( float *ptr_l,
+		 float *ptr_b,
+		 int blk_m,
+		 int blk_n,
+		 float *ptr_gemmOut,
+		 int cs_l,
+		 int cs_b,
+		 int rs_l,
+		 int rs_b,
+		 float alpha,
+		 float beta)
+{
+  int i, j, k;
+ 
+  for (i = 0; i < blk_m; i++)
+    {
+      for (j = 0; j < blk_n; j++)
+	{
+	  float t = 0.0;
+	  for (k = 0; k < blk_m; k++)
+	    {
+	      t += (ptr_l[i*rs_l + k* cs_l] * ptr_b[k*rs_b + j*cs_b]);	     
+	    }
+	  ptr_gemmOut[i*rs_b + j*cs_b] = beta * ptr_gemmOut[i*rs_b + j*cs_b] + alpha * t;
+	}
+    }
+}
+
+
+
+
+/*
+ * AX = Alpha*B, Single precision, A: lower triangular
+ */
+static err_t bli_strsm_small_AlXB (
+                                    side_t  side,
+                                    obj_t*  AlphaObj,
+                                    obj_t*  a,
+                                    obj_t*  b,
+                                    cntx_t* cntx,
+                                    cntl_t* cntl
+				 )
+{
+  obj_t alpha, beta; // gemm parameters
+  obj_t Ga, Gb, Gc;  // for GEMM
+  int m = bli_obj_length(*b); // number of rows of matrix B
+  int n = bli_obj_width(*b);  // number of columns of matrix B
+
+  int lda = bli_obj_col_stride(*a); // column stride of A
+  int ldb = bli_obj_col_stride(*b); // column stride of B
+
+  int rsa = bli_obj_row_stride(*a); // row stride of A
+  int rsb = bli_obj_row_stride(*b); // row stride of B
+
+  int i = 0;
+  int j;
+  int blk_size = 8;
+  int isUnitDiag = bli_obj_has_unit_diag(*a);
+
+  float alphaVal;
+  float *L =  a->buffer;
+  float *B =  b->buffer;
+
+  if (m != 16 || (n%8) != 0)
+  {
+	return BLIS_NOT_YET_IMPLEMENTED;
+  }
+  if ( (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM )
+  {
+  	return BLIS_NOT_YET_IMPLEMENTED;
+  }
+
+  alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, *AlphaObj));
+
+  /* Small _GEMM preparation code */
+  bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &alpha );
+  bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &beta );
+
+  /* B = B - A*B */
+  bli_setsc(  -(1.0), 0.0, &alpha );
+  bli_setsc( (1.0), 0.0, &beta );
+
+ 
+  bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, blk_size, a->buffer, rsa, lda, &Ga);
+  bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gb);
+  bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gc);
+
+  bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Ga );
+  bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Gb );
+  bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, Gc );
+
+  //first block of trsm
+  Gb.buffer = (void*)(B + i);
+  
+  //trsm of first 8xn block
+  if (alphaVal != 1)
+  {
+	  if (isUnitDiag == 0)
+	  {
+			blis_strsm_microkernel_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal);
+			fp_blis_strsm_microkernel = blis_strsm_microkernel;
+	  }
+	  else
+	  {
+		    blis_strsm_microkernel_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal);
+			fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag;
+	  }
+      bli_setsc( alphaVal, 0.0, &beta );
+  }
+  else
+  {
+	  if (isUnitDiag == 0)
+	  {
+			blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb);
+			fp_blis_strsm_microkernel = blis_strsm_microkernel;
+	  }
+	  else
+	  {
+		   	blis_strsm_microkernel_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb);
+		  	fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag;
+	  }
+  }
+
+  //gemm update
+  for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT
+  {
+      Ga.buffer = (void*)(L + j + i*lda);
+      Gc.buffer = (void*)(B + j);
+
+      bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb
+  }
+
+  //trsm of remaining blocks
+  for (i = blk_size; i < m; i += blk_size)
+  {
+	  Gb.buffer = (void*)(B + i);
+
+	  fp_blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb);
+
+	  for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT
+	  {
+	      Ga.buffer = (void*)(L + j + i*lda);
+	      Gc.buffer = (void*)(B + j);
+
+	      bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb
+	  }
+
+  } // End of for loop - i
+
+  return BLIS_SUCCESS;
+}
+
+void trsm_block_c(float *ptr_l, float *ptr_b, int blk_height, int blk_width, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+	int i, j, k, l;
+	float inv_l;
+
+	inv_l = 1.0 / *ptr_l;
+
+	for (j = 0; j < numCols_b; j += blk_width)
+	{
+		for (l = j; l < (j+blk_width); l++)
+		{
+			ptr_b[l*cs_b] = ptr_b[l*cs_b] * inv_l;
+		}
+
+		for (i = 1; i < blk_height; i++)
+		{
+			for (l = j; l < (j+blk_width); l++)
+			{
+				for (k = 0; k < i; k++)
+				{
+					ptr_b[i*rs_b + l*cs_b] -= (ptr_b[k*rs_b + l*cs_b] * ptr_l[i*rs_l + k*cs_l]);
+				}
+				ptr_b[i*rs_b + l*cs_b] = ptr_b[i*rs_b + l*cs_b] / ptr_l[i*rs_l + i*cs_l];
+			}
+		}
+	}
+}
+
+/*
+ * XA' = Alpha*B, Single precision, A: lower triangular
+ */
+static err_t bli_strsm_small_XAltB(
+                                    side_t  side,
+                                    obj_t*  AlphaObj,
+                                    obj_t*  a,
+                                    obj_t*  b,
+                                    cntx_t* cntx,
+                                    cntl_t* cntl
+				 )
+{
+  int m = bli_obj_length(*a); // number of rows of matrix B
+  int n = bli_obj_length(*b);  // number of columns of matrix B
+
+  int lda = bli_obj_col_stride(*a); // column stride of A
+  int ldb = bli_obj_col_stride(*b); // column stride of B
+
+  int rsa = bli_obj_row_stride(*a); // row stride of A
+  int rsb = bli_obj_row_stride(*b); // row stride of B
+
+  int i = 0;
+  int isUnitDiag = bli_obj_has_unit_diag(*a);
+
+  float alphaVal;
+  float *L =  a->buffer;
+  float *B =  b->buffer;
+ 
+  if ((m%8) != 0 || (n%8) != 0)
+  {
+	return BLIS_NOT_YET_IMPLEMENTED;
+  }
+  if ( (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM )
+  {
+  	return BLIS_NOT_YET_IMPLEMENTED;
+  }
+
+  alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, *AlphaObj));
+ 
+  if (alphaVal != 1)
+  {
+	  if (isUnitDiag == 0)
+	  {
+			trsm_XAtB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal);
+	  }
+	  else
+	  {
+			trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal);
+	  }
+  }
+  else
+  {
+	  if (isUnitDiag == 0)
+	  {
+			trsm_XAtB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb);
+	  }
+	  else
+	  {
+		  	trsm_XAtB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb);
+	  }
+  }
+  return BLIS_SUCCESS;
+}
+
+static void blis_strsm_microkernel_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal)
+{
+	float ones = 1.0;
+	int j;
+	int cs_b_offset[6];
+	//int row2, row4, row6;
+	float *ptr_b_dup;
+
+	//70 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[8];
+	__m256 mat_a_cols[8];
+	__m256 mat_a_cols_rearr[36];
+	__m256 mat_a_diag_inv[8];
+	__m256 reciprocal_diags;
+	__m256 alphaReg;
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+	reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+	alphaReg = _mm256_broadcast_ss((float const *)&alphaVal);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+	mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+	//_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+	//row2 = (cs_l << 1);
+	//row4 = (cs_l << 2);
+	mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+	//_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+	mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+	//_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+	mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+	//_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+	//row6 = row2 + row4;
+	mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+	//_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+	mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+	//_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+	mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+	//_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+	mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+	//_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+	//read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width  for L
+	/*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+	//Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+	//tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
+	//mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+	//1st col
+	mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+	mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+	mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+	mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+	mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+	mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+	mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+	mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+	//2nd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//3rd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//4rth col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//5th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//6th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	numCols_b -= 8; // blk_width = 8
+
+	//compute reciprocals of L(i,i) and broadcast in registers
+	mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+	mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+	mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+	mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+	//mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+	//mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);
+	mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+	//reciprocal of diagnol elements
+	reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		ptr_b_dup = ptr_b;
+
+		/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+		////unpacklow////
+		mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+		mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+		mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+		mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+		
+		////unpackhigh////
+		mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+		mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+		mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+		mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+		mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+		mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+		//extract diag a00 from a
+		mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+		//Merge rearranged high elements into complete rows
+		mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+		mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+		mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+		mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+		mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+		mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+		mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+		mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+
+		//extract diag a11 from a
+		mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+		//extract diag a22 from a
+		mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+		//extract diag a33 from a
+		mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+		//extract diag a44 from a
+		mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+		//extract diag a55 from a
+		mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+		//extract diag a66 from a
+		mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+		//--> Transpose and store results of columns of B block <--//
+		////unpacklow////
+		mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+		mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+		mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+		mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+		mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+		mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+		mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+		mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+		mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+		////unpackhigh////
+		mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+		//Merge rearranged high elements into complete rows
+		mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		//Read next set of B columns
+		ptr_b += (cs_b + cs_b_offset[5]);
+		mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+		mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+		mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+		mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+		mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+		mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+		mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+		mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+
+	//end loop of cols
+	}
+
+	//Last block trsm processing
+	ptr_b_dup = ptr_b;
+
+	/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+	////unpacklow////
+	mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+	
+	mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+	mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+	mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+	mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+	
+	////unpackhigh////
+	mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+	mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+	mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+	mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+	mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+	mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+	//extract diag a00 from a
+	mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+	//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+	mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+	//Merge rearranged high elements into complete rows
+	mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+	mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+	mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+	mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+	mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+	mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+	mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+	mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+
+	//extract diag a11 from a
+	mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+	//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+	mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+	mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+	//extract diag a22 from a
+	mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+	//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+	mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+	//extract diag a33 from a
+	mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+	//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+	mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+	//extract diag a44 from a
+	mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+	//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+	mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+	//extract diag a55 from a
+	mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+	//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+	mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+	//extract diag a66 from a
+	mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+	//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+	mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+	//extract diag a77 from a
+	mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+	//(Row7): FMA operations of b7 with elements of index (7, 0)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+	mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+	//--> Transpose and store results of columns of B block <--//
+	////unpacklow////
+	mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+	mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+	mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+	mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+	mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+	mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+	mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+	mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+	mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+	////unpackhigh////
+	mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+	//Merge rearranged high elements into complete rows
+	mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+	//Store the computed B columns
+	_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+
+	//end loop of cols
+}
+
+static void blis_strsm_microkernel_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal)
+{
+	//float ones = 1.0;
+	int j;
+	int cs_b_offset[6];
+	//int row2, row4, row6;
+	float *ptr_b_dup;
+
+	//70 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[8];
+	__m256 mat_a_cols[8];
+	__m256 mat_a_cols_rearr[36];
+	//__m256 mat_a_diag_inv[8];
+	//__m256 reciprocal_diags;
+	__m256 alphaReg;
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+	//reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+	alphaReg = _mm256_broadcast_ss((float const *)&alphaVal);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+	mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+	//_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+	//row2 = (cs_l << 1);
+	//row4 = (cs_l << 2);
+	mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+	//_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+	mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+	//_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+	mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+	//_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+	//row6 = row2 + row4;
+	mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+	//_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+	mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+	//_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+	mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+	//_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+	mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+	//_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+	//read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width  for L
+	/*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+	//Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+	//tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
+	//mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+	//1st col
+	mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+	mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+	mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+	mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+	mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+	mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+	mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+	mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+	//2nd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//3rd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//4rth col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//5th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//6th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//8th col
+	//ptr_l += cs_l;
+	//mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	numCols_b -= 8; // blk_width = 8
+
+	//compute reciprocals of L(i,i) and broadcast in registers
+	//mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+	//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+	//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+	//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+	//mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+	//mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+	//mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);
+	//mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+	//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+	//reciprocal of diagnol elements
+	//reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		ptr_b_dup = ptr_b;
+
+		/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+		////unpacklow////
+		mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+		mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+		mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+		mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+		
+		////unpackhigh////
+		mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+		mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+		mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+		mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+		mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+		mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+		//extract diag a00 from a
+		//mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		//mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+		//Merge rearranged high elements into complete rows
+		mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+		mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+		mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+		mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+		mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+		mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+		mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+		mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+
+		//extract diag a11 from a
+		//mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		//mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		//mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+		//extract diag a22 from a
+		//mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		//mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		//mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+		//extract diag a33 from a
+		//mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		//mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		//mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+		//extract diag a44 from a
+		//mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		//mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		//mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+		//extract diag a55 from a
+		//mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		//mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		//mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+		//extract diag a66 from a
+		//mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		//mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		//mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		//mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		//mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		//mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+		//--> Transpose and store results of columns of B block <--//
+		////unpacklow////
+		mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+		mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+		mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+		mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+		mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+		mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+		mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+		mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+		mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+		////unpackhigh////
+		mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+		//Merge rearranged high elements into complete rows
+		mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		//Read next set of B columns
+		ptr_b += (cs_b + cs_b_offset[5]);
+		mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+		mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+		mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+		mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+		mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+		mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+		mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+		mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+
+	//end loop of cols
+	}
+
+	//Last block trsm processing
+	ptr_b_dup = ptr_b;
+
+	/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+	////unpacklow////
+	mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+	
+	mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg);
+	mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg);
+	mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg);
+	mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg);
+	
+	////unpackhigh////
+	mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+	mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+	mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+	mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+	mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+	mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+	//extract diag a00 from a
+	//mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+	//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+	//mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+	//Merge rearranged high elements into complete rows
+	mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+	mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+	mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+	mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+	mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg);
+	mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg);
+	mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg);
+	mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg);
+
+	//extract diag a11 from a
+	//mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	//mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+	//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+	mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+	//mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+	//extract diag a22 from a
+	//mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	//mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+	//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+	//mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+	//extract diag a33 from a
+	//mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	//mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+	//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+	//mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+	//extract diag a44 from a
+	//mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	//mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+	//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+	//mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+	//extract diag a55 from a
+	//mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	//mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+	//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+	//mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+	//extract diag a66 from a
+	//mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	//mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+	//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+	//mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+	//extract diag a77 from a
+	//mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	//mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+	//(Row7): FMA operations of b7 with elements of index (7, 0)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+	//mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+	//--> Transpose and store results of columns of B block <--//
+	////unpacklow////
+	mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+	mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+	mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+	mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+	mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+	mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+	mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+	mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+	mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+	////unpackhigh////
+	mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+	//Merge rearranged high elements into complete rows
+	mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+	//Store the computed B columns
+	_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+
+	//end loop of cols
+}
+
+static void blis_strsm_microkernel_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+	//float ones = 1.0;
+	int j;
+	int cs_b_offset[6];
+	//int row2, row4, row6;
+	float *ptr_b_dup;
+
+	//70 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[8];
+	__m256 mat_a_cols[8];
+	__m256 mat_a_cols_rearr[36];
+	//__m256 mat_a_diag_inv[8];
+	//__m256 reciprocal_diags;
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+	//reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+	mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+	//_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+	//row2 = (cs_l << 1);
+	//row4 = (cs_l << 2);
+	mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+	//_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+	mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+	//_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+	mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+	//_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+	//row6 = row2 + row4;
+	mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+	//_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+	mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+	//_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+	mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+	//_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+	mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+	//_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+	//read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width  for L
+	/*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+	//Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+	//tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
+	//mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+	//1st col
+	mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+	mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+	mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+	mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+	mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+	mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+	mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+	mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+	//2nd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//3rd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//4rth col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//5th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//6th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//8th col
+	//ptr_l += cs_l;
+	//mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	numCols_b -= 8; // blk_width = 8
+
+	//compute reciprocals of L(i,i) and broadcast in registers
+	//mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+	//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+	//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+	//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+	//mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+	//mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+	//mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);
+	//mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+	//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+	//reciprocal of diagnol elements
+	//reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		ptr_b_dup = ptr_b;
+
+		/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+		////unpacklow////
+		mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		////unpackhigh////
+		mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+		mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+		mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+		mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+		mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+		mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+		//extract diag a00 from a
+		//mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		//mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+		//Merge rearranged high elements into complete rows
+		mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+		mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+		mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+		mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+		//extract diag a11 from a
+		//mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		//mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		//mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+		//extract diag a22 from a
+		//mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		//mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		//mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+		//extract diag a33 from a
+		//mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		//mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		//mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+		//extract diag a44 from a
+		//mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		//mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		//mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+		//extract diag a55 from a
+		//mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		//mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		//mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+		//extract diag a66 from a
+		//mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		//mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		//mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		//mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		//mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		//mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+		//--> Transpose and store results of columns of B block <--//
+		////unpacklow////
+		mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+		mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+		mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+		mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+		mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+		mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+		mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+		mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+		mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+		////unpackhigh////
+		mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+		//Merge rearranged high elements into complete rows
+		mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		//Read next set of B columns
+		ptr_b += (cs_b + cs_b_offset[5]);
+		mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+		mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+		mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+		mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+		mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+		mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+		mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+		mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+	//end loop of cols
+	}
+
+	//Last block trsm processing
+	ptr_b_dup = ptr_b;
+
+	/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+	////unpacklow////
+	mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+	
+	////unpackhigh////
+	mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+	mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+	mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+	mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+	mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+	mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+	//extract diag a00 from a
+	//mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	//mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+	//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+	//mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+	//Merge rearranged high elements into complete rows
+	mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+	mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+	mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+	mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+	//extract diag a11 from a
+	//mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	//mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+	//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+	mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+	//mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+	//extract diag a22 from a
+	//mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	//mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+	//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+	//mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+	//extract diag a33 from a
+	//mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	//mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+	//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+	//mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+	//extract diag a44 from a
+	//mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	//mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+	//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+	//mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+	//extract diag a55 from a
+	//mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	//mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+	//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+	//mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+	//extract diag a66 from a
+	//mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	//mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+	//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+	//mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+	//extract diag a77 from a
+	//mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	//mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+	//(Row7): FMA operations of b7 with elements of index (7, 0)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+	//mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+	//--> Transpose and store results of columns of B block <--//
+	////unpacklow////
+	mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+	mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+	mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+	mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+	mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+	mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+	mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+	mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+	mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+	////unpackhigh////
+	mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+	//Merge rearranged high elements into complete rows
+	mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+	//Store the computed B columns
+	_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+	//end loop of cols
+}
+
+static void blis_strsm_microkernel(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+	float ones = 1.0;
+	int j;
+	int cs_b_offset[6];
+	//int row2, row4, row6;
+	float *ptr_b_dup;
+
+	//70 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[8];
+	__m256 mat_a_cols[8];
+	__m256 mat_a_cols_rearr[36];
+	__m256 mat_a_diag_inv[8];
+	__m256 reciprocal_diags;
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+	reciprocal_diags = _mm256_broadcast_ss((float const *)&ones);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B
+	mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+	//_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0);
+	//row2 = (cs_l << 1);
+	//row4 = (cs_l << 2);
+	mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+	//_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0);
+	mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+	//_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0);
+	mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+	//_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0);
+	//row6 = row2 + row4;
+	mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+	//_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0);
+	mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+	//_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0);
+	mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+	//_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0);
+	mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+	//_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0);
+
+	//reciprocal_diags = _mm256_loadu_ps((float const *)ones);
+
+	//read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width  for L
+	/*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l);
+	ptr_l += cs_l;
+	mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/
+
+	//Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers
+	//tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually.
+	//mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]);
+	//1st col
+	mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0));
+	mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1));
+	mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2));
+	mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3));
+	mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4));
+	mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5));
+	mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6));
+	mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7));
+	//2nd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//3rd col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//4rth col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//5th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//6th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+	//7th col
+	ptr_l += cs_l;
+	mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	numCols_b -= 8; // blk_width = 8
+
+	//compute reciprocals of L(i,i) and broadcast in registers
+	mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]);
+	mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]);
+	mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]);
+	mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]);
+
+	//mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55);
+	//mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55);
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);
+	mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20);
+
+	//reciprocal of diagnol elements
+	reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]);
+
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		ptr_b_dup = ptr_b;
+
+		/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+		////unpacklow////
+		mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		////unpackhigh////
+		mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+		mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+		mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+		mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+		mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+		mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+		mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+		mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+		mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+		mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+		mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+		//extract diag a00 from a
+		mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+		//Merge rearranged high elements into complete rows
+		mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+		mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+		mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+		mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+		//extract diag a11 from a
+		mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+		//extract diag a22 from a
+		mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+		//extract diag a33 from a
+		mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+		//extract diag a44 from a
+		mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+		mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+		//extract diag a55 from a
+		mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+		mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+		//extract diag a66 from a
+		mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+		mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+		mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+		//--> Transpose and store results of columns of B block <--//
+		////unpacklow////
+		mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange low elements
+#if REARRANGE_SHFL == 1
+		mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+		mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+		mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+		mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+		mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+		mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+		mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+		mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+		//Merge rearranged low elements into complete rows
+		mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+		mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+		mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+		mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+		////unpackhigh////
+		mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+		mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+		mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+		mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+		//Rearrange high elements
+#if REARRANGE_SHFL == 1
+		mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+		mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+		mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+		mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+		mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+		mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+		mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+		mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+		//Merge rearranged high elements into complete rows
+		mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+		mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+		mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+		mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+		//Read next set of B columns
+		ptr_b += (cs_b + cs_b_offset[5]);
+		mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b);
+		mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b)));
+		mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0]));
+		mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1]));
+		mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2]));
+		mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3]));
+		mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4]));
+		mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5]));
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+	//end loop of cols
+	}
+
+	//Last block trsm processing
+	ptr_b_dup = ptr_b;
+
+	/*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/
+
+	////unpacklow////
+	mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+	
+	////unpackhigh////
+	mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]);
+	mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]);
+	mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]);
+	mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44);
+	mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE);
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE);
+#else
+	mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E);
+	mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E);
+	mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC);
+	mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33);
+	mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC);
+	mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33);
+#endif
+
+	//extract diag a00 from a
+	mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+
+	//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+	mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]);
+
+	//Merge rearranged high elements into complete rows
+	mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20);
+	mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31);
+	mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20);
+	mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31);
+
+	//extract diag a11 from a
+	mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+
+	//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+	mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+	mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]);
+
+	//extract diag a22 from a
+	mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+
+	//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+	mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+	mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]);
+
+	//extract diag a33 from a
+	mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+
+	//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+	mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+	mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]);
+
+	//extract diag a44 from a
+	mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00);
+	mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+
+	//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+	mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+	mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]);
+
+	//extract diag a55 from a
+	mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55);
+	mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+
+	//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+	mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+	mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]);
+
+	//extract diag a66 from a
+	mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA);
+	mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+
+	//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+	mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+	mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]);
+
+	//extract diag a77 from a
+	mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF);
+	mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+
+	//(Row7): FMA operations of b7 with elements of index (7, 0)
+	mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b)
+
+	//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+	mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]);
+
+	//--> Transpose and store results of columns of B block <--//
+	////unpacklow////
+	mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange low elements
+#if REARRANGE_SHFL == 1
+	mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44);
+	mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE);
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE);
+#else
+	mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E);
+	mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E);
+	mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC);
+	mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33);
+	mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC);
+	mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33);
+#endif
+	//Merge rearranged low elements into complete rows
+	mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20);
+	mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31);
+	mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20);
+	mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31);
+
+	////unpackhigh////
+	mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]);
+	mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]);
+	mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]);
+	mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]);
+
+	//Rearrange high elements
+#if REARRANGE_SHFL == 1
+	mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44);
+	mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE);
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE);
+#else
+	mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E);
+	mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E);
+	mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC);
+	mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33);
+	mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC);
+	mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33);
+#endif
+
+	//Merge rearranged high elements into complete rows
+	mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20);
+	mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31);
+	mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20);
+	mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31);
+
+	//Store the computed B columns
+	_mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]);
+	_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]);
+	//end loop of cols
+}
+
+///////////////////////////////////// XA'=B functions ////////////////////////////////
+
+static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+	float ones = 1.0;
+	int i, i1, i2, i3, i4, j, k, l;
+	int cs_b_offset[7];
+	int cs_l_offset[7];
+	float *ptr_b_dup;
+
+	//57 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[16][8];
+	__m256 mat_a_cols_rearr[8];
+	__m256 mat_a_blk_elems[64];
+	__m256 mat_a_diag_inv[8];
+	__m256 reciprocal_diags[2];
+
+	reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//L matrix offsets
+	cs_l_offset[0] = (cs_l << 1);
+	cs_l_offset[1] = cs_l + cs_l_offset[0];
+	cs_l_offset[2] = (cs_l << 2);
+	cs_l_offset[3] = cs_l + cs_l_offset[2];
+	cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+	cs_l_offset[5] = cs_l + cs_l_offset[4];
+	cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+	//read diag elems of L 16x16 block
+	mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l);
+	mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+	mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+	mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+	mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+	mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+	mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+	mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+	cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+	reciprocal_diags[1] = reciprocal_diags[0];
+
+	//pack first 8 diags together
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1
+	mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3
+	mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5
+	mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+	mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+	//reciprocal of diagnal elements 0,1,2,3,4,5,6,7
+	reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+	//Broadcast A10 to A70 to registers
+	mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	//Broadcast A21 to A71 to registers
+	mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+	mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+	mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+	mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+	mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+	mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+	//Broadcast A32 to A72 to registers
+	mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+	mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+	mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+	mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+	mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+	//Broadcast A43 to A73 to registers
+	mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+	mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+	mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+	mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+	//Broadcast A54 to A74 to registers
+	mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+	mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+	mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+	//Broadcast A65 to A75 to registers
+	mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+	mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+	//Broadcast A76 to register
+	mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+	//extract diag a00 from a
+	mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+	//mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]);
+	//extract diag a11 from a
+	mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+	mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+	//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+	//extract diag a22 from a
+	mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+	mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+	//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+	//extract diag a33 from a
+	mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+	mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+	//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+	//extract diag a44 from a
+	mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+	mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+	//mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+	//extract diag a55 from a
+	mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+	mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+	//mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+	//extract diag a66 from a
+	mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+	mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+	//mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+	//extract diag a77 from a
+	mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+	mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+	//mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+
+	/*****************   first set of 8 rows of B processing starts    *****************/
+	ptr_b_dup = ptr_b;
+	i = 0;
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		/////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+		//read 8x8 block of B into registers
+		mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+		mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+		mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+		mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+		mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+		mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+		mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+		mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]);
+
+		////////////////////////////////////////////////////////////////////////////////
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+		//i += cs_b_offset[6];
+		//ptr_b_dup += cs_b_offset[6];
+		i += 8;
+		ptr_b_dup += 8;
+	}
+
+	//c = 0;
+	/***************** first set of 8 cols of B processing done *****************/
+	ptr_b_dup = ptr_b;
+	i3 = 0;
+	i1 = 0;
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+	{
+		ptr_l += 8;
+		//ptr_b += j;
+		//ptr_b_dup += 8;
+		ptr_b_dup += cs_b_offset[6];
+		i1 += cs_b_offset[6];
+
+		//Read next 8x8 block of A to get diag elements
+		i3 += cs_l_offset[6];
+		mat_a_cols_rearr[8] = _mm256_loadu_ps((float const *)ptr_l + i3);
+		mat_a_cols_rearr[9] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+		mat_a_cols_rearr[10] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+		mat_a_cols_rearr[11] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+		mat_a_cols_rearr[12] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+		mat_a_cols_rearr[13] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+		mat_a_cols_rearr[14] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+		mat_a_cols_rearr[15] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+		//pack 8 diags of A together
+		reciprocal_diags[0] = reciprocal_diags[1];
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[8], mat_a_cols_rearr[9], 0xAA);//diag 0,1
+		mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[10], mat_a_cols_rearr[11], 0xAA);//diag 2,3
+		mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[12], mat_a_cols_rearr[13], 0xAA);//diag 4,5
+		mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[15], 0xAA);//diag 6,7
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+		mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+		//reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7
+		reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+		i = 0;
+		i2 = 0;
+		for (k = 0; k < numCols_b; k += 8)
+		{
+			i = i1 + k;
+			//Read 8 cols of B columns of Block-to-be-solved
+			mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+			mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+			mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+			mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+			mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+			mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+			mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+			mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+			i2++;
+		}
+		
+		i = 0;
+		i2 = 0;
+		for (l = 0; l < j; l += 8) // move across m
+		{
+			//Broadcast A8,0 to A15,0 to registers
+			mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+			mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+			mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+			mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+			mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+			mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+			mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+			mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+			//Broadcast A21 to A71 to registers
+			mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+			mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+			mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+			mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+			mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+			mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+			mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+			mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+			
+			//Broadcast A8,2 to A15,2 to registers
+			mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+			mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+			mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+			mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+			mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+			mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+			mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+			mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+		
+			//Broadcast A8,3 to A15,3 to registers
+			mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+			mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+			mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+			mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+			mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+			mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+			mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+			mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+			
+			// _mm256_permute2f128_ps()
+			
+			//Broadcast A8,4 to A15,4 to registers
+			mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+			mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+			mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+			mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+			mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+			mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+			mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+			mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+			
+			//Broadcast A8,5 to A15,5 to registers
+			mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+			mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+			mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+			mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+			mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+			mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+			mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+			mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+			
+			//Broadcast A8,6 to A15,6 to registers
+			mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+			mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+			mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+			mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+			mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+			mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+			mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+			mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+			
+			//Broadcast A8,7 to A15,7 to registers
+			mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+			mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+			mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+			mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+			mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+			mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+			mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+			mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+						
+			i += cs_l_offset[6];
+			
+			
+			for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+			{
+				/////////////////// Partial Lower 8x8 block trsm of B
+
+				i4 = i2 + k;
+				//Read current 8 cols of B columns from specified 8x8 current-block of B
+				mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+				mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+				mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+				mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+				mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+				mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+				mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+				mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+				i4 = k >> 3;
+				
+				//(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//end loop of cols					
+			}
+			i2 += cs_b_offset[6];
+		}
+		
+		//Broadcast A10 to A70 to registers
+		mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+		mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a00 from a
+		mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+		mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+		//mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]);
+		
+		//Broadcast A21 to A71 to registers
+		mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a11 from a
+		mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+		mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+		//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+	
+		//Broadcast A32 to A72 to registers
+		mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a22 from a
+		mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+		mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+		//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+	
+		//Broadcast A43 to A73 to registers
+		mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a33 from a
+		mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+		mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+		//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+	
+		//Broadcast A54 to A74 to registers
+		mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a44 from a
+		mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+		mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+		//mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+	
+		//Broadcast A65 to A75 to registers
+		mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a55 from a
+		mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+		mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+		//mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+	
+		//Broadcast A76 to register
+		mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		//extract diag a66 from a
+		mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+		mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+		//mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+		mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+		//mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+		k = 0;
+		for (i = 0; i < numCols_b; i+=8)
+		{
+			/////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+			
+			//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+			mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]);
+
+			//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+			mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+			mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+			//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+			mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+			//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+			mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+			//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+			mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]);
+
+			//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+			mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]);
+
+			//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+			mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]);
+
+			//(Row7): FMA operations of b7 with elements of index (7, 0)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+			mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]);
+
+			////////////////////////////////////////////////////////////////////////////////
+
+			//Store the computed B columns
+
+			_mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+			//printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+			k++;
+		}
+
+
+	}
+	///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
+	float ones = 1.0;
+	int i, i1, i2, i3, i4, j, k, l;
+	int cs_b_offset[7];
+	int cs_l_offset[7];
+	float *ptr_b_dup;
+
+	//57 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[16][8];
+	__m256 mat_a_cols_rearr[8];
+	__m256 mat_a_blk_elems[64];
+	__m256 mat_a_diag_inv[8];
+	__m256 reciprocal_diags[2];
+	__m256 alphaReg;
+
+	reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones));
+	alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//L matrix offsets
+	cs_l_offset[0] = (cs_l << 1);
+	cs_l_offset[1] = cs_l + cs_l_offset[0];
+	cs_l_offset[2] = (cs_l << 2);
+	cs_l_offset[3] = cs_l + cs_l_offset[2];
+	cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+	cs_l_offset[5] = cs_l + cs_l_offset[4];
+	cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+	//read diag elems of L 16x16 block
+	mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l);
+	mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l);
+	mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]);
+	mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]);
+	mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]);
+	mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]);
+	mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]);
+	mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]);
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+	cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+	reciprocal_diags[1] = reciprocal_diags[0];
+
+	//pack first 8 diags together
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1
+	mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3
+	mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5
+	mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+	mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+	mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+	//reciprocal of diagnal elements 0,1,2,3,4,5,6,7
+	reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+	//Broadcast A10 to A70 to registers
+	mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	//Broadcast A21 to A71 to registers
+	mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+	mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+	mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+	mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+	mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+	mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+	//Broadcast A32 to A72 to registers
+	mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+	mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+	mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+	mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+	mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+	//Broadcast A43 to A73 to registers
+	mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+	mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+	mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+	mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+	//Broadcast A54 to A74 to registers
+	mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+	mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+	mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+	//Broadcast A65 to A75 to registers
+	mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+	mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+	//Broadcast A76 to register
+	mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+	//extract diag a00 from a
+	mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+	mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+	//mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]);
+	//extract diag a11 from a
+	mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+	mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+	//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+	//extract diag a22 from a
+	mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+	mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+	//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+	//extract diag a33 from a
+	mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+	mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+	//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+	//extract diag a44 from a
+	mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+	mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+	//mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+	//extract diag a55 from a
+	mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+	mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+	//mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+	//extract diag a66 from a
+	mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+	mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+	//mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+	//extract diag a77 from a
+	mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+	mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+	//mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+
+	/*****************   first set of 8 rows of B processing starts    *****************/
+	ptr_b_dup = ptr_b;
+	i = 0;
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		/////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+		//read 8x8 block of B into registers
+		mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+		mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+		mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+		mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+		mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+		mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+		mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+		mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+		mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg);
+		mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg);
+		mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg);
+		mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg);
+		mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg);
+		mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg);
+		mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg);
+		mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg);
+
+		//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+		mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]);
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+		mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]);
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+		mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]);
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+		mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]);
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+		mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]);
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+		mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]);
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+		mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]);
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+		mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]);
+
+		////////////////////////////////////////////////////////////////////////////////
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+		//i += cs_b_offset[6];
+		//ptr_b_dup += cs_b_offset[6];
+		i += 8;
+		ptr_b_dup += 8;
+	}
+
+	//c = 0;
+	/***************** first set of 8 cols of B processing done *****************/
+	ptr_b_dup = ptr_b;
+	i3 = 0;
+	i1 = 0;
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+	{
+		ptr_l += 8;
+		//ptr_b += j;
+		//ptr_b_dup += 8;
+		ptr_b_dup += cs_b_offset[6];
+		i1 += cs_b_offset[6];
+
+		//Read next 8x8 block of A to get diag elements
+		i3 += cs_l_offset[6];
+		mat_a_cols_rearr[8] = _mm256_loadu_ps((float const *)ptr_l + i3);
+		mat_a_cols_rearr[9] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l);
+		mat_a_cols_rearr[10] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]);
+		mat_a_cols_rearr[11] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]);
+		mat_a_cols_rearr[12] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]);
+		mat_a_cols_rearr[13] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]);
+		mat_a_cols_rearr[14] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]);
+		mat_a_cols_rearr[15] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]);
+
+		//pack 8 diags of A together
+		reciprocal_diags[0] = reciprocal_diags[1];
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[8], mat_a_cols_rearr[9], 0xAA);//diag 0,1
+		mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[10], mat_a_cols_rearr[11], 0xAA);//diag 2,3
+		mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[12], mat_a_cols_rearr[13], 0xAA);//diag 4,5
+		mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[15], 0xAA);//diag 6,7
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3
+		mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7
+		mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7
+
+		//reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7
+		reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]);
+
+		i = 0;
+		i2 = 0;
+		for (k = 0; k < numCols_b; k += 8)
+		{
+			i = i1 + k;
+			//Read 8 cols of B columns of Block-to-be-solved
+			mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+			mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+			mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+			mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+			mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+			mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+			mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+			mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+			
+			mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg);
+		        mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg);
+		    	mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg);
+		    	mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg);
+		    	mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg);
+		    	mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg);
+		    	mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg);
+		    	mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg);
+			
+			i2++;
+		}
+		
+		i = 0;
+		i2 = 0;
+		for (l = 0; l < j; l += 8) // move across m
+		{
+			//Broadcast A8,0 to A15,0 to registers
+			mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+			mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+			mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+			mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+			mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+			mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+			mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+			mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+			//Broadcast A21 to A71 to registers
+			mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+			mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+			mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+			mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+			mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+			mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+			mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+			mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+			
+			//Broadcast A8,2 to A15,2 to registers
+			mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+			mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+			mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+			mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+			mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+			mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+			mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+			mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+		
+			//Broadcast A8,3 to A15,3 to registers
+			mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+			mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+			mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+			mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+			mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+			mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+			mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+			mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+			
+			// _mm256_permute2f128_ps()
+			
+			//Broadcast A8,4 to A15,4 to registers
+			mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+			mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+			mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+			mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+			mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+			mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+			mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+			mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+			
+			//Broadcast A8,5 to A15,5 to registers
+			mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+			mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+			mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+			mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+			mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+			mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+			mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+			mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+			
+			//Broadcast A8,6 to A15,6 to registers
+			mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+			mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+			mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+			mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+			mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+			mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+			mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+			mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+			
+			//Broadcast A8,7 to A15,7 to registers
+			mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+			mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+			mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+			mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+			mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+			mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+			mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+			mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+						
+			i += cs_l_offset[6];
+			
+			
+			for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+			{
+				/////////////////// Partial Lower 8x8 block trsm of B
+
+				i4 = i2 + k;
+				//Read current 8 cols of B columns from specified 8x8 current-block of B
+				mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+				mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+				mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+				mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+				mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+				mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+				mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+				mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+				i4 = k >> 3;
+				
+				//(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//end loop of cols					
+			}
+			i2 += cs_b_offset[6];
+		}
+		
+		//Broadcast A10 to A70 to registers
+		mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+		mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a00 from a
+		mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+		mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00);
+		//mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]);
+		
+		//Broadcast A21 to A71 to registers
+		mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a11 from a
+		mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+		mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00);
+		//mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]);
+	
+		//Broadcast A32 to A72 to registers
+		mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a22 from a
+		mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+		mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00);
+		//mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]);
+	
+		//Broadcast A43 to A73 to registers
+		mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a33 from a
+		mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+		mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00);
+		//mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]);
+	
+		//Broadcast A54 to A74 to registers
+		mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a44 from a
+		mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00);
+		mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11);
+		//mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]);
+	
+		//Broadcast A65 to A75 to registers
+		mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+		//extract diag a55 from a
+		mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55);
+		mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11);
+		//mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]);
+	
+		//Broadcast A76 to register
+		mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		//extract diag a66 from a
+		mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA);
+		mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11);
+		//mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]);
+
+		//extract diag a77 from a
+		mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF);
+		mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11);
+		//mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]);
+
+		k = 0;
+		for (i = 0; i < numCols_b; i+=8)
+		{
+			/////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+			
+			//(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B
+			mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]);
+
+			//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+			mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B
+			mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]);
+
+			//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B
+			mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]);
+
+			//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B
+			mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]);
+
+			//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B
+			mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]);
+
+			//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B
+			mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]);
+
+			//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B
+			mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]);
+
+			//(Row7): FMA operations of b7 with elements of index (7, 0)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B
+			mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]);
+
+			////////////////////////////////////////////////////////////////////////////////
+
+			//Store the computed B columns
+
+			_mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+			//printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+			k++;
+		}
+
+
+	}
+	///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
+{
+	//float ones = 1.0;
+	int i, i1, i2, i3, i4, j, k, l;
+	int cs_b_offset[7];
+	int cs_l_offset[7];
+	float *ptr_b_dup;
+
+	//57 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[16][8];
+	//__m256 mat_a_cols_rearr[8];
+	__m256 mat_a_blk_elems[64];
+	//__m256 mat_a_diag_inv[8];
+	//__m256 reciprocal_diags[2];
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//L matrix offsets
+	cs_l_offset[0] = (cs_l << 1);
+	cs_l_offset[1] = cs_l + cs_l_offset[0];
+	cs_l_offset[2] = (cs_l << 2);
+	cs_l_offset[3] = cs_l + cs_l_offset[2];
+	cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+	cs_l_offset[5] = cs_l + cs_l_offset[4];
+	cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+	cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+	//Broadcast A10 to A70 to registers
+	mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	//Broadcast A21 to A71 to registers
+	mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+	mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+	mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+	mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+	mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+	mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+	//Broadcast A32 to A72 to registers
+	mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+	mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+	mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+	mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+	mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+	//Broadcast A43 to A73 to registers
+	mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+	mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+	mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+	mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+	//Broadcast A54 to A74 to registers
+	mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+	mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+	mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+	//Broadcast A65 to A75 to registers
+	mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+	mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+	//Broadcast A76 to register
+	mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+
+	/*****************   first set of 8 rows of B processing starts    *****************/
+	ptr_b_dup = ptr_b;
+	i = 0;
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		/////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+		//read 8x8 block of B into registers
+		mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+		mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+		mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+		mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+		mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+		mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+		mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+		mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+		//(Row0)
+		mat_b_col[0] = mat_b_rearr[0][0];
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		////////////////////////////////////////////////////////////////////////////////
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+		//i += cs_b_offset[6];
+		//ptr_b_dup += cs_b_offset[6];
+		i += 8;
+		ptr_b_dup += 8;
+	}
+
+	//c = 0;
+	/***************** first set of 8 cols of B processing done *****************/
+	ptr_b_dup = ptr_b;
+	i3 = 0;
+	i1 = 0;
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+	{
+		ptr_l += 8;
+		//ptr_b += j;
+		//ptr_b_dup += 8;
+		ptr_b_dup += cs_b_offset[6];
+		i1 += cs_b_offset[6];
+		i3 += cs_l_offset[6];
+
+		i = 0;
+		i2 = 0;
+		for (k = 0; k < numCols_b; k += 8)
+		{
+			i = i1 + k;
+			//Read 8 cols of B columns of Block-to-be-solved
+			mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+			mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+			mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+			mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+			mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+			mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+			mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+			mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+			i2++;
+		}
+		
+		i = 0;
+		i2 = 0;
+		for (l = 0; l < j; l += 8) // move across m
+		{
+			//Broadcast A8,0 to A15,0 to registers
+			mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+			mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+			mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+			mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+			mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+			mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+			mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+			mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+			//Broadcast A21 to A71 to registers
+			mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+			mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+			mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+			mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+			mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+			mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+			mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+			mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+			
+			//Broadcast A8,2 to A15,2 to registers
+			mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+			mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+			mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+			mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+			mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+			mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+			mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+			mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+		
+			//Broadcast A8,3 to A15,3 to registers
+			mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+			mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+			mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+			mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+			mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+			mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+			mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+			mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+			
+			// _mm256_permute2f128_ps()
+			
+			//Broadcast A8,4 to A15,4 to registers
+			mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+			mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+			mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+			mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+			mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+			mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+			mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+			mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+			
+			//Broadcast A8,5 to A15,5 to registers
+			mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+			mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+			mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+			mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+			mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+			mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+			mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+			mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+			
+			//Broadcast A8,6 to A15,6 to registers
+			mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+			mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+			mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+			mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+			mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+			mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+			mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+			mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+			
+			//Broadcast A8,7 to A15,7 to registers
+			mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+			mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+			mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+			mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+			mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+			mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+			mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+			mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+						
+			i += cs_l_offset[6];
+			
+			for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+			{
+				/////////////////// Partial Lower 8x8 block trsm of B
+
+				i4 = i2 + k;
+				//Read current 8 cols of B columns from specified 8x8 current-block of B
+				mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+				mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+				mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+				mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+				mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+				mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+				mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+				mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+				i4 = k >> 3;
+				
+				//(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//end loop of cols					
+			}
+			i2 += cs_b_offset[6];
+		}
+		
+		//Broadcast A10 to A70 to registers
+		mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+		mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+				
+		//Broadcast A21 to A71 to registers
+		mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A32 to A72 to registers
+		mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A43 to A73 to registers
+		mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A54 to A74 to registers
+		mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A65 to A75 to registers
+		mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A76 to register
+		mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+		k = 0;
+		for (i = 0; i < numCols_b; i+=8)
+		{
+			/////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+			
+			//(Row0): already done
+
+			//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+			mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row7): FMA operations of b7 with elements of index (7, 0)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			////////////////////////////////////////////////////////////////////////////////
+
+			//Store the computed B columns
+
+			_mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+			//printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+			k++;
+		}
+
+
+	}
+	///////////////////loop ends /////////////////////
+}
+
+static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha)
+{
+	//float ones = 1.0;
+	int i, i1, i2, i3, i4, j, k, l;
+	int cs_b_offset[7];
+	int cs_l_offset[7];
+	float *ptr_b_dup;
+
+	//57 number of ymm(256 bits) registers used
+	__m256 mat_b_col[8];
+	__m256 mat_b_rearr[16][8];
+	//__m256 mat_a_cols_rearr[8];
+	__m256 mat_a_blk_elems[64];
+	//__m256 mat_a_diag_inv[8];
+	//__m256 reciprocal_diags[2];
+	__m256 alphaReg;
+	alphaReg = _mm256_broadcast_ss((float const *)&alpha);
+
+	// ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- //
+
+	//L matrix offsets
+	cs_l_offset[0] = (cs_l << 1);
+	cs_l_offset[1] = cs_l + cs_l_offset[0];
+	cs_l_offset[2] = (cs_l << 2);
+	cs_l_offset[3] = cs_l + cs_l_offset[2];
+	cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2];
+	cs_l_offset[5] = cs_l + cs_l_offset[4];
+	cs_l_offset[6] = (cs_l_offset[5] + cs_l);
+
+	cs_b_offset[0] = (cs_b << 1);
+	cs_b_offset[1] = cs_b + cs_b_offset[0];
+	cs_b_offset[2] = (cs_b << 2);
+	cs_b_offset[3] = cs_b + cs_b_offset[2];
+	cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2];
+	cs_b_offset[5] = cs_b + cs_b_offset[4];
+	cs_b_offset[6] = (cs_b_offset[5] + cs_b);
+
+	//Broadcast A10 to A70 to registers
+	mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1));
+	mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2));
+	mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3));
+	mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4));
+	mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5));
+	mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6));
+	mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7));
+
+	//Broadcast A21 to A71 to registers
+	mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2));
+	mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3));
+	mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4));
+	mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5));
+	mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6));
+	mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7));
+
+	//Broadcast A32 to A72 to registers
+	mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3));
+	mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4));
+	mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5));
+	mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6));
+	mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7));
+
+	//Broadcast A43 to A73 to registers
+	mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4));
+	mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5));
+	mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6));
+	mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7));
+
+	//Broadcast A54 to A74 to registers
+	mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5));
+	mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6));
+	mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7));
+
+	//Broadcast A65 to A75 to registers
+	mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6));
+	mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7));
+
+	//Broadcast A76 to register
+	mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7));
+
+
+	/*****************   first set of 8 rows of B processing starts    *****************/
+	ptr_b_dup = ptr_b;
+	i = 0;
+	for (j = 0; j < numCols_b; j += 8)
+	{
+		/////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A
+		//read 8x8 block of B into registers
+		mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+		mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+		mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+		mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+		mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+		mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+		mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+		mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+
+		mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg);
+		mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg);
+		mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg);
+		mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg);
+		mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg);
+		mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg);
+		mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg);
+		mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg);
+		
+		//(Row0)
+		mat_b_col[0] = mat_b_rearr[0][0];
+
+		//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+		mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b)
+		mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+		mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b)
+		mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+		mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b)
+		mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+		mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b)
+		mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+		mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b)
+		mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+		mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b)
+		mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		//(Row7): FMA operations of b7 with elements of index (7, 0)
+		mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b)
+
+		////////////////////////////////////////////////////////////////////////////////
+
+		//Store the computed B columns
+		_mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]);
+		_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]);
+
+		//i += cs_b_offset[6];
+		//ptr_b_dup += cs_b_offset[6];
+		i += 8;
+		ptr_b_dup += 8;
+	}
+
+	//c = 0;
+	/***************** first set of 8 cols of B processing done *****************/
+	ptr_b_dup = ptr_b;
+	i3 = 0;
+	i1 = 0;
+	//Start loop for cols of B to be processed in size of blk_width
+	for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row
+	{
+		ptr_l += 8;
+		//ptr_b += j;
+		//ptr_b_dup += 8;
+		ptr_b_dup += cs_b_offset[6];
+		i1 += cs_b_offset[6];
+		i3 += cs_l_offset[6];
+
+		i = 0;
+		i2 = 0;
+		for (k = 0; k < numCols_b; k += 8)
+		{
+			i = i1 + k;
+			//Read 8 cols of B columns of Block-to-be-solved
+			mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i);
+			mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i));
+			mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i));
+			mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i));
+			mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i));
+			mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i));
+			mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i));
+			mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i));
+			
+			mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg);
+		        mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg);
+		    	mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg);
+		    	mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg);
+		    	mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg);
+		    	mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg);
+		    	mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg);
+		    	mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg);
+			
+			i2++;
+		}
+		
+		i = 0;
+		i2 = 0;
+		for (l = 0; l < j; l += 8) // move across m
+		{
+			//Broadcast A8,0 to A15,0 to registers
+			mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i));
+			mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+			mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+			mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+			mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+			mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+			mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+			mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+			//Broadcast A21 to A71 to registers
+			mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i));
+			mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1));
+			mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2));
+			mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3));
+			mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4));
+			mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5));
+			mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6));
+			mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7));
+			
+			//Broadcast A8,2 to A15,2 to registers
+			mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i));
+			mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1));
+			mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2));
+			mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3));
+			mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4));
+			mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5));
+			mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6));
+			mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7));
+		
+			//Broadcast A8,3 to A15,3 to registers
+			mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i));
+			mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1));
+			mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2));
+			mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3));
+			mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4));
+			mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5));
+			mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6));
+			mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7));
+			
+			// _mm256_permute2f128_ps()
+			
+			//Broadcast A8,4 to A15,4 to registers
+			mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i));
+			mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1));
+			mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2));
+			mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3));
+			mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4));
+			mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5));
+			mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6));
+			mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7));
+			
+			//Broadcast A8,5 to A15,5 to registers
+			mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i));
+			mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1));
+			mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2));
+			mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3));
+			mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4));
+			mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5));
+			mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6));
+			mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7));
+			
+			//Broadcast A8,6 to A15,6 to registers
+			mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i));
+			mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1));
+			mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2));
+			mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3));
+			mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4));
+			mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5));
+			mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6));
+			mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7));
+			
+			//Broadcast A8,7 to A15,7 to registers
+			mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i));
+			mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1));
+			mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2));
+			mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3));
+			mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4));
+			mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5));
+			mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6));
+			mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7));
+						
+			i += cs_l_offset[6];
+			
+			for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m)
+			{
+				/////////////////// Partial Lower 8x8 block trsm of B
+
+				i4 = i2 + k;
+				//Read current 8 cols of B columns from specified 8x8 current-block of B
+				mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4);
+				mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b));
+				mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0]));
+				mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1]));
+				mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2]));
+				mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3]));
+				mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4]));
+				mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5]));
+
+				i4 = k >> 3;
+				
+				//(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+				mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b)
+				mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b)
+				mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b)
+				mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b)
+				mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b)
+				mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b)
+				mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b)
+				mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b)
+
+				//end loop of cols					
+			}
+			i2 += cs_b_offset[6];
+		}
+		
+		//Broadcast A10 to A70 to registers
+		mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1));
+		mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+				
+		//Broadcast A21 to A71 to registers
+		mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2));
+		mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A32 to A72 to registers
+		mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3));
+		mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A43 to A73 to registers
+		mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4));
+		mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A54 to A74 to registers
+		mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5));
+		mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A65 to A75 to registers
+		mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6));
+		mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		i += cs_l;
+			
+		//Broadcast A76 to register
+		mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7));
+		
+		k = 0;
+		for (i = 0; i < numCols_b; i+=8)
+		{
+			/////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A
+			
+			//(Row0): already done
+
+			//(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0)
+			mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0)
+			mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0)
+			mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0)
+			mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0)
+			mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0)
+			mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			//(Row7): FMA operations of b7 with elements of index (7, 0)
+			mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b)
+
+			////////////////////////////////////////////////////////////////////////////////
+
+			//Store the computed B columns
+
+			_mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]);
+			_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]);
+			//printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k));
+			k++;
+		}
+
+
+	}
+	///////////////////loop ends /////////////////////
+}
+
+
+#endif
+
+>>>>>>> small matrix trsm intrinsics optimization code for AX=B and XA'=B