From 82c2eb4e8efd8b02a09a72e0440a43f589a2486d Mon Sep 17 00:00:00 2001
From: Edward Smyth <Edward.Smyth@amd.com>
Date: Fri, 16 Dec 2022 06:24:50 -0500
Subject: [PATCH] Code cleanup and warnings fixes

Corrections for some occurrences of:
- Compiler warnings about initialization of float from double
- Spelling mistakes in comments
- Incorrect indentation of code and comments

AMD-Internal: [CPUPL-2870]
Change-Id: Icb68c789687bd0684844331d43071bfffecac9fc
---
 aocl_dtl/aocldtl.c                 |   4 +-
 config/zen3/make_defs.mk           |   4 +-
 config/zen4/make_defs.mk           |   4 +-
 configure                          |   4 +-
 frame/3/bli_l3_packm.c             |   4 +-
 frame/compat/bla_gemm_amd.c        |   4 +-
 frame/thread/bli_l3_decor_openmp.c |   4 +-
 frame/thread/bli_l3_decor_single.c |   4 +-
 frame/util/bli_util_unb_var1.c     |   6 +-
 kernels/zen/3/bli_gemm_small.c     |  22 +-
 kernels/zen/3/bli_trsm_small.c     | 348 ++++++++++++++---------------
 11 files changed, 204 insertions(+), 204 deletions(-)

diff --git a/aocl_dtl/aocldtl.c b/aocl_dtl/aocldtl.c
index 6e7ee3510..a9b3db178 100644
--- a/aocl_dtl/aocldtl.c
+++ b/aocl_dtl/aocldtl.c
@@ -5,7 +5,7 @@
  *               These functions are invoked though macros by
  *               end user.
  *
- * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  *=======================================================================*/
 #include "blis.h"
@@ -129,7 +129,7 @@ void DTL_Initialize(
 
 #if (AOCL_DTL_LOG_ENABLE || AOCL_DTL_DUMP_ENABLE)
     
-    /* Check if DTL logging is requested via envoronment variable */ 
+    /* Check if DTL logging is requested via environment variable */
     gbIsLoggingEnabled = bli_env_get_var( "AOCL_VERBOSE", TRUE );
 #endif
 
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 8522a1e95..ecbb7889b 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -96,7 +96,7 @@ ifeq ($(CC_VENDOR),clang)
 # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
 # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
 
-# For our prupose we just want to know if it version 2x or 3x
+# For our purpose we just want to know if it is version 2x or 3x
 
 # for version 3x we will enable znver3
 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk
index f10f20a48..05975b969 100644
--- a/config/zen4/make_defs.mk
+++ b/config/zen4/make_defs.mk
@@ -4,7 +4,7 @@
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -112,7 +112,7 @@ ifeq ($(CC_VENDOR),clang)
 # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
 # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0)
 
-# For our prupose we just want to know if it version 2x or 3x or 4x
+# For our purpose we just want to know if it is version 2x or 3x or 4x
 
 # for version 4x we will enable znver4
 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1)
diff --git a/configure b/configure
index 73dc8cc35..36b3e3c85 100755
--- a/configure
+++ b/configure
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -215,7 +215,7 @@ print_usage()
 	echo " "
 	echo "                 Set the size (in bits) of internal BLIS integers and"
 	echo "                 integer types used in native BLIS interfaces. The"
-	echo "                 default inteter type size is architecture dependent."
+	echo "                 default integer type size is architecture dependent."
 	echo "                 (Hint: You can always find this value printed at the"
 	echo "                 beginning of the testsuite output.)"
 	echo " "
diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c
index 1134bdc1f..5d6914ec5 100644
--- a/frame/3/bli_l3_packm.c
+++ b/frame/3/bli_l3_packm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -164,7 +164,7 @@ void bli_l3_packm
 	// with the mem_t entry acquired from the memory broker (now cached in
 	// the control tree node).
 	void* buf = bli_mem_buffer( cntl_mem_p );
-    bli_obj_set_buffer( buf, x_pack );
+	bli_obj_set_buffer( buf, x_pack );
 
 
 	// Pack the contents of object x to object x_pack.
diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c
index 7ae62b06f..3eb67b8d2 100644
--- a/frame/compat/bla_gemm_amd.c
+++ b/frame/compat/bla_gemm_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -804,7 +804,7 @@ void zgemm_blis_impl
     bli_obj_set_conjtrans( blis_transa, &ao );
     bli_obj_set_conjtrans( blis_transb, &bo );
 
-    // default instance peformance tuning is done in zgemm.
+    // default instance performance tuning is done in zgemm.
     // Single instance tuning is done based on env set.
     //dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
 
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
index b01c208a3..14b2bd04a 100644
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -145,7 +145,7 @@ void bli_l3_thread_decorator
 		tls_aoclprogress_counter = 0;
 
 		// We send the update only after certain threshold is reached, 
-		// The thresold is defined as AOCL_PROGRESS_FREQUENCY. 
+		// The threshold is defined as AOCL_PROGRESS_FREQUENCY.
 		// This variable stores the counter value when last update was sent. 
 		// It is compared with current counter value to see if it is time to
 		// send the next update.
diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c
index 444583e73..0a4d16c22 100644
--- a/frame/thread/bli_l3_decor_single.c
+++ b/frame/thread/bli_l3_decor_single.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -121,7 +121,7 @@ void bli_l3_thread_decorator
 		tls_aoclprogress_counter = 0;
 		
 		// We send the update only after certain threshold is reached, 
-		// The thresold is defined as AOCL_PROGRESS_FREQUENCY. 
+		// The threshold is defined as AOCL_PROGRESS_FREQUENCY.
 		// This variable stores the counter value when last update was sent. 
 		// It is compared with current counter value to see if it is time to
 		// send the next update.
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index e4b0acf04..3c4fd6ea1 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -329,8 +329,8 @@ void bli_cnormfv_unb_var1
     }
     else
     {
-        float* zero       = bli_d0;
-        float* one        = bli_d1;
+        float* zero       = bli_s0;
+        float* one        = bli_s1;
         float  scale;
         float  sumsq;
         float  sqrt_sumsq;
diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c
index 22bb48f73..069b90170 100644
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -165,25 +165,25 @@ err_t bli_gemm_small
 #ifndef BLIS_ENABLE_MULTITHREADING
             // bli_dgemm_small_At is called directly from blas interface for
             // sizes within thresholds.
-            // Avoinding calling of bli_dgemm_small_At from gemm_front
+            // Avoiding calling of bli_dgemm_small_At from gemm_front
             // and directing to native implementation.
             return BLIS_NOT_YET_IMPLEMENTED;
 #else
             return bli_dgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
 #endif
         }
-    if(dt == BLIS_DCOMPLEX)
-    {
+        if(dt == BLIS_DCOMPLEX)
+        {
 #ifndef BLIS_ENABLE_MULTITHREADING
             // bli_zgemm_small_At is called directly from blas interface for
             // sizes within thresholds.
-            // Avoinding calling of bli_zgemm_small_At from gemm_front
+            // Avoiding calling of bli_zgemm_small_At from gemm_front
             // and directing to native implementation.
             return BLIS_NOT_YET_IMPLEMENTED;
 #else
-        return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
+            return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
 #endif
-    }
+        }
 
         if (bli_obj_has_notrans( b ))
         {
@@ -1827,7 +1827,7 @@ static err_t bli_sgemm_small
         double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C
 
         double *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        double *tA_packed; // temprorary pointer to hold packed A memory pointer
+        double *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         guint_t col_idx_start; //starting index after A matrix is packed.
@@ -4341,7 +4341,7 @@ err_t bli_dgemm_small_At
         double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C
 
         double *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        double *tA_packed; // temprorary pointer to hold packed A memory pointer
+        double *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         dim_t tb_inc_row = 1; // row stride of matrix B
@@ -5822,7 +5822,7 @@ err_t bli_zgemm_small
         dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C
 
         dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        dcomplex *tA_packed; //temprorary pointer to hold packed A memory pointer
+        dcomplex *tA_packed; //temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         guint_t col_idx_start; //starting index after A matrix is packed.
@@ -9779,7 +9779,7 @@ err_t bli_zgemm_small_At
         dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C
 
         dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        dcomplex *tA_packed; // temprorary pointer to hold packed A memory pointer
+        dcomplex *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         dim_t tb_inc_row = 1; // row stride of matrix B
diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c
index f2f94fc1a..12c0ee729 100644
--- a/kernels/zen/3/bli_trsm_small.c
+++ b/kernels/zen/3/bli_trsm_small.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1439,8 +1439,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
 
 /*
    Load b11 of size 6x8 and multiply with alpha
-   Add the GEMM output and perform inregister transose of b11
-   to peform DTRSM operation for left cases.
+   Add the GEMM output and perform in register transpose of b11
+   to perform DTRSM operation for left cases.
 */
 #define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal) \
         ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal));\
@@ -4684,8 +4684,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
 
 /*
    Load b11 of size 6x8 and multiply with alpha
-   Add the GEMM output and perform inregister transose of b11
-   to peform DTRSM operation for left cases.
+   Add the GEMM output and perform in register transpose of b11
+   to perform DTRSM operation for left cases.
 */
 #define BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal) \
         ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));\
@@ -5119,7 +5119,7 @@ BLIS_INLINE void bli_dtrsm_small_pack
 }
 /*
     Pack diagonal elements of A block (8 or 6) into an array
-    a. This helps in utilze cache line efficiently in TRSM operation
+    a. This helps to utilize cache line efficiently in TRSM operation
     b. store ones when input is unit diagonal
 */
 BLIS_INLINE void dtrsm_small_pack_diag_element
@@ -7772,8 +7772,8 @@ BLIS_INLINE err_t ztrsm_AuXB_ref
 
 /*
  * Load b11 of size 3x4 and multiply with alpha
- * Add the GEMM output and perform inregister transose of b11
- * to peform ZTRSM operation for left cases.
+ * Add the GEMM output and perform in register transpose of b11
+ * to perform ZTRSM operation for left cases.
  */
 #define BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) {\
     ymm16 = _mm256_broadcast_pd(( __m128d const *)(&AlphaVal));\
@@ -8383,7 +8383,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
     bool transa = bli_obj_has_trans(a);
     dim_t cs_a, rs_a;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -8480,7 +8480,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
 
@@ -8513,8 +8513,8 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_6nx8m(a01,b10,cs_b,p_lda,k_iter)
@@ -8522,7 +8522,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
             /*
             Load b11 of size 8x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b)
@@ -10785,7 +10785,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -10883,7 +10883,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -10915,8 +10915,8 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
 
@@ -10925,7 +10925,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
             /*
             Load b11 of size 8x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b)
@@ -13105,7 +13105,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -13174,7 +13174,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
     }
 
     /*
-        Performs solving TRSM for 8 colmns at a time from  0 to m/d_mr in steps of d_mr
+        Performs solving TRSM for 8 columns at a time from 0 to m/d_mr in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -13196,15 +13196,15 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_dtrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -13221,7 +13221,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)
         {
@@ -13235,16 +13235,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal)
 
@@ -15110,7 +15110,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -15178,7 +15178,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 8 colmns at a time from  0 to m/8 in steps of d_mr
+        Performs solving TRSM for 8 columns at a time from 0 to m/8 in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-8)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -15194,17 +15194,17 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_dtrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -15221,7 +15221,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -15237,16 +15237,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal)
 
@@ -17148,7 +17148,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
 
 /*
     Pack diagonal elements of A block (16 or 6) into an array
-    a. This helps in utilze cache line efficiently in TRSM operation
+    a. This helps to utilize cache line efficiently in TRSM operation
     b. store ones when input is unit diagonal
 */
 BLIS_INLINE void strsm_small_pack_diag_element
@@ -17709,7 +17709,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -17805,7 +17805,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -17837,8 +17837,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter)
@@ -17846,7 +17846,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
             /*
             Load b11 of size 16x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b)
@@ -21374,7 +21374,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
     bool transa = bli_obj_has_trans(a);
     dim_t cs_a, rs_a;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -21472,7 +21472,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
 
@@ -21505,8 +21505,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter)
@@ -21514,7 +21514,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
             /*
             Load b11 of size 16x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b)
@@ -25217,7 +25217,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -25283,7 +25283,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 16 colmns at a time from  0 to m/16 in steps of d_mr
+        Performs solving TRSM for 16 columns at a time from 0 to m/16 in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 16x6 to 16x (m-16)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -25299,17 +25299,17 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_strsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 16 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -25326,7 +25326,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 16x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -25342,16 +25342,16 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal)
 
@@ -25877,8 +25877,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
             ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26459,7 +26459,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
             a11 += rs_a;
 
-            // N-register tranpose and store
+            // N-register transpose and store
 
             ymm0 = _mm256_unpacklo_ps(ymm10, ymm11);
             ymm1 = _mm256_unpacklo_ps(ymm17, ymm18);
@@ -26556,8 +26556,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform an in-register transpose of b11
+                to perform the TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26641,8 +26641,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform an in-register transpose of b11
+                to perform the TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26724,8 +26724,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform an in-register transpose of b11
+                to perform the TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -29583,7 +29583,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -29671,15 +29671,15 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_strsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -29710,16 +29710,16 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal)
 
@@ -31830,8 +31830,8 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
 
@@ -33724,7 +33724,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -33792,7 +33792,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+        Solves TRSM for 4 columns at a time, from 0 to m/4, in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -33808,19 +33808,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory
-          D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory
+              D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 4x(m-4) which is the maximum GEMM alone block size
-         in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 4x(m-4) which is the maximum GEMM alone block size
+                 in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_ztrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 4 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -33836,7 +33836,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 4x3 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -33852,16 +33852,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 3x4 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal)
         /*
@@ -34957,7 +34957,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -35026,7 +35026,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     }
 
     /*
-        Performs solving TRSM for 4 colmns at a time from  0 to m/d_mr in steps of d_mr
+        Solves TRSM for 4 columns at a time, from 0 to m/d_mr, in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -35047,18 +35047,18 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
         {
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory
-          D_A_pack
+              D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 4x(m-4) which is the maximum GEMM alone block size
-         in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 4x(m-4) which is the maximum GEMM alone block size
+                 in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_ztrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -35075,7 +35075,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)
         {
@@ -35089,16 +35089,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform an in-register transpose of b11
+               to perform the TRSM operation.
             */
             BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal)
 
@@ -36181,7 +36181,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -36272,7 +36272,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
 
             /*
                Pack 3 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM
+               a. This helps to utilize cache line efficiently in TRSM
                operation
                b. store ones when input is unit diagonal
                */
@@ -36308,8 +36308,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
 
@@ -36318,7 +36318,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*
                Load b11 multiply with alpha
                Add the GEMM output to b11
-               and peform TRSM operation.
+               and perform TRSM operation.
                */
 
             BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b)
@@ -36491,8 +36491,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36501,7 +36501,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x3(AlphaVal,b11,cs_b)
@@ -36684,8 +36684,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36694,7 +36694,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 of size 8x6 and multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x2(AlphaVal,b11,cs_b)
@@ -36832,8 +36832,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36842,7 +36842,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 and multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x1(AlphaVal,b11,cs_b)
@@ -37166,8 +37166,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             //BLIS_ZTRSM_SMALL_GEMM_3nx3m(a01,b10,cs_b,p_lda,k_iter)
@@ -37255,8 +37255,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             BLIS_ZTRSM_SMALL_GEMM_2nx2m(a01,b10,cs_b,p_lda,k_iter)
@@ -37326,8 +37326,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             BLIS_ZTRSM_SMALL_GEMM_2nx1m(a01,b10,cs_b,p_lda,k_iter)
@@ -37644,7 +37644,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -37734,7 +37734,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
 
             /*
                Pack 3 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM
+               a. This helps to utilize cache line efficiently in TRSM
                operation
                b. store ones when input is unit diagonal
                */
@@ -37769,8 +37769,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
 
@@ -37779,7 +37779,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
             /*
                Load b11 of size 4x3 and multiply with alpha
                Add the GEMM output to b11
-               and peform TRSM operation.
+               and perform TRSM operation.
                */
 
             BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b)
@@ -42231,7 +42231,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -42308,7 +42308,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 	}
     
     /*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Solves TRSM for d_mr columns at a time, from 0 to m/d_mr, in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -42324,11 +42324,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -42336,7 +42336,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -42352,7 +42352,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		dim_t temp = n - d_nr + 1;
 		for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -42368,16 +42368,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform an in-register transpose of b11
+			   to perform the TRSM operation.
 			   */
 			BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal)
 			/*
@@ -44763,7 +44763,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -44840,7 +44840,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Solves TRSM for d_mr columns at a time, from 0 to m/d_mr, in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -44858,11 +44858,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -44870,7 +44870,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -44886,7 +44886,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 
 		for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)   //loop along 'N' dimension
@@ -44902,16 +44902,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform an in-register transpose of b11
+			   to perform the TRSM operation.
 			   */
 			BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal)
 			/*
@@ -47544,7 +47544,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -47618,7 +47618,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Solves TRSM for 4 columns at a time, from 0 to m/4, in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -47634,11 +47634,11 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -47646,7 +47646,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -47662,7 +47662,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		for(i = (m-d_mr); (i+1) > 0; i -= d_mr)     //loop along 'M' direction
 		{
@@ -47678,16 +47678,16 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform an in-register transpose of b11
+			   to perform the TRSM operation.
 			   */
 			BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b)
 			/*
@@ -49164,7 +49164,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -49237,7 +49237,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Solves TRSM for 4 columns at a time, from 0 to m/4, in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -49254,11 +49254,11 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -49266,7 +49266,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -49282,7 +49282,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		for(i = 0; (i+d_mr-1) < m; i += d_mr)     //loop along 'M' direction
 		{
@@ -49298,16 +49298,16 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform an in-register transpose of b11
+			   to perform the TRSM operation.
 			   */
 			BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b)
 			/*