Code cleanup: spelling corrections

Corrections for spelling and other mistakes in code comments and doc files. AMD-Internal: [CPUPL-2870] Change-Id: Ifbb5df7df2d6312fe73e06ee6d41c00b16c593ce
2026-04-20 07:38:53 +00:00 · 2023-03-31 11:03:43 -04:00
parent 99d10c3f88
commit 6835205ba8
50 changed files with 173 additions and 166 deletions
--- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
 	}

 	// Extra space since packing does width in multiples of 16. The bf16
-	// instruction can be used as long as atleast one zmm register can be fully
-	// loaded; and since k_dim needs to be atleast 2, having n_dim atleast 16
+	// instruction can be used as long as at least one zmm register can be fully
+	// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
 	// should give 2x16=32 elements, enough for 1 zmm register.The padding is
 	// not rounded to NR (=64), since that would result in memory wastage.
 	dim_t n_reorder = make_multiple_of_n( n, 16 );
--- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16)
 	}

 	// Extra space since packing does width in multiples of 16. The vpmaddubsw
-	// instruction can be used as long as atleast one ymm register can be fully
-	// loaded; and since k_dim needs to be at least 2, having n_dim atleast 16
+	// instruction can be used as long as at least one ymm register can be fully
+	// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
 	// should give 2x16=32 elements, enough for 1 ymm register.The padding is
 	// not rounded to NR (=16), since that would result in memory wastage.
 	dim_t n_reorder = make_multiple_of_n(n, 16);
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32)
 	}

 	// Extra space since packing does width in multiples of 16. The vnni
-	// instruction can be used as long as atleast one zmm register can be fully
-	// loaded; and since k_dim needs to be atleast 4, having n_dim atleast 16
+	// instruction can be used as long as at least one zmm register can be fully
+	// loaded; and since k_dim needs to be at least 4, having n_dim at least 16
 	// should give 4x16=64 elements, enough for 1 zmm register.The padding is
 	// not rounded to NR (=64), since that would result in memory wastage.
 	dim_t n_reorder = make_multiple_of_n( n, 16 );
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -1891,7 +1891,7 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s

 ### Operation implementation type query

-The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implemenation query](BLISTypedAPI.md#microkernel-implementation-type-query).
+The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implementation query](BLISTypedAPI.md#microkernel-implementation-type-query).
 ```c
 char* bli_info_get_gemm_impl_string( num_t dt );
 char* bli_info_get_hemm_impl_string( num_t dt );
--- a/frame/2/gemv/bli_gemv_unf_var1_amd.c
+++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c
@@ -143,7 +143,7 @@ void bli_dgemv_unf_var1

    conja = bli_extract_conj(transa);

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -460,7 +460,7 @@ void bli_sgemv_unf_var1

    conja = bli_extract_conj( transa );

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
--- a/frame/2/gemv/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c
@@ -177,7 +177,7 @@ void bli_dgemv_unf_var2

    conja = bli_extract_conj( transa );

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -247,7 +247,7 @@ void bli_dgemv_unf_var2

    /* If beta is zero, use setv. Otherwise, scale by beta. */
        /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */

    bli_dscalv_zen_int10
    (
@@ -448,7 +448,7 @@ void bli_sgemv_unf_var2

    conja = bli_extract_conj( transa );

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -516,7 +516,7 @@ void bli_sgemv_unf_var2

    /* If beta is zero, use setv. Otherwise, scale by beta. */
        /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */
    bli_sscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
@@ -835,7 +835,7 @@ void bli_cgemv_unf_var2

    /* If beta is zero, use setv. Otherwise, scale by beta. */
        /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */
    /*bli_cscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
@@ -846,7 +846,7 @@ void bli_cgemv_unf_var2
      cntx
    );*/

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
--- a/frame/2/hemv/bli_hemv_unf_var1_amd.c
+++ b/frame/2/hemv/bli_hemv_unf_var1_amd.c
@@ -316,7 +316,7 @@ void bli_dhemv_unf_var1
 	 * factor. */
 	/* Assign kernel function pointer and fusing factor. */

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
--- a/frame/2/hemv/bli_hemv_unf_var3_amd.c
+++ b/frame/2/hemv/bli_hemv_unf_var3_amd.c
@@ -312,7 +312,7 @@ void bli_dhemv_unf_var3

 	PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
--- a/frame/2/trsv/bli_trsv_unf_var1_amd.c
+++ b/frame/2/trsv/bli_trsv_unf_var1_amd.c
@@ -295,7 +295,7 @@ void bli_dtrsv_unf_var1

    PASTECH(d,dotxf_ker_ft) kfp_df;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_df = bli_ddotxf_zen_int_8;
@@ -496,7 +496,7 @@ void bli_strsv_unf_var1

    PASTECH(s,dotxf_ker_ft) kfp_df;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_df = bli_sdotxf_zen_int_8;
--- a/frame/2/trsv/bli_trsv_unf_var2_amd.c
+++ b/frame/2/trsv/bli_trsv_unf_var2_amd.c
@@ -297,7 +297,7 @@ void bli_dtrsv_unf_var2

    PASTECH(d,axpyf_ker_ft) kfp_af;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_daxpyf_zen_int_16x4;
@@ -496,7 +496,7 @@ void bli_strsv_unf_var2

    PASTECH(s, axpyf_ker_ft) kfp_af;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_saxpyf_zen_int_5;
@@ -695,7 +695,7 @@ void bli_ztrsv_unf_var2

    PASTECH(z, axpyf_ker_ft) kfp_af;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_zaxpyf_zen_int_5;
@@ -893,7 +893,7 @@ void bli_ctrsv_unf_var2

    PASTECH(c, axpyf_ker_ft) kfp_af;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_caxpyf_zen_int_5;
--- a/frame/3/bli_l3_smart_threading.c
+++ b/frame/3/bli_l3_smart_threading.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -149,7 +149,7 @@ err_t bli_gemm_smart_threading_sup
 {
 	err_t ret_val = BLIS_FAILURE;

-	// Sanity check, max available threads should be atleast 4 for the
+	// Sanity check, max available threads should be at least 4 for the
 	// smart threading/factorization to be meaningful. For nt < 4 the
 	// default ic,jc factorization holds good.
 	if ( ( m <= 1 ) || ( n <= 1 ) ||  ( k <= 1 ) || ( max_available_nt < 4 ) )
--- a/frame/3/gemm/bli_gemm_md.c
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -803,7 +803,7 @@ void bli_gemm_md_zgemm
 	}

 	{
-		// A sort of hack for communicating the desired pach schemas for A and B
+		// A sort of hack for communicating the desired pack schemas for A and B
 		// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 		// bli_l3_cntl_create_if()). This allows us to access the schemas from
 		// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -150,7 +151,7 @@ void bli_hemm_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/her2k/bli_her2k_front.c
+++ b/frame/3/her2k/bli_her2k_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -119,7 +120,7 @@ void bli_her2k_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/herk/bli_herk_front.c
+++ b/frame/3/herk/bli_herk_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -99,7 +100,7 @@ void bli_herk_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -149,7 +150,7 @@ void bli_symm_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/syr2k/bli_syr2k_front.c
+++ b/frame/3/syr2k/bli_syr2k_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -100,7 +101,7 @@ void bli_syr2k_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/syrk/bli_syrk_front.c
+++ b/frame/3/syrk/bli_syrk_front.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -94,7 +94,7 @@ void bli_syrk_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -168,7 +168,7 @@ void bli_trmm_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/trmm/bli_trmm_front_amd.c
+++ b/frame/3/trmm/bli_trmm_front_amd.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -177,7 +177,7 @@ void bli_trmm_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -160,7 +161,7 @@ void bli_trmm3_front
 	  rntm
 	);

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -154,7 +154,7 @@ void bli_trsm_front
 	// not impact the global cntx object.
 	cntx_t cntx_trsm = *cntx;

-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
@@ -165,11 +165,11 @@ void bli_trsm_front
 		/* Zen4 TRSM Fixme:
 		 *
 		 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels 
-		 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+		 * for TRSM (Till we implement TRSM AVX-512 kernels)
 		 * 
 		 * The AVX2 kernels use different block sizes then AVX512 kernels
 		 * Here we override the default block sizes in the context with AVX2 
-		 * specific block size used in  GEMMTRSM kernerls.
+		 * specific block size used in GEMMTRSM kernels.
 		 * 
 		 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 		 */
@@ -182,7 +182,7 @@ void bli_trsm_front
 		bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
 		bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
 	}
-	else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
+	else // if ( bli_cntx_method( &cntx_trsm ) != BLIS_NAT )
 	{
 		pack_t schema_a = bli_cntx_schema_a_block( &cntx_trsm );
 		pack_t schema_b = bli_cntx_schema_b_panel( &cntx_trsm );
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -393,7 +393,7 @@ void bli_gks_register_cntx

 	// At this point, we know the pointer to the array of cntx_t* is NULL and
 	// needs to be allocated. Allocate the memory and initialize it to
-	// zeros/NULL, storing the address of the alloacted memory at the element
+	// zeros/NULL, storing the address of the allocated memory at the element
 	// for the current architecture id.
 	gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS );

--- a/frame/compat/bla_amax_amd.c
+++ b/frame/compat/bla_amax_amd.c
@@ -167,7 +167,7 @@ f77_int isamax_blis_impl
        incx0 = ( inc_t )(*incx);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
--- a/frame/compat/bla_axpy_amd.c
+++ b/frame/compat/bla_axpy_amd.c
@@ -499,7 +499,7 @@ void caxpy_blis_impl
      incy0 = ( inc_t )(*incy);
    }

-  // This function is invoked on all architectures including ‘generic’.
+  // This function is invoked on all architectures including 'generic'.
  // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
  if (bli_cpuid_is_avx2fma3_supported() == TRUE)
  {
@@ -603,7 +603,7 @@ void zaxpy_blis_impl
      incy0 = ( inc_t )(*incy);
    }

-  // This function is invoked on all architectures including ‘generic’.
+  // This function is invoked on all architectures including 'generic'.
  // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
  if (bli_cpuid_is_avx2fma3_supported() == TRUE)
  {
--- a/frame/compat/bla_copy_amd.c
+++ b/frame/compat/bla_copy_amd.c
@@ -162,7 +162,7 @@ void scopy_blis_impl
 		incy0 = (inc_t)(*incy);
 	}

-	// This function is invoked on all architectures including ‘generic’.
+	// This function is invoked on all architectures including 'generic'.
 	// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
 	if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
@@ -267,7 +267,7 @@ void dcopy_blis_impl
 		incy0 = (inc_t)(*incy);
 	}

-	// This function is invoked on all architectures including ‘generic’.
+	// This function is invoked on all architectures including 'generic'.
 	// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
 	if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
--- a/frame/compat/bla_dot_amd.c
+++ b/frame/compat/bla_dot_amd.c
@@ -584,7 +584,7 @@ scomplex cdotu_blis_impl
        incy0 = ( inc_t )(*incy);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -696,7 +696,7 @@ dcomplex zdotu_blis_impl
        incy0 = ( inc_t )(*incy);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -810,7 +810,7 @@ scomplex cdotc_blis_impl
        incy0 = ( inc_t )(*incy);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -922,7 +922,7 @@ dcomplex zdotc_blis_impl
        incy0 = ( inc_t )(*incy);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
--- a/frame/compat/bla_gemm_amd.c
+++ b/frame/compat/bla_gemm_amd.c
@@ -512,7 +512,7 @@ void dgemm_blis_impl
    const inc_t rs_c = 1;
    const inc_t cs_c = *ldc;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -681,7 +681,7 @@ void dgemm_blis_impl
    bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.

 #ifdef AOCL_DYNAMIC
-    //For smaller sizes dgemm_small is perfoming better
+    //For smaller sizes dgemm_small is performing better
    if (is_parallel && (((m0 >32) || (n0>32) || (k0>32)) && ((m0+n0+k0)>150)) )
 #else
    if (is_parallel)
--- a/frame/compat/bla_gemv_amd.c
+++ b/frame/compat/bla_gemv_amd.c
@@ -283,7 +283,7 @@ void dgemv_blis_impl
    rs_a = 1;
    cs_a = *lda;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -482,7 +482,7 @@ void sgemv_blis_impl
    rs_a = 1;
    cs_a = *lda;

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
--- a/frame/compat/bla_swap_amd.c
+++ b/frame/compat/bla_swap_amd.c
@@ -155,7 +155,7 @@ void sswap_blis_impl
        incy0 = ( inc_t )(*incy);
    }

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
        /* Call BLIS kernel */
@@ -255,7 +255,7 @@ void dswap_blis_impl
    }


-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    bli_dswapv_zen_int8
--- a/frame/compat/bla_trsm_amd.c
+++ b/frame/compat/bla_trsm_amd.c
@@ -706,7 +706,7 @@ void strsm_blis_impl
    bli_obj_set_struc( struca, &ao );

 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -1014,7 +1014,7 @@ void dtrsm_blis_impl

 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM

-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -1449,7 +1449,7 @@ void ztrsm_blis_impl
    bli_obj_set_struc( struca, &ao );

 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
@@ -1817,7 +1817,7 @@ void ctrsm_blis_impl
    bli_obj_set_struc( struca, &ao );

 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
    {
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin	

   Redistribution and use in source and binary forms, with or without
@@ -255,8 +255,8 @@ void bli_samaxv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// later, especially if BLIS is compiled with -mfpmath=sse).
+	// transitioning from AVX to SSE instructions (which may occur later,
+	// especially if BLIS is compiled with -mfpmath=sse).
 	_mm256_zeroupper();

 	/* Store final index to output variable. */
@@ -743,8 +743,8 @@ static void bli_vec_search_double
 		/*
 			Issue vzeroupper instruction to clear upper lanes of ymm registers.
 			This avoids a performance penalty caused by false dependencies when
-			transitioning from from AVX to SSE instructions (which may occur
-			as soon as the n_left cleanup loop below if BLIS is compiled with
+			transitioning from AVX to SSE instructions (which may occur as soon
+			as the n_left cleanup loop below if BLIS is compiled with
 			-mfpmath=sse).
 		*/
 		_mm256_zeroupper();
--- a/kernels/zen/1/bli_axpbyv_zen_int.c
+++ b/kernels/zen/1/bli_axpbyv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -150,8 +150,8 @@ void bli_saxpbyv_zen_int
 		
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -282,8 +282,8 @@ void bli_daxpbyv_zen_int

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -634,8 +634,8 @@ void bli_caxpbyv_zen_int

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -1063,8 +1063,8 @@ void bli_zaxpbyv_zen_int

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -1146,4 +1146,4 @@ void bli_zaxpbyv_zen_int
 		}
 	}
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
+}
--- a/kernels/zen/1/bli_axpbyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpbyv_zen_int10.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -374,8 +374,8 @@ void bli_saxpbyv_zen_int10

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -680,8 +680,8 @@ void bli_daxpbyv_zen_int10

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();

@@ -706,4 +706,4 @@ void bli_daxpbyv_zen_int10
 		}
 	}
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
+}
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -138,8 +138,8 @@ void bli_saxpyv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

@@ -242,8 +242,8 @@ void bli_daxpyv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -307,9 +307,10 @@ void bli_saxpyv_zen_int10

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
        // -mfpmath=sse).
+
        _mm256_zeroupper();

        for ( ; (i + 0) < n; i += 1 )
@@ -583,8 +584,8 @@ void bli_daxpyv_zen_int10

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
        // -mfpmath=sse).
        _mm256_zeroupper();

@@ -638,8 +639,8 @@ void bli_caxpyv_zen_int5
    float alphaR, alphaI;

    //scomplex alpha => aR + aI i
-    __m256           alphaRv;            // for braodcast vector aR (real part of alpha)
-    __m256           alphaIv;            // for braodcast vector aI (imaginary part of alpha)
+    __m256           alphaRv;            // for broadcast vector aR (real part of alpha)
+    __m256           alphaIv;            // for broadcast vector aI (imaginary part of alpha)
    __m256           xv[10];
    __m256           xShufv[10];
    __m256           yv[10];
@@ -837,8 +838,8 @@ void bli_caxpyv_zen_int5

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
        // -mfpmath=sse).
        _mm256_zeroupper();

@@ -947,8 +948,8 @@ void bli_zaxpyv_zen_int5
    {
        const dim_t n_elem_per_reg = 4;

-        __m256d alphaRv; // for braodcast vector aR (real part of alpha)
-        __m256d alphaIv; // for braodcast vector aI (imaginary part of alpha)
+        __m256d alphaRv; // for broadcast vector aR (real part of alpha)
+        __m256d alphaIv; // for broadcast vector aI (imaginary part of alpha)
        __m256d xv[7]; // Holds the X vector elements
        __m256d xShufv[5]; // Holds the permuted X vector elements
        __m256d yv[7]; // Holds the y vector elements
@@ -1258,8 +1259,8 @@ void bli_zaxpyv_zen_int5

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
        // -mfpmath=sse).
        _mm256_zeroupper();
    }
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -460,8 +460,8 @@ void bli_zcopyv_zen_int

 			// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 			// This avoids a performance penalty caused by false dependencies when
-			// transitioning from from AVX to SSE instructions (which may occur
-			// as soon as the n_left cleanup loop below if BLIS is compiled with
+			// transitioning from AVX to SSE instructions (which may occur as soon
+			// as the n_left cleanup loop below if BLIS is compiled with
 			// -mfpmath=sse).
 			_mm256_zeroupper();
 		}
@@ -602,8 +602,8 @@ void bli_zcopyv_zen_int

 			// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 			// This avoids a performance penalty caused by false dependencies when
-			// transitioning from from AVX to SSE instructions (which may occur
-			// as soon as the n_left cleanup loop below if BLIS is compiled with
+			// transitioning from AVX to SSE instructions (which may occur as soon
+			// as the n_left cleanup loop below if BLIS is compiled with
 			// -mfpmath=sse).
 			_mm256_zeroupper();
 		}
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -153,8 +153,8 @@ void bli_sdotv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

@@ -274,8 +274,8 @@ void bli_ddotv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -222,8 +222,8 @@ void bli_sdotv_zen_int10

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// later, especially if BLIS is compiled with -mfpmath=sse).
+		// transitioning from AVX to SSE instructions (which may occur later,
+		// especially if BLIS is compiled with -mfpmath=sse).
 		_mm256_zeroupper();
 	}
 	else
@@ -434,8 +434,8 @@ void bli_ddotv_zen_int10

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// later, especially if BLIS is compiled with -mfpmath=sse).
+		// transitioning from AVX to SSE instructions (which may occur later,
+		// especially if BLIS is compiled with -mfpmath=sse).
 		_mm256_zeroupper();
 	}
 	else
@@ -711,8 +711,8 @@ void bli_cdotv_zen_int5
        }
        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // later, especially if BLIS is compiled with -mfpmath=sse).
+        // transitioning from AVX to SSE instructions (which may occur later,
+        // especially if BLIS is compiled with -mfpmath=sse).
        _mm256_zeroupper();
    }
    else
@@ -1000,8 +1000,8 @@ void bli_zdotv_zen_int5

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // later, especially if BLIS is compiled with -mfpmath=sse).
+        // transitioning from AVX to SSE instructions (which may occur later,
+        // especially if BLIS is compiled with -mfpmath=sse).
        _mm256_zeroupper();
    }
    else
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -181,8 +181,8 @@ void bli_sdotxv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

@@ -311,8 +311,8 @@ void bli_ddotxv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

@@ -551,8 +551,8 @@ void bli_zdotxv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

@@ -800,8 +800,8 @@ void bli_cdotxv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();

--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -791,8 +791,8 @@ void bli_zdscalv_zen_int10

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 	}
@@ -968,8 +968,8 @@ void bli_zscalv_zen_int

 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// later, especially if BLIS is compiled with -mfpmath=sse).
+	// transitioning from AVX to SSE instructions (which may occur later,
+	// especially if BLIS is compiled with -mfpmath=sse).
 	_mm256_zeroupper();

 	/* In double complex data type the computation of
--- a/kernels/zen/1f/bli_axpy2v_zen_int.c
+++ b/kernels/zen/1f/bli_axpy2v_zen_int.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2018, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -539,8 +539,8 @@ void bli_zaxpy2v_zen_int

        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
        // -mfpmath=sse).
        _mm256_zeroupper();

@@ -718,4 +718,4 @@ void bli_zaxpy2v_zen_int
        }
    }
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
+}
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -717,8 +717,8 @@ void bli_zaxpyf_zen_int_4

    // Issue vzeroupper instruction to clear upper lanes of ymm registers.
    // This avoids a performance penalty caused by false dependencies when
-    // transitioning from from AVX to SSE instructions (which may occur
-    // later, especially if BLIS is compiled with -mfpmath=sse).
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
    _mm256_zeroupper();

    __m128d a_vec[4], y_vec, inter[2];
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -104,7 +104,7 @@ err_t bli_gemm_small
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
    return BLIS_NOT_YET_IMPLEMENTED;
 #else
-    // This function is invoked on all architectures including ‘generic’.
+    // This function is invoked on all architectures including 'generic'.
    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
    {
@@ -360,8 +360,8 @@ static err_t bli_sgemm_small

            // This is the part of the pack and compute optimization.
            // During the first column iteration, we store the accessed A matrix into
-            // contiguous static memory. This helps to keep te A matrix in Cache and
-            // aviods the TLB misses.
+            // contiguous static memory. This helps keep the A matrix in cache and
+            // avoids TLB misses.
            if (required_packing_A)
            {
                col_idx = 0;
@@ -1748,7 +1748,7 @@ static err_t bli_sgemm_small
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
            );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -1776,7 +1776,7 @@ err_t bli_dgemm_small
    gint_t K = bli_obj_width( a );  // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
    gint_t L = M * N;

-    /* if (N<3) //Implemenation assumes that N is atleast 3. VK */
+    /* if (N<3) //Implementation assumes that N is at least 3. VK */
    /*  { */
    /*      AOCL_DTL_TRACE_EXIT_ERR( */
    /*          AOCL_DTL_LEVEL_INFO, */
@@ -1908,8 +1908,8 @@ err_t bli_dgemm_small

            // This is the part of the pack and compute optimization.
            // During the first column iteration, we store the accessed A matrix into
-            // contiguous static memory. This helps to keep te A matrix in Cache and
-            // aviods the TLB misses.
+            // contiguous static memory. This helps keep the A matrix in cache and
+            // avoids TLB misses.
            if (required_packing_A)
            {
                col_idx = 0;
@@ -3339,7 +3339,7 @@ err_t bli_dgemm_small
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
            );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -3816,7 +3816,7 @@ static err_t bli_sgemm_small_atbn
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
            );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -4256,7 +4256,7 @@ static err_t bli_dgemm_small_atbn
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
            );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -4284,7 +4284,7 @@ err_t bli_dgemm_small_At
    gint_t K = bli_obj_width_after_trans( a );  // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .


-    if (N<3) //Implemenation assumes that N is atleast 3.
+    if (N<3) //Implementation assumes that N is at least 3.
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
@@ -5718,7 +5718,7 @@ err_t bli_dgemm_small_At
    {
        AOCL_DTL_TRACE_EXIT_ERR(
            AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for dgemm_small_At."
+            "Invalid dimensions for dgemm_small_At."
            );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -5907,8 +5907,8 @@ err_t bli_zgemm_small
            /**
             * This is the part of the pack and compute optimization.
             * During the first column iteration, we store the accessed A
-             * matrix into contiguous static memory. This helps to keep te A
-             * matrix in Cache and aviods the TLB misses.
+             * matrix into contiguous static memory. This helps keep the A
+             * matrix in cache and avoids TLB misses.
             */
            if (required_packing_A)
            {
@@ -9704,7 +9704,7 @@ err_t bli_zgemm_small
    {
        AOCL_DTL_TRACE_EXIT_ERR(
                AOCL_DTL_LEVEL_INFO,
-                "Invalid dimesions for small gemm."
+                "Invalid dimensions for small gemm."
                );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
@@ -9733,7 +9733,7 @@ err_t bli_zgemm_small_At
    gint_t N = bli_obj_width( c );  // number of columns of Matrix C
    gint_t K = bli_obj_width_after_trans( a );  // number of columns of OP(A)

-    if (N<3) //Implemenation assumes that N is atleast 3.
+    if (N<3) //Implementation assumes that N is at least 3.
    {
        AOCL_DTL_TRACE_EXIT_ERR(
                AOCL_DTL_LEVEL_INFO,
@@ -13406,7 +13406,7 @@ err_t bli_zgemm_small_At
    {
        AOCL_DTL_TRACE_EXIT_ERR(
                AOCL_DTL_LEVEL_INFO,
-                "Invalid dimesions for dgemm_small_At."
+                "Invalid dimensions for dgemm_small_At."
                );
        return BLIS_NONCONFORMAL_DIMENSIONS;
    }
--- a/kernels/zen4/1/bli_amaxv_zen_int_avx512.c
+++ b/kernels/zen4/1/bli_amaxv_zen_int_avx512.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -627,8 +627,8 @@ void bli_samaxv_zen_int_avx512(

    // Issue vzeroupper instruction to clear upper lanes of ymm registers.
    // This avoids a performance penalty caused by false dependencies when
-    // transitioning from from AVX to SSE instructions (which may occur
-    // later, especially if BLIS is compiled with -mfpmath=sse).
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
    _mm256_zeroupper();

    /* Store final index to output variable. */
@@ -959,8 +959,8 @@ void bli_damaxv_zen_int_avx512(

    // Issue vzeroupper instruction to clear upper lanes of ymm registers.
    // This avoids a performance penalty caused by false dependencies when
-    // transitioning from from AVX to SSE instructions (which may occur
-    // later, especially if BLIS is compiled with -mfpmath=sse).
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
    _mm256_zeroupper();

    // Return value
--- a/sandbox/ref99/old/blx_gemm_front.c
+++ b/sandbox/ref99/old/blx_gemm_front.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -98,7 +98,7 @@ void blx_gemm_front
 	}

 	{
-		// A sort of hack for communicating the desired pach schemas for A and
+		// A sort of hack for communicating the desired pack schemas for A and
 		// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 		// bli_l3_cntl_create_if()). This allows us to access the schemas from
 		// the control tree, which hopefully reduces some confusion,