From 82c2eb4e8efd8b02a09a72e0440a43f589a2486d Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Fri, 16 Dec 2022 06:24:50 -0500 Subject: [PATCH] Code cleanup and warnings fixes Corrections for some occurrences of: - Compiler warnings about initialization of float from double - Spelling mistakes in comments - Incorrect indentation of code and comments AMD-Internal: [CPUPL-2870] Change-Id: Icb68c789687bd0684844331d43071bfffecac9fc --- aocl_dtl/aocldtl.c | 4 +- config/zen3/make_defs.mk | 4 +- config/zen4/make_defs.mk | 4 +- configure | 4 +- frame/3/bli_l3_packm.c | 4 +- frame/compat/bla_gemm_amd.c | 4 +- frame/thread/bli_l3_decor_openmp.c | 4 +- frame/thread/bli_l3_decor_single.c | 4 +- frame/util/bli_util_unb_var1.c | 6 +- kernels/zen/3/bli_gemm_small.c | 22 +- kernels/zen/3/bli_trsm_small.c | 348 ++++++++++++++--------------- 11 files changed, 204 insertions(+), 204 deletions(-) diff --git a/aocl_dtl/aocldtl.c b/aocl_dtl/aocldtl.c index 6e7ee3510..a9b3db178 100644 --- a/aocl_dtl/aocldtl.c +++ b/aocl_dtl/aocldtl.c @@ -5,7 +5,7 @@ * These functions are invoked though macros by * end user. * - * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. * *=======================================================================*/ #include "blis.h" @@ -129,7 +129,7 @@ void DTL_Initialize( #if (AOCL_DTL_LOG_ENABLE || AOCL_DTL_DUMP_ENABLE) - /* Check if DTL logging is requested via envoronment variable */ + /* Check if DTL logging is requested via environment variable */ gbIsLoggingEnabled = bli_env_get_var( "AOCL_VERBOSE", TRUE ); #endif diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk index 8522a1e95..ecbb7889b 100644 --- a/config/zen3/make_defs.mk +++ b/config/zen3/make_defs.mk @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. 
+# Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -96,7 +96,7 @@ ifeq ($(CC_VENDOR),clang) # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) -# For our prupose we just want to know if it version 2x or 3x +# For our purpose we just want to know if it version 2x or 3x # for version 3x we will enable znver3 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index f10f20a48..05975b969 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -4,7 +4,7 @@ # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -112,7 +112,7 @@ ifeq ($(CC_VENDOR),clang) # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0) -# For our prupose we just want to know if it version 2x or 3x or 4x +# For our purpose we just want to know if it version 2x or 3x or 4x # for version 4x we will enable znver4 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1) diff --git a/configure b/configure index 73dc8cc35..36b3e3c85 100755 --- a/configure +++ b/configure @@ -5,7 +5,7 @@ # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -215,7 +215,7 @@ print_usage() echo " " echo " Set the size (in bits) of internal BLIS integers and" echo " integer types used in native BLIS interfaces. The" - echo " default inteter type size is architecture dependent." + echo " default integer type size is architecture dependent." echo " (Hint: You can always find this value printed at the" echo " beginning of the testsuite output.)" echo " " diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 1134bdc1f..5d6914ec5 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -164,7 +164,7 @@ void bli_l3_packm // with the mem_t entry acquired from the memory broker (now cached in // the control tree node). void* buf = bli_mem_buffer( cntl_mem_p ); - bli_obj_set_buffer( buf, x_pack ); + bli_obj_set_buffer( buf, x_pack ); // Pack the contents of object x to object x_pack. diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index 7ae62b06f..3eb67b8d2 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -804,7 +804,7 @@ void zgemm_blis_impl bli_obj_set_conjtrans( blis_transa, &ao ); bli_obj_set_conjtrans( blis_transb, &bo ); - // default instance peformance tuning is done in zgemm. + // default instance performance tuning is done in zgemm. // Single instance tuning is done based on env set. //dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 ); diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index b01c208a3..14b2bd04a 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -145,7 +145,7 @@ void bli_l3_thread_decorator tls_aoclprogress_counter = 0; // We send the update only after certain threshold is reached, - // The thresold is defined as AOCL_PROGRESS_FREQUENCY. + // The threshold is defined as AOCL_PROGRESS_FREQUENCY. // This variable stores the counter value when last update was sent. // It is compared with current counter value to see if it is time to // send the next update. diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index 444583e73..0a4d16c22 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -121,7 +121,7 @@ void bli_l3_thread_decorator tls_aoclprogress_counter = 0; // We send the update only after certain threshold is reached, - // The thresold is defined as AOCL_PROGRESS_FREQUENCY. + // The threshold is defined as AOCL_PROGRESS_FREQUENCY. // This variable stores the counter value when last update was sent. // It is compared with current counter value to see if it is time to // send the next update. diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index e4b0acf04..3c4fd6ea1 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -329,8 +329,8 @@ void bli_cnormfv_unb_var1 } else { - float* zero = bli_d0; - float* one = bli_d1; + float* zero = bli_s0; + float* one = bli_s1; float scale; float sumsq; float sqrt_sumsq; diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index 22bb48f73..069b90170 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -165,25 +165,25 @@ err_t bli_gemm_small #ifndef BLIS_ENABLE_MULTITHREADING // bli_dgemm_small_At is called directly from blas interface for // sizes within thresholds. - // Avoinding calling of bli_dgemm_small_At from gemm_front + // Avoiding calling of bli_dgemm_small_At from gemm_front // and directing to native implementation. return BLIS_NOT_YET_IMPLEMENTED; #else return bli_dgemm_small_At(alpha, a, b, beta, c, cntx, cntl); #endif } - if(dt == BLIS_DCOMPLEX) - { + if(dt == BLIS_DCOMPLEX) + { #ifndef BLIS_ENABLE_MULTITHREADING // bli_zgemm_small_At is called directly from blas interface for // sizes within thresholds. - // Avoinding calling of bli_zgemm_small_At from gemm_front + // Avoiding calling of bli_zgemm_small_At from gemm_front // and directing to native implementation. return BLIS_NOT_YET_IMPLEMENTED; #else - return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl); + return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl); #endif - } + } if (bli_obj_has_notrans( b )) { @@ -1827,7 +1827,7 @@ static err_t bli_sgemm_small double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C double *tA = A, *tB = B, *tC = C;//, *tA_pack; - double *tA_packed; // temprorary pointer to hold packed A memory pointer + double *tA_packed; // temporary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A guint_t col_idx_start; //starting index after A matrix is packed. 
@@ -4341,7 +4341,7 @@ err_t bli_dgemm_small_At double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C double *tA = A, *tB = B, *tC = C;//, *tA_pack; - double *tA_packed; // temprorary pointer to hold packed A memory pointer + double *tA_packed; // temporary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A dim_t tb_inc_row = 1; // row stride of matrix B @@ -5822,7 +5822,7 @@ err_t bli_zgemm_small dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack; - dcomplex *tA_packed; //temprorary pointer to hold packed A memory pointer + dcomplex *tA_packed; //temporary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A guint_t col_idx_start; //starting index after A matrix is packed. @@ -9779,7 +9779,7 @@ err_t bli_zgemm_small_At dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack; - dcomplex *tA_packed; // temprorary pointer to hold packed A memory pointer + dcomplex *tA_packed; // temporary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A dim_t tb_inc_row = 1; // row stride of matrix B diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index f2f94fc1a..12c0ee729 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1439,8 +1439,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform DTRSM operation for left cases. + Add the GEMM output and perform in register transpose of b11 + to perform DTRSM operation for left cases. */ #define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal) \ ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal));\ @@ -4684,8 +4684,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform DTRSM operation for left cases. + Add the GEMM output and perform in register transpose of b11 + to perform DTRSM operation for left cases. */ #define BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal) \ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));\ @@ -5119,7 +5119,7 @@ BLIS_INLINE void bli_dtrsm_small_pack } /* Pack diagonal elements of A block (8 or 6) into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ BLIS_INLINE void dtrsm_small_pack_diag_element @@ -7772,8 +7772,8 @@ BLIS_INLINE err_t ztrsm_AuXB_ref /* * Load b11 of size 3x4 and multiply with alpha - * Add the GEMM output and perform inregister transose of b11 - * to peform ZTRSM operation for left cases. + * Add the GEMM output and perform in register transpose of b11 + * to perform ZTRSM operation for left cases. */ #define BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) {\ ymm16 = _mm256_broadcast_pd(( __m128d const *)(&AlphaVal));\ @@ -8383,7 +8383,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB bool transa = bli_obj_has_trans(a); dim_t cs_a, rs_a; - // Swap rs_a & cs_a in case of non-tranpose. 
+ // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -8480,7 +8480,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB /* Pack 6 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ @@ -8513,8 +8513,8 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_DTRSM_SMALL_GEMM_6nx8m(a01,b10,cs_b,p_lda,k_iter) @@ -8522,7 +8522,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB /* Load b11 of size 8x6 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b) @@ -10785,7 +10785,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -10883,7 +10883,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB /* Pack 6 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. 
store ones when input is unit diagonal */ dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr); @@ -10915,8 +10915,8 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -10925,7 +10925,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB /* Load b11 of size 8x6 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b) @@ -13105,7 +13105,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -13174,7 +13174,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB } /* - Performs solving TRSM for 8 colmns at a time from 0 to m/d_mr in steps of d_mr + Performs solving TRSM for 8 columns at a time from 0 to m/d_mr in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -13196,15 +13196,15 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB /* Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 8x(m-8) which is the maximum GEMM alone block size in A b. 
This packed buffer is reused to calculate all n rows of B matrix */ bli_dtrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr); /* Pack 8 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -13221,7 +13221,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB c. This loop GEMM+TRSM loops operates with 8x6 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ for(j = (n - d_nr); (j + 1) > 0; j -= d_nr) { @@ -13235,16 +13235,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal) @@ -15110,7 +15110,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -15178,7 +15178,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB } /* - Performs solving TRSM for 8 colmns at a time from 0 to m/8 in steps of d_mr + Performs solving TRSM for 8 columns at a time from 0 to m/8 in steps of d_mr a. 
Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-8) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -15194,17 +15194,17 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 8x(m-8) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ bli_dtrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr); /* Pack 8 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -15221,7 +15221,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB c. This loop GEMM+TRSM loops operates with 8x6 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. 
*/ dim_t temp = n - d_nr + 1; for(j = 0; j < temp; j += d_nr) //loop along 'N' dimension @@ -15237,16 +15237,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal) @@ -17148,7 +17148,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB /* Pack diagonal elements of A block (16 or 6) into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ BLIS_INLINE void strsm_small_pack_diag_element @@ -17709,7 +17709,7 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB dim_t cs_a, rs_a; dim_t d_mr = 16,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -17805,7 +17805,7 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB /* Pack 6 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. 
store ones when input is unit diagonal */ strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr); @@ -17837,8 +17837,8 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter) @@ -17846,7 +17846,7 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB /* Load b11 of size 16x6 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b) @@ -21374,7 +21374,7 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB bool transa = bli_obj_has_trans(a); dim_t cs_a, rs_a; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -21472,7 +21472,7 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB /* Pack 6 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ @@ -21505,8 +21505,8 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter) @@ -21514,7 +21514,7 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB /* Load b11 of size 16x6 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. 
*/ BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b) @@ -25217,7 +25217,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB dim_t cs_a, rs_a; dim_t d_mr = 16,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -25283,7 +25283,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB } /* - Performs solving TRSM for 16 colmns at a time from 0 to m/16 in steps of d_mr + Performs solving TRSM for 16 columns at a time from 0 to m/16 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 16x6 to 16x (m-16) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -25299,17 +25299,17 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 16x(m-16) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ bli_strsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr); /* Pack 16 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -25326,7 +25326,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB c. 
This loop GEMM+TRSM loops operates with 16x6 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ dim_t temp = n - d_nr + 1; for(j = 0; j < temp; j += d_nr) //loop along 'N' dimension @@ -25342,16 +25342,16 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal) @@ -25877,8 +25877,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal)); ymm0 = _mm256_broadcast_ss((float const *)(&zero)); @@ -26459,7 +26459,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB a11 += rs_a; - // N-register tranpose and store + // N-register transpose and store ymm0 = _mm256_unpacklo_ps(ymm10, ymm11); ymm1 = _mm256_unpacklo_ps(ymm17, ymm18); @@ -26556,8 +26556,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. 
*/ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal)); ymm0 = _mm256_broadcast_ss((float const *)(&zero)); @@ -26641,8 +26641,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal)); ymm0 = _mm256_broadcast_ss((float const *)(&zero)); @@ -26724,8 +26724,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal)); ymm0 = _mm256_broadcast_ss((float const *)(&zero)); @@ -29583,7 +29583,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB dim_t cs_a, rs_a; dim_t d_mr = 16,d_nr = 6; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -29671,15 +29671,15 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB /* Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 16x(m-16) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ bli_strsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr); /* Pack 8 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. 
This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -29710,16 +29710,16 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 6x16 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal) @@ -31830,8 +31830,8 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal)); @@ -33724,7 +33724,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB dim_t cs_a, rs_a; dim_t d_mr = 4,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -33792,7 +33792,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/4 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM b. 
Using packed a10 block and b01 block perform GEMM operation @@ -33808,19 +33808,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory - D_A_pack + Load, transpose and pack current A block (a10) into packed buffer memory + D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size - in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size + in A b. This packed buffer is reused to calculate all n rows of B matrix */ bli_ztrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr); /* Pack 4 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -33836,7 +33836,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB c. This loop GEMM+TRSM loops operates with 4x3 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. 
*/ dim_t temp = n - d_nr + 1; for(j = 0; j < temp; j += d_nr) //loop along 'N' dimension @@ -33852,16 +33852,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 3x4 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) /* @@ -34957,7 +34957,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB dim_t cs_a, rs_a; dim_t d_mr = 4,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -35026,7 +35026,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/d_mr in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/d_mr in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -35047,18 +35047,18 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB { /* Load, transpose and pack current A block (a10) into packed buffer memory - D_A_pack + D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size - in A + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size + in A b. 
This packed buffer is reused to calculate all n rows of B matrix */ bli_ztrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr); /* Pack 8 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -35075,7 +35075,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB c. This loop GEMM+TRSM loops operates with 8x6 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ for(j = (n - d_nr); (j + 1) > 0; j -= d_nr) { @@ -35089,16 +35089,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 6x8 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) @@ -36181,7 +36181,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB dim_t cs_a, rs_a; dim_t d_mr = 4,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -36272,7 +36272,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /* Pack 3 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM + a. 
This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ @@ -36308,8 +36308,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -36318,7 +36318,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /* Load b11 multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b) @@ -36491,8 +36491,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB * accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -36501,7 +36501,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /* Load b11 multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_ZTRSM_SMALL_3x3(AlphaVal,b11,cs_b) @@ -36684,8 +36684,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB * accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -36694,7 +36694,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /* Load b11 of size 8x6 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. 
*/ BLIS_PRE_ZTRSM_SMALL_3x2(AlphaVal,b11,cs_b) @@ -36832,8 +36832,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB * accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -36842,7 +36842,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /* Load b11 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_ZTRSM_SMALL_3x1(AlphaVal,b11,cs_b) @@ -37166,8 +37166,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /*Fill zeros into ymm registers used in gemm accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ //BLIS_ZTRSM_SMALL_GEMM_3nx3m(a01,b10,cs_b,p_lda,k_iter) @@ -37255,8 +37255,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /*Fill zeros into ymm registers used in gemm accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_ZTRSM_SMALL_GEMM_2nx2m(a01,b10,cs_b,p_lda,k_iter) @@ -37326,8 +37326,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB /*Fill zeros into ymm registers used in gemm accumulations */ BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_ZTRSM_SMALL_GEMM_2nx1m(a01,b10,cs_b,p_lda,k_iter) @@ -37644,7 +37644,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB dim_t cs_a, rs_a; dim_t 
d_mr = 4,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -37734,7 +37734,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB /* Pack 3 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ @@ -37769,8 +37769,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB BLIS_SET_YMM_REG_ZEROS /* - Peform GEMM between a01 and b10 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a01 and b10 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ @@ -37779,7 +37779,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB /* Load b11 of size 4x3 and multiply with alpha Add the GEMM output to b11 - and peform TRSM operation. + and perform TRSM operation. */ BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b) @@ -42231,7 +42231,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -42308,7 +42308,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/4 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -42324,11 +42324,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. 
This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ @@ -42336,7 +42336,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB /* Pack 4 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -42352,7 +42352,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB c. This loop GEMM+TRSM loops operates with 4x3 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ dim_t temp = n - d_nr + 1; for(j = 0; j < temp; j += d_nr) //loop along 'N' dimension @@ -42368,16 +42368,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 3x4 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. 
*/ BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal) /* @@ -44763,7 +44763,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -44840,7 +44840,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/4 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -44858,11 +44858,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ @@ -44870,7 +44870,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB /* Pack 4 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr); @@ -44886,7 +44886,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB c. 
This loop GEMM+TRSM loops operates with 4x3 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ for(j = (n - d_nr); (j + 1) > 0; j -= d_nr) //loop along 'N' dimension @@ -44902,16 +44902,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter) /* Load b11 of size 3x4 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal) /* @@ -47544,7 +47544,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -47618,7 +47618,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/4 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -47634,11 +47634,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. 
This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ @@ -47646,7 +47646,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB /* Pack 4 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr); @@ -47662,7 +47662,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB c. This loop GEMM+TRSM loops operates with 4x3 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ for(i = (m-d_mr); (i+1) > 0; i -= d_mr) //loop along 'M' direction { @@ -47678,16 +47678,16 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter) /* Load b11 of size 3x4 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. 
*/ BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b) /* @@ -49164,7 +49164,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB dim_t cs_a, rs_a; dim_t d_mr = 8,d_nr = 3; - // Swap rs_a & cs_a in case of non-tranpose. + // Swap rs_a & cs_a in case of non-transpose. if(transa) { cs_a = bli_obj_col_stride(a); // column stride of A @@ -49237,7 +49237,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB } /* - Performs solving TRSM for 4 colmns at a time from 0 to m/4 in steps of d_mr + Performs solving TRSM for 4 columns at a time from 0 to m/4 in steps of d_mr a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4) First there will be no GEMM and no packing of a10 because it is only TRSM b. Using packed a10 block and b01 block perform GEMM operation @@ -49254,11 +49254,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB if(transa) { /* - Load, tranpose and pack current A block (a10) into packed buffer memory + Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack a. This a10 block is used in GEMM portion only and this - a10 block size will be increasing by d_mr for every next itteration - untill it reaches 4x(m-4) which is the maximum GEMM alone block size + a10 block size will be increasing by d_mr for every next iteration + until it reaches 4x(m-4) which is the maximum GEMM alone block size in A b. This packed buffer is reused to calculate all n rows of B matrix */ @@ -49266,7 +49266,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB /* Pack 4 diagonal elements of A block into an array - a. This helps in utilze cache line efficiently in TRSM operation + a. This helps to utilize cache line efficiently in TRSM operation b. store ones when input is unit diagonal */ ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr); @@ -49282,7 +49282,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB c. 
This loop GEMM+TRSM loops operates with 4x3 block size along n dimension for every d_nr rows of b01 where packed A buffer is reused in computing all n rows of B. - d. Same approch is used in remaining fringe cases. + d. Same approach is used in remaining fringe cases. */ for(i = 0; (i+d_mr-1) < m; i += d_mr) //loop along 'M' direction { @@ -49298,16 +49298,16 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB BLIS_SET_S_YMM_REG_ZEROS /* - Peform GEMM between a10 and b01 blocks - For first itteration there will be no GEMM operation + Perform GEMM between a10 and b01 blocks + For first iteration there will be no GEMM operation where k_iter are zero */ BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter) /* Load b11 of size 3x4 and multiply with alpha - Add the GEMM output and perform inregister transose of b11 - to peform TRSM operation. + Add the GEMM output and perform in register transpose of b11 + to perform TRSM operation. */ BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b) /*