Performance Improvement for ztrsm small sizes

Details:
- Optimization of ztrsm for Non-unit Diag Variants.
- Handled Overflow and Underflow Vulnerabilities in
  ztrsm small implementations.
- Fixed failures observed in libflame testing.
- Fine-tuned ztrsm small implementations for specific
  sizes 64 <= m, n <= 256, by keeping the number of
  threads at the optimum value, under the AOCL_DYNAMIC flag.
- For small sizes, ztrsm small implementation is
  used for all variants.

AMD-Internal: [SWLCSG-1194]
Change-Id: I066491bb03e5cda390cb699182af4350ae60be2d
This commit is contained in:
Sireesha Sanga
2022-04-06 00:53:27 +05:30
committed by Dipal M Zambare
parent fe7f0a9085
commit cc3069fb5e
3 changed files with 67 additions and 102 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -639,6 +639,14 @@ void bli_nthreads_optimum(
if(m<=512 && n<=512)
n_threads_ideal = 4;
}
else if( family == BLIS_TRSM && bli_obj_is_dcomplex(c))
{
dim_t m = bli_obj_length(c);
dim_t n = bli_obj_width(c);
if((m>=64) && (m<=256) && (n>=64) && (n<=256))
n_threads_ideal = 8;
}
else if( family == BLIS_GEMMT && bli_obj_is_double(c) )
{
dim_t n = bli_obj_length(c);

View File

@@ -1184,7 +1184,6 @@ void ztrsm_
* is doing better than native multithread */
bool nt = bli_thread_get_is_parallel();
if((blis_side == BLIS_RIGHT) || (blis_diaga == BLIS_UNIT_DIAG)) {
if(((nt==0) && (m0<=500) && (n0<=500)) ||
(nt && ((m0+n0)<128)))
{
@@ -1206,7 +1205,6 @@ void ztrsm_
return;
}
}
}
#endif
bli_trsmnat

View File

@@ -5771,68 +5771,58 @@ BLIS_INLINE err_t ztrsm_AuXB_ref
 * Performs dcomplex division of vec1 and vec2 with ymm1.
 * vec1 and vec2 get divided by ymm1, which holds a
 * diagonal element from the buffer.
 * Using bli_zinvscals() to avoid overflow and underflow
 * scenarios. Function gets called while performing TRSM.
 */
/* Divides vec1 and vec2 (two dcomplex elements each) by the diagonal
 * element held in ymm1. The division is done scalar-wise through
 * bli_zinvscals(), which guards against intermediate overflow and
 * underflow (unlike the plain multiply/hsub/div sequence it replaces,
 * which squared the diagonal magnitude in registers). */
#define BLIS_ZTRSM_TWO_DIV(vec1, vec2) {\
	if(!is_unitdiag) {\
		if(conjtransa){\
			/* conjugate the diagonal element: negate its imaginary
			 * part using the (1, -1, 1, -1) mask in ymm0 */\
			ymm1 = _mm256_mul_pd(ymm1, ymm0);\
		}\
		dcomplex b_data[4];\
		dcomplex d11_data[2];\
\
		/* spill the two numerator vectors and the diagonal element
		 * to memory so the scalar division macro can operate on them */\
		_mm256_storeu_pd((double *)(b_data), vec1);\
		_mm256_storeu_pd((double *)(b_data + 2), vec2);\
		_mm256_storeu_pd((double *)(d11_data), ymm1);\
\
		/* robust complex division: b_data[i] /= d11_data[0] */\
		for(dim_t i = 0; i < 4; i++)\
		{\
			bli_zinvscals(d11_data[0],b_data[i]);\
		}\
\
		/* reload the quotients back into the vector registers */\
		vec1 = _mm256_loadu_pd((double *)b_data);\
		vec2 = _mm256_loadu_pd((double *)(b_data+2));\
\
	}\
}
/**
 * Performs dcomplex division of vec1 with ymm1.
 * ymm1 holds a diagonal element from the buffer.
 * Using bli_zinvscals() to avoid overflow and underflow
 * scenarios. Function gets called while performing TRSM.
 */
/* Divides vec1 (two dcomplex elements) by the diagonal element held in
 * ymm1, via bli_zinvscals() so that the division does not overflow or
 * underflow on extreme operands (the previous multiply/hsub/div code
 * squared the diagonal magnitude and could do both). */
#define BLIS_ZTRSM_DIV(vec1) {\
	if(!is_unitdiag){\
		if(conjtransa){\
			/* conjugate the diagonal element: negate its imaginary
			 * part using the (1, -1, 1, -1) mask in ymm0 */\
			ymm1 = _mm256_mul_pd(ymm1, ymm0);\
		}\
		dcomplex b_data[2];\
		dcomplex d11_data[2];\
\
		/* spill the numerator vector and the diagonal element so the
		 * scalar division macro can operate on them */\
		_mm256_storeu_pd((double *)(b_data), vec1);\
		_mm256_storeu_pd((double *)(d11_data), ymm1);\
\
		/* robust complex division: b_data[i] /= d11_data[0] */\
		for(dim_t i = 0; i < 2; i++)\
		{\
			bli_zinvscals(d11_data[0],b_data[i]);\
		}\
\
		/* reload the quotient back into the vector register */\
		vec1 = _mm256_loadu_pd((double *)b_data);\
\
	}\
}
@@ -6007,7 +5997,6 @@ BLIS_INLINE void bli_ztrsm_small_pack
}
BLIS_INLINE void ztrsm_small_pack_diag_element
(
bool is_unitdiag,
@@ -6018,64 +6007,31 @@ BLIS_INLINE void ztrsm_small_pack_diag_element
)
{
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
ymm7 = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
#else
__m256d ymm1, ymm2, ymm3;
#endif
bool is_four = (size == 4) ? 1 : 0;
dcomplex ones = {1.0, 1.0};
ymm2 = ymm1 = _mm256_broadcast_pd((__m128d const *)&ones);
if(!is_unitdiag)
// If Preinversion is enabled, invert the diagonal
// elements from A and pack them into the diagonal buffer.
// In order to avoid overflow and underflow scenarios,
// bli_zinvscals is used.
for( dim_t i = 0; i < size; i++)
{
//broadcast diagonal elements of A11
ymm1 = _mm256_broadcast_pd((__m128d const *)a11);
ymm2 = _mm256_broadcast_pd((__m128d const *)a11+ cs_a +1);
/*Pick one element from each column and create a 3-element vector
and store it*/
ymm1 = _mm256_permute2f128_pd(ymm1, ymm2, 0x20);
ymm2 = _mm256_broadcast_pd((__m128d const *)a11+ cs_a*2 + 2);
if(is_four)
{
ymm3 = _mm256_broadcast_pd((__m128d const *)a11+ cs_a*2 + 2);
ymm2 = _mm256_broadcast_pd((__m128d const *)a11+ cs_a*3 + 3);
ymm2 = _mm256_permute2f128_pd(ymm3, ymm2, 0x20);
}
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
/*Taking denominator multiplication of real & imaginary components*/
ymm4 = _mm256_mul_pd(ymm1, ymm1);
ymm5 = _mm256_mul_pd(ymm2,ymm2);
/*Swapping real & imaginary component position for addition with
* respective components*/
ymm6 = _mm256_permute4x64_pd(ymm4, 0xb1);
ymm4 = _mm256_add_pd(ymm4, ymm6);
ymm8 = _mm256_permute4x64_pd(ymm5, 0xb1);
ymm5 = _mm256_add_pd(ymm5, ymm8);
/*Negating imaginary component of numerator*/
ymm1 = _mm256_mul_pd(ymm1, ymm7);
ymm2 = _mm256_mul_pd(ymm2, ymm7);
/*Dividing numerator by denominator*/
ymm1 = _mm256_div_pd(ymm1, ymm4);
ymm2 = _mm256_div_pd(ymm2, ymm5);
#endif
dim_t d = ((i*cs_a) + i);
dcomplex ones = {1.0, 0.0};
bli_zinvscals(a11[d], ones)
d11_pack[i].real = ones.real;
d11_pack[i].imag = ones.imag;
}
_mm256_store_pd((double *)d11_pack, ymm1);
if(is_four)
#else //BLIS_ENABLE_TRSM_PREINVERSION
// If Preinversion is disabled, pack the diagonal
// elements from A into the diagonal buffer.
for( dim_t i = 0; i < size; i++)
{
_mm256_store_pd((double *)(d11_pack + 2), ymm2);
dim_t d = ((i*cs_a) + i);
bli_zcopys(a11[d],d11_pack[i]);
}
else
{
_mm_store_pd((double *)(d11_pack + 2),
_mm256_extractf128_pd(ymm2,0));
}
#endif //BLIS_ENABLE_TRSM_PREINVERSION
}
/*implements TRSM for the case XA = alpha * B
*A is lower triangular, non-unit diagonal/unit diagonal, transpose
*dimensions: X:mxn A:nxn B: mxn
@@ -14948,9 +14904,12 @@ BLIS_INLINE void strsm_small_pack_diag_element
__m256 ymm0, ymm1, ymm2, ymm3;
__m256 ymm4, ymm5, ymm6, ymm7;
__m256 ymm8, ymm9, ymm10,ymm11;
__m256 ymm14, ymm15, ymm12,ymm13;
__m256 ymm14, ymm15, ymm12;
float ones = 1.0;
ymm13 = ymm14 = ymm15 = _mm256_broadcast_ss((float const *)&ones);
ymm14 = ymm15 = _mm256_broadcast_ss((float const *)&ones);
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
__m256 ymm13 = _mm256_broadcast_ss((float const *)&ones);
#endif
if(side=='L'||side=='l')
{
if(!is_unitdiag)