Added DTRSM Small Path AVX512-based LUNN/LLTN Variant Kernels

- 8x8 kernels are used for DTRSM SMALL.
- Matrix A (a10) is packed for GEMM operations.
- The packed matrix A is reused in all the column blocks along the N dimension.
- Diagonal elements of matrix A are packed (a11) for TRSM operations.
- Implemented fringe cases with the block sizes below:
  8x8, 8x4, 8x3, 8x2, 8x1
  4x8, 4x4, 4x3, 4x2, 4x1
  3x8, 3x4, 3x3, 3x2, 3x1
  2x8, 2x4, 2x3, 2x2, 2x1
  1x8, 1x4, 1x3, 1x2, 1x1

AMD-Internal: [CPUPL-2745]
Change-Id: I5bb57501f6d3783eb654e375d63901467dd14734
2026-05-13 18:52:14 +00:00 · 2023-04-12 04:38:15 +00:00
parent b515643c54
commit 6ad387c2aa
2 changed files with 1794 additions and 12 deletions
--- a/frame/compat/bla_trsm_amd.c
+++ b/frame/compat/bla_trsm_amd.c
@@ -1059,9 +1059,9 @@ void dtrsm_blis_impl
            {
                case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-                    // this is a temporary fix, will be removed when all variants are added
-                    if( ((blis_side == BLIS_RIGHT) && ((n0 > 300) && (m0 > 50))) || 
-                        ((blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_NO_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_TRANSPOSE) ) ) && ((n0 != 30 && n0 !=60 ) && (m0 > 50))) )
+                    /* For sizes where m and n are < 50, the AVX2 kernels perform better,
+                     except for sizes where n is a multiple of 8. */
+                    if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
                    {
                        ker_ft = bli_trsm_small_AVX512;
                    }
@@ -1088,14 +1088,7 @@ void dtrsm_blis_impl
            {
                case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-                    if ( (blis_side == BLIS_LEFT && ( (blis_uploa == BLIS_LOWER && blis_transa == BLIS_TRANSPOSE) || (blis_uploa == BLIS_UPPER && blis_transa == BLIS_NO_TRANSPOSE) ) ))
-                    {
-                        ker_ft = bli_trsm_small_mt;
-                    }
-                    else
-                    {
-                        ker_ft = bli_trsm_small_mt_AVX512;
-                    }
+                    ker_ft = bli_trsm_small_mt_AVX512;
                    break;
 #endif// BLIS_KERNELS_ZEN4
                case BLIS_ARCH_ZEN:
--- a/kernels/zen4/3/bli_trsm_small_AVX512.c
+++ b/kernels/zen4/3/bli_trsm_small_AVX512.c