Add packing support for M edge cases in DGEMM 24xk pack kernel

Previously, the DGEMM implementation used `dscalv` for cases
where the M dimension of matrix A is not a multiple of 24,
resulting in a ~40% performance drop.

This commit introduces specialized edge-case handling in the pack
kernel to optimize performance for these cases.

The new packing support significantly improves the performance.

- Removed reliance on `dscalv` for edge cases, addressing the
  performance bottleneck.

AMD-Internal: [CPUPL-6677]

Change-Id: I150d13eb536d84f8eb439d7f4a77a04a0d0e6d60
This commit is contained in:
harsdave
2025-04-03 00:16:52 +05:30
parent 8557e2f7b9
commit cd83fc38b5
2 changed files with 10278 additions and 789 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -67,51 +67,52 @@
* @note
* N = 0 case never occurs.
*/
#define CALL_KERNEL\
if(N >= 8)\
{\
avx512kern_fp[8]( conja,\
conjb,\
M,\
N,\
K,\
(double *)alpha,\
(a_local + (0 * rs_a) + (0 * cs_a)), /*A matrix offset*/\
rs_a,\
cs_a,\
(b_local + (0 * cs_b) + (0 * rs_b)), /*B matrix offset*/\
rs_b,\
cs_b,\
(double *)beta,\
(c_local + 0 * cs_c + 0 * rs_c), /*C matrix offset*/\
rs_c,\
cs_c,\
&aux,\
NULL\
);\
}\
else\
{\
avx512kern_fp[N]( conja,\
conjb,\
M,\
N,\
K,\
(double *)alpha,\
(a_local + (0 * rs_a) + (0 * cs_a)), /*A matrix offset*/\
rs_a,\
cs_a,\
(b_local + (0 * cs_b) + (0 * rs_b)), /*B matrix offset*/\
rs_b,\
cs_b,\
(double *)beta,\
(c_local + 0 * cs_c + 0 * rs_c), /*C matrix offset*/\
rs_c,\
cs_c,\
&aux,\
NULL\
);\
#define CALL_KERNEL \
if(N >= 8) \
{ \
avx512kern_fp[8]( conja, \
conjb, \
M, \
N, \
K, \
(double *)alpha, \
(a_local + (0 * rs_a) + (0 * cs_a)), /*A matrix offset*/ \
rs_a, \
cs_a, \
(b_local + (0 * cs_b) + (0 * rs_b)), /*B matrix offset*/ \
rs_b, \
cs_b, \
(double *)beta, \
(c_local + 0 * cs_c + 0 * rs_c), /*C matrix offset*/ \
rs_c, \
cs_c, \
&aux, \
NULL \
); \
} \
else \
{ \
avx512kern_fp[N]( conja, \
conjb, \
M, \
N, \
K, \
(double *)alpha, \
(a_local + (0 * rs_a) + (0 * cs_a)), /*A matrix offset*/ \
rs_a, \
cs_a, \
(b_local + (0 * cs_b) + (0 * rs_b)), /*B matrix offset*/ \
rs_b, \
cs_b, \
(double *)beta, \
(c_local + 0 * cs_c + 0 * rs_c), /*C matrix offset*/ \
rs_c, \
cs_c, \
&aux, \
NULL \
); \
}
/**
* @brief bli_dgemmsup_placeholder
*
@@ -333,9 +334,33 @@ err_t bli_dgemm_tiny_24x8
ps_a_use = (24 * k);
bli_auxinfo_set_ps_a( ps_a_use, &aux );
/**
* CALL_KERNEL makes actual call to micro kernel,
* which is bli_dgemmsup_rv_zen4_asm_24x8m_new and the family of
* it based on value of N dimension.
* Arguments passed to it are as follows.
* conja whether A matrix is conjugate
conjb whether B matrix is conjugate
M M dimension
N N dimension
K K dimension
(double *)alpha Pointer to alpha value
(a_local + (0 * rs_a) + (0 * cs_a)), A matrix offset
rs_a row stride of A matrix
cs_a column stride of A matrix
(b_local + (0 * cs_b) + (0 * rs_b)), B matrix offset
rs_b row stride of B matrix
cs_b column stride of B matrix
(double *)beta pointer to Beta value
(c_local + 0 * cs_c + 0 * rs_c), C matrix offset
rs_c row stride of C matrix
cs_c column stride of C matrix
&aux Aux structure which carries additional info
NULL we do not use context in tiny path.
*/
CALL_KERNEL
//Return the allocated memory back to small block allocator
bli_pba_release(&rntm, &local_mem_buf_A_s);
}
else