Bug fix in LPGEMV m=1 AVX2 kernel for post-ops

Details: - Fixed the loading of matadd and matmul pointers in the GEMV lt16 kernel for the AVX2 M=1 case. - Hard-set the row and column strides of B to 1 (inside GEMV) when B has already been reordered, since the reordered vector is contiguous. AMD-Internal: CPUPL-7197, CPUPL-7221 Co-authored-by: Balasubramanian, Vignesh <Vignesh.Balasubramanian@amd.com>
2026-04-19 23:28:52 +00:00 · 2025-08-22 18:15:05 +05:30
parent d29f3f0b5e
commit 5044b69d3d
3 changed files with 22 additions and 11 deletions
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
@@ -202,8 +202,14 @@ LPGEMV(float, float, float, f32f32f32of32)
    }
 #endif

+    // The vector is already contiguous if reordered.
+    if (mtag_b == REORDERED)
+    {
+      rs_b_use = 1;
+      cs_b_use = 1;
+    }
    // Pack B matrix if rs_b > 1
-    if( rs_b != 1 )
+    else if (rs_b != 1)
    {
      mem_b_size_req = sizeof( float ) * k;

--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32_tiny.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32_tiny.c
@@ -179,9 +179,14 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
 #ifdef BLIS_KERNELS_ZEN4
      }
 #endif
-        // Pack B matrix if rs_b > 1, ignoring the mtag_b here.
+        // The vector is already contiguous if reordered.
+        if (mtag_b == REORDERED)
+        {
+          rs_b_use = 1;
+          cs_b_use = 1;
+        }
        // For tiny sizes, it is better to pack B if it affects output accuracy.
-        if( ( rs_b != 1 ) )
+        else if( ( rs_b != 1 ) )
        {
            siz_t mem_b_size_req = sizeof( float ) * k;
            pack_b_buffer_f32f32f32of32 =
--- a/kernels/zen/lpgemm/f32f32f32/lpgemv_m_kernel_f32_avx2.c
+++ b/kernels/zen/lpgemm/f32f32f32/lpgemv_m_kernel_f32_avx2.c
@@ -502,10 +502,10 @@ POST_OPS_MATRIX_ADD_1x16F:
        {
          scl_fctr1 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 0 * 8 ) );
          scl_fctr2 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 1 * 8 ) );
        }
        else
        {
@@ -576,10 +576,10 @@ POST_OPS_MATRIX_MUL_1x16F:
        {
          scl_fctr1 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 0 * 8 ) );
          scl_fctr2 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 1 * 8 ) );
        }
        else
        {
@@ -1109,10 +1109,10 @@ POST_OPS_MATRIX_ADD_1x16F:
        {
          scl_fctr1 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 0 * 8 ) );
          scl_fctr2 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 1 * 8 ) );
        }
        else
        {
@@ -1183,10 +1183,10 @@ POST_OPS_MATRIX_MUL_1x16F:
        {
          scl_fctr1 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 0 * 8 ) );
          scl_fctr2 =
            _mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
-                post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+                post_ops_attr.post_op_c_j + ( 1 * 8 ) );
        }
        else
        {