Removed unnecessary pack checks in FP32 GEMV (#54)

Details:
- In FP32 GEMM, when threading is disabled, rntm_pack_a and rntm_pack_b
  were set to true by default. This led to a performance regression for
  smaller sizes. Modified the FP32 interface API to not overwrite the packA
  and packB variables in the rntm structure.
- In FP32 GEMV, removed the decision-making code based on mtag_A/B
  and should_pack_A/B for packing. Matrices will be packed only
  if the storage format of the matrices doesn't match the storage
  format required by the kernel.
- Changed the control flow for checking the value of mtag — i.e., whether
  a matrix is "reordered", "to-be-packed", or "unpacked" — to check
  for "reordered" first, followed by "pack". This ensures that
  packing doesn't happen when the matrix is already reordered, even
  if the user forces packing by setting "BLIS_PACK_A/B".
- Modified the Python script to generate test cases based on block sizes.

AMD-Internal: SWLCSG-3527
This commit is contained in:
Vankadari, Meghana
2025-06-16 12:34:11 +05:30
committed by GitHub
parent 1847a1e8c6
commit 8649cdc14b
4 changed files with 165 additions and 73 deletions

View File

@@ -285,10 +285,6 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
);
}
#else
// Setting pack A and B by default for non open mp case.
bli_rntm_set_pack_a( 1, &rntm_g );
bli_rntm_set_pack_b( 1, &rntm_g );
// Swapping inputs to induce row major computation for column major inputs.
if ( is_column_major == TRUE )
{

View File

@@ -124,6 +124,10 @@ typedef void (*lpgemv_a_pack_ft)
LPGEMV(float, float, float, f32f32f32of32)
{
/* Ignoring mtag_a/b and should_pack_A/B for now.
Matrices are packed only when the storage format is not supported by the kernel.
*/
const float* a_use = (float*)a;
inc_t rs_a_use = rs_a;
inc_t cs_a_use = cs_a;
@@ -154,12 +158,6 @@ LPGEMV(float, float, float, f32f32f32of32)
if (c_downscale < F32) post_ops_attr.buf_downscale = c;
else post_ops_attr.buf_downscale = NULL;
// Should_pack_A/B is set either by the user through env variable
// or by the smart threading logic based on work distribution.
// Storage format of the matrices doesn't affect should_pack_A/B.
bool should_pack_B = bli_rntm_pack_b( rntm );
bool should_pack_A = bli_rntm_pack_a( rntm );
// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
thrinfo_t thread_jc;
thrinfo_t thread_ic;
@@ -195,7 +193,7 @@ LPGEMV(float, float, float, f32f32f32of32)
packa_fp = packa_mr8_f32f32f32of32_col_major;
#endif
// Pack B matrix if rs_b > 1
if( (should_pack_B == TRUE) || ( rs_b != 1 ) )
if( rs_b != 1 )
{
mem_b_size_req = sizeof( float ) * k;
@@ -233,7 +231,7 @@ LPGEMV(float, float, float, f32f32f32of32)
post_ops_attr.post_op_c_i = ic;
// To-Do: pack A case needs to be handled for AVX2 case.
if( (should_pack_A == TRUE) || ( cs_a != 1 ) )
if( cs_a != 1 )
{
mem_a_size_req = sizeof(float) * mc0 * k;
lpgemm_alloc_mem_panel
@@ -264,11 +262,11 @@ LPGEMV(float, float, float, f32f32f32of32)
&post_ops_attr
);
}
if ( ( (should_pack_A == TRUE) || ( cs_a != 1 ) ) && ( bli_mem_is_alloc( &mem_a ) ) )
if ( ( cs_a != 1 ) && ( bli_mem_is_alloc( &mem_a ) ) )
{
bli_pba_release( rntm, &mem_a );
}
if ( ( (should_pack_B == TRUE) || ( rs_b != 1 ) ) && ( bli_mem_is_alloc( &mem_b ) ) )
if ( ( rs_b != 1 ) && ( bli_mem_is_alloc( &mem_b ) ) )
{
bli_pba_release( rntm, &mem_b );
}
@@ -300,7 +298,7 @@ LPGEMV(float, float, float, f32f32f32of32)
thread_jc.work_id = thread->tid;
bli_thread_range_sub(&thread_jc, n, NR, FALSE, &jc_start, &jc_end);
if ( (should_pack_A == TRUE) || ( cs_a != 1 ) )
if ( cs_a != 1 )
{
mem_a_size_req = sizeof( float ) * k;
@@ -346,7 +344,7 @@ LPGEMV(float, float, float, f32f32f32of32)
rs_b_use = NR;
cs_b_use = 1;
}
else if ( (should_pack_B == TRUE) || ( mtag_b == PACK ) )
else if ( mtag_b == PACK )
{
// nc0 needs to be a multiple of 16 since this gives maximum
// vectorization. Packing B always results in buffers with width
@@ -412,12 +410,12 @@ LPGEMV(float, float, float, f32f32f32of32)
} // jc loop
// Release pack buffers.
if ( ( (should_pack_B == TRUE) || ( mtag_b == PACK ) ) && ( bli_mem_is_alloc( &mem_b ) ) )
if ( ( mtag_b == PACK ) && ( bli_mem_is_alloc( &mem_b ) ) )
{
bli_pba_release( rntm, &mem_b );
}
if ( ( (should_pack_A == TRUE) || ( cs_a != 1 ) ) && ( bli_mem_is_alloc( &mem_a ) ) )
if ( ( cs_a != 1 ) && ( bli_mem_is_alloc( &mem_a ) ) )
{
bli_pba_release( rntm, &mem_a );
}
@@ -569,7 +567,20 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
post_ops_attr.is_last_k = is_last_k;
if ( ( mtag_b == PACK ) || ( should_pack_B == TRUE ) )
if ( mtag_b == REORDERED )
{
// In multi-threaded scenarios, an extra offset into a given
// packed B panel is required, since the jc loop split can
// result in per thread start offset inside the panel, instead
// of panel boundaries.
b_use = b + ( jc_cur_loop * k ) +
( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 );
rs_b_use = NR;
cs_b_use = 1;
ps_b_use = kc0;
}
else if ( ( mtag_b == PACK ) || ( should_pack_B == TRUE ) )
{
// Pack B chunks are based on jc work id.
dim_t jc_work_id = bli_thread_work_id( &thread_jc );
@@ -649,19 +660,6 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
);
b_use = pack_b_buffer_f32f32f32of32;
}
else if ( mtag_b == REORDERED )
{
// In multi-threaded scenarios, an extra offset into a given
// packed B panel is required, since the jc loop split can
// result in per thread start offset inside the panel, instead
// of panel boundaries.
b_use = b + ( jc_cur_loop * k ) +
( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 );
rs_b_use = NR;
cs_b_use = 1;
ps_b_use = kc0;
}
else
{
b_use = b + ( pc * rs_b ) + ( jc * cs_b );

View File

@@ -266,7 +266,13 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
cs_a_use = 1;
}
if ( ( mtag_b == PACK ) )
if ( mtag_b == REORDERED )
{
b_use = ( float* )b;
rs_b_use = NR;
cs_b_use = 1;
}
else if ( ( mtag_b == PACK ) )
{
dim_t nc0_updated = make_multiple_of_n(n, NR);
siz_t mem_b_size_req = sizeof(float) * nc0_updated * k;
@@ -288,12 +294,6 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
b_use = pack_b_buffer_f32f32f32of32;
}
else if ( mtag_b == REORDERED )
{
b_use = ( float* )b;
rs_b_use = NR;
cs_b_use = 1;
}
else
{
b_use = ( float* )b;
@@ -388,7 +388,14 @@ LPGEMM_TINY(float,float,float,f32f32f32of32)
// Even if the mtag_b is set to PACK, for tiny sizes its better to
// pack only if it affects output accuracy (like column major B),
// else ignore it.
if ( ( mtag_b == PACK ) )
if ( mtag_b == REORDERED )
{
b_use = b;
rs_b_use = NR;
cs_b_use = 1;
ps_b_use = k;
}
else if ( ( mtag_b == PACK ) )
{
dim_t nc0_updated = make_multiple_of_n( n, NR );
mem_b_size_req = sizeof( float ) * nc0_updated * k;
@@ -410,13 +417,6 @@ LPGEMM_TINY(float,float,float,f32f32f32of32)
b_use = pack_b_buffer_f32f32f32of32;
}
else if ( mtag_b == REORDERED )
{
b_use = b;
rs_b_use = NR;
cs_b_use = 1;
ps_b_use = k;
}
else
{
b_use = b;