Added ability to handle non unit incx in GEMV transpose kernel. (#145)

- GEMV transpose kernels lack ability to compute directly on non-unit stride inputs. - This limitation is stopping libflame to use blis kernel directly instead of going through framework. - Added ability to handle non-unit incx in the kernel by packing x into a temporary buffer. AMD-Internal: [CPUPL-6903]
2026-04-20 07:38:53 +00:00 · 2025-08-20 23:33:53 +05:30
parent 67616752c3
commit 805f36965d
2 changed files with 79 additions and 4 deletions
--- a/gtestsuite/testsuite/ukr/gemv/dgemv/dgemv_ukr.cpp
+++ b/gtestsuite/testsuite/ukr/gemv/dgemv/dgemv_ukr.cpp
@@ -456,8 +456,8 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Range( gtint_t(1), gtint_t(16), gtint_t(1)),         // n
        ::testing::Values( double(0.0), double(1.0), double(2.0)),      // alpha
        ::testing::Values( double(0.0), double(1.0), double(2.0)),      // beta
-        ::testing::Values(gtint_t(1)),                                  // stride size for x (non-unit incx is handled by frame)
-        ::testing::Values(gtint_t(1), gtint_t(3)),                                  // stride size for y
+        ::testing::Values(gtint_t(1), gtint_t(8)),                      // stride size for x
+        ::testing::Values(gtint_t(1), gtint_t(3)),                      // stride size for y
        ::testing::Values(gtint_t(0), gtint_t(7)),                      // increment to the leading dim of a
        ::testing::Values(false, true)                                  // is_memory_test
    ),
--- a/kernels/zen4/2/bli_gemv_t_zen4_int.c
+++ b/kernels/zen4/2/bli_gemv_t_zen4_int.c
@@ -110,7 +110,7 @@ static dgemv_ker_ft_conja n_ker_fp[8] =
       dim_t   n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
-       double* x, inc_t incx,
+       double* x0, inc_t incx0,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
@@ -118,8 +118,75 @@ static dgemv_ker_ft_conja n_ker_fp[8] =
 {
    double* restrict a_buf = a;
    double* restrict y_buf = y;
-    double* restrict x_buf = x;
+    double*              x = x0;  // optionally packed buffer

+    inc_t             incx = incx0;
+    
+    bool    is_x_temp_buf_created = false;
+    mem_t   mem_bufX;
+    rntm_t  rntm_l;
+    
+    if (incx != 1)
+    {
+        /*
+              Initialize mem pool buffer to NULL and size to 0
+              "buf" and "size" fields are assigned once memory
+              is allocated from the pool in bli_pba_acquire_m().
+              This will ensure bli_mem_is_alloc() will be passed on
+              an allocated memory if created or a NULL .
+        */
+
+        mem_bufX.pblk.buf = NULL;
+        mem_bufX.pblk.block_size = 0;
+        mem_bufX.buf_type = 0;
+        mem_bufX.size = 0;
+        mem_bufX.pool = NULL;
+
+        // In order to get the buffer from pool via rntm_l access to memory broker
+        // is needed.Following are initializations for rntm_l.
+        bli_rntm_init_from_global(&rntm_l);
+        bli_rntm_set_num_threads_only(1, &rntm_l);
+        bli_pba_rntm_set_pba(&rntm_l);
+
+        //calculate the size required for m0 double elements in vector X.
+        size_t buffer_size = m * sizeof(double);
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+        printf("bli_dgemv_t_zen4_int(): get mem pool block for vector x\n");
+#endif
+
+        // acquire a Buffer(m0*size(double)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_pba_acquire_m(&rntm_l,
+                            buffer_size,
+                            BLIS_BUFFER_FOR_B_PANEL,
+                            &mem_bufX);
+
+        // Continue packing X if buffer memory is allocated.
+        if ( bli_mem_is_alloc(&mem_bufX) )
+        {
+            x = bli_mem_buffer(&mem_bufX);
+
+            // stride of vector x_buf =1
+            incx = 1;
+
+            // Invoke the COPYV function using the function pointer.
+            bli_dcopyv_zen4_asm
+            (
+                BLIS_NO_CONJUGATE,
+                m,
+                x0, incx0,
+                x, incx,
+                cntx
+            );
+
+            // Set x is packed as the memory allocation was successful
+            // and contents have been copied to a temp buffer.
+            is_x_temp_buf_created = TRUE;
+        }
+    }
+
+    double*          x_buf = x;  // temporary buffer
    // i denotes the number of rows completed
    // The variable 'j' is used to denote the number of rows that have been completed.
    dim_t i = 0, j = 0;
@@ -1034,6 +1101,14 @@ static dgemv_ker_ft_conja n_ker_fp[8] =
            cntx
        );
    }
+    if (is_x_temp_buf_created)
+    {
+ #ifdef BLIS_ENABLE_MEM_TRACING
+        printf("bli_dgemv_t_zen4_int(): releasing mem pool block for vector x\n");
+ #endif
+        // Return the buffer to pool
+        bli_pba_release(&rntm_l, &mem_bufX);
+    }
    return;
 }