DGEMV Optimizations for NO_TRANSPOSE Cases

- AVX512 specific DGEMV native kernels are added for Zen4/5 architectures to handle the NO_TRANSPOSE cases and are independent of the AXPYF fused kernels.
- The following set of kernels, biased towards the n-dimension, perform beta scaling of the y vector within the kernel itself and handle cases where n is less than 5:
  - bli_dgemv_n_zen_int_32x8n_avx512( ... )
  - bli_dgemv_n_zen_int_32x4n_avx512( ... )
  - bli_dgemv_n_zen_int_32x2n_avx512( ... )
  - bli_dgemv_n_zen_int_32x1n_avx512( ... )
- The bli_dgemv_n_zen_int_16mx8_avx512( ... ) kernel is biased towards the m-dimension, and for this kernel beta scaling is handled beforehand within the framework.
- Added unit-tests for the new kernels.
- The AVX2 path for Zen/2/3 architectures still follows the old approach of using the fused kernel, namely AXPYF, to perform the GEMV operation.

AMD-Internal: [CPUPL-5560]
Change-Id: I22bc2a865cd28b9cdcb383e17d1ff38bdd28de79
2026-04-19 23:28:52 +00:00 · 2024-10-04 14:44:21 +05:30
parent 615789e196
commit 25e59fcbb9
10 changed files with 4633 additions and 103 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,6 @@ bin/*
 *.exe

 .vscode
+
+# Gtestsuite build files
+gtestsuite/build/*
--- a/frame/2/bli_l2.h
+++ b/frame/2/bli_l2.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -37,6 +38,9 @@
 // Define function types.
 #include "bli_l2_ft_unb.h"

+// Define kernel function types for level-2 kernels.
+#include "bli_l2_ft_ker.h"
+
 // Prototype object APIs (expert and non-expert).
 #include "bli_oapi_ex.h"
 #include "bli_l2_oapi.h"
--- a/frame/2/bli_l2_ft_ker.h
+++ b/frame/2/bli_l2_ft_ker.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Define kernel function types for level-2 kernels.
+//
+
+// gemv kernel function-pointer type; GENTDEF( double, d, gemv, _ft ) presumably names it dgemv_ker_ft.
+#undef  GENTDEF
+#define GENTDEF( ctype, ch, opname, tsuf ) \
+\
+typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
+     ( \
+       conj_t  conja, \
+       conj_t  conjx, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, \
+       ctype*  x, inc_t incx, \
+       ctype*  beta, \
+       ctype*  y, inc_t incy, \
+       cntx_t* restrict cntx  \
+     );
+
+// INSERT_GENTDEF( gemv ) is left disabled on purpose:
+// currently the function type is generated for the double datatype only.
+GENTDEF( double, d, gemv, _ft )
--- a/frame/2/gemv/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c
@@ -257,35 +257,15 @@ void bli_dgemv_unf_var2
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
-       double*  alpha,
-       double*  a, inc_t rs_a, inc_t cs_a,
-       double*  x, inc_t incx,
-       double*  beta,
-       double*  y, inc_t incy,
+       double* alpha,
+       double* a, inc_t rs_a, inc_t cs_a,
+       double* x, inc_t incx,
+       double* beta,
+       double* y, inc_t incy,
       cntx_t* cntx
     )
 {
-
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
-    double*  A1;
-    double*  x1;
-    dim_t   i;
-    dim_t   f, b_fuse;
-    dim_t   n_elem, n_iter;
-    inc_t   rs_at, cs_at;
-    conj_t  conja;
-
-    // Memory pool declarations for packing vector Y.
-    mem_t   mem_bufY;
-    rntm_t  rntm;
-    double* y_temp = y;
-    inc_t   temp_incy = incy;
-
-    bli_set_dims_incs_with_trans( transa,
-                                  m, n, rs_a, cs_a,
-                                  &n_elem, &n_iter, &rs_at, &cs_at );
-
-    conja = bli_extract_conj( transa );

    /*
      Fatbinary config amdzen when run on non-AMD X86 will query for
@@ -294,6 +274,9 @@ void bli_dgemv_unf_var2
    */
    arch_t id = bli_arch_query_id();

+    // b_fuse stores the fusing factor for AXPYF kernel.
+    dim_t b_fuse;
+
    /*
      Function pointer declaration for the functions
      that will be used by this API
@@ -303,73 +286,91 @@ void bli_dgemv_unf_var2
    dscalv_ker_ft   scalv_kr_ptr; // DSCALV
    dcopyv_ker_ft   copyv_kr_ptr; // DCOPYV

+    switch (id)
+    {
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+            bli_dgemv_n_avx512(
+                transa,
+                conjx,
+                m,
+                n,
+                alpha,
+                a, rs_a, cs_a,
+                x, incx,
+                beta,
+                y, incy,
+                cntx
+            );
+            return;
+#endif
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
+            bli_dgemv_n_avx2(
+                transa,
+                conjx,
+                m,
+                n,
+                alpha,
+                a, rs_a, cs_a,
+                x, incx,
+                beta,
+                y, incy,
+                cntx
+            );
+            return;
+
+        default:
+            // For non-Zen architectures, query the context if it is NULL
+            if (cntx == NULL) cntx = bli_gks_query_cntx();
+
+            /*
+            Query the context for the kernel function pointers for
+            AXPYF, SCALV, COPYV and corresponding fusing
+            factor of AXPYF kernel
+            */
+            axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
+            b_fuse       = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
+
+            scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
+
+            copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
+    }
+
+    double*  A1;
+    double*  x1;
+    dim_t   i;
+    dim_t   f;
+    dim_t   n_elem, n_iter;
+    inc_t   rs_at, cs_at;
+    conj_t  conja;
+
+    // Memory pool declarations for packing vector Y.
+    mem_t   mem_bufY;
+    rntm_t  rntm;
+    double* y_temp = y;
+    inc_t   temp_incy = incy;
+
    /*
      Boolean to check if the y has been packed
      and memory needs to be freed in the end
    */
    bool is_y_temp_buf_created = FALSE;

-    switch (id)
-    {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
-#if defined(BLIS_KERNELS_ZEN4)
-        /*
-          Assign the AVX512 based kernel function pointers for
-          AXPYF, SCALV, COPYV and corresponding fusing
-          factor of DAXPYF kernel
-        */
+    bli_set_dims_incs_with_trans( transa,
+                                  m, n, rs_a, cs_a,
+                                  &n_elem, &n_iter, &rs_at, &cs_at );

-        axpyf_kr_ptr = bli_daxpyf_zen_int_avx512;
-        b_fuse = 32;
-
-        scalv_kr_ptr = bli_dscalv_zen_int_avx512;
-
-        copyv_kr_ptr = bli_dcopyv_zen_int;
-
-        break;
-#endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
-
-        /*
-          Assign the AVX2 based kernel function pointers for
-          AXPYF, SCALV, COPYV and corresponding fusing
-          factor of DAXPYF kernel
-        */
-
-        axpyf_kr_ptr = bli_daxpyf_zen_int_8;
-        b_fuse = 8;
-
-        scalv_kr_ptr = bli_dscalv_zen_int10;
-
-        copyv_kr_ptr = bli_dcopyv_zen_int;
-
-        break;
-      default:
-        // For non-Zen architectures, query the context if it is NULL
-        if(cntx == NULL) cntx = bli_gks_query_cntx();
-
-        /*
-          Query the context for the kernel function pointers for
-          AXPYF, SCALV, COPYV and corresponding fusing
-          factor of AXPYF kernel
-        */
-        axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
-        b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
-
-        scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
-
-        copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
-    }
+    conja = bli_extract_conj( transa );

    /*
      If alpha is equal to zero, y is only scaled by beta and returned.
      In this case, packing and unpacking y will be costly and it is
      avoided.
    */
-    if ( (incy > 1) && (!bli_deq0( *alpha )))
+    if ( (incy != 1) && (!bli_deq0( *alpha )))
    {
        /*
          Initialize mem pool buffer to NULL and size to 0
@@ -398,13 +399,16 @@ void bli_dgemv_unf_var2

        /*acquire a Buffer(n_elem*size(double)) from the memory broker
        and save the associated mem_t entry to mem_bufY.*/
-        bli_pba_acquire_m(&rntm,
-                                buffer_size,
-                                BLIS_BUFFER_FOR_B_PANEL,
-                                &mem_bufY);
+        bli_pba_acquire_m
+        (
+          &rntm,
+          buffer_size,
+          BLIS_BUFFER_FOR_B_PANEL,
+          &mem_bufY
+        );

        /*Continue packing Y if buffer memory is allocated*/
-        if ((bli_mem_is_alloc( &mem_bufY )))
+        if ( bli_mem_is_alloc( &mem_bufY ) )
        {
            y_temp = bli_mem_buffer(&mem_bufY);

@@ -455,23 +459,23 @@ void bli_dgemv_unf_var2

    for (i = 0; i < n_iter; i += f)
    {
-      f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
+        f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);

-      A1 = a + (i * cs_at);
-      x1 = x + (i * incx);
+        A1 = a + (i * cs_at);
+        x1 = x + (i * incx);

-      axpyf_kr_ptr
-      (
-        conja,
-        conjx,
-        n_elem,
-        f,
-        alpha,
-        A1, rs_at, cs_at,
-        x1, incx,
-        y_temp, temp_incy,
-        cntx
-      );
+        axpyf_kr_ptr
+        (
+          conja,
+          conjx,
+          n_elem,
+          f,
+          alpha,
+          A1, rs_at, cs_at,
+          x1, incx,
+          y_temp, temp_incy,
+          cntx
+        );
    }

    if (is_y_temp_buf_created)
@@ -1108,6 +1112,3 @@ void bli_cgemv_unf_var2

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
 }
-
-
-
--- a/frame/compat/bla_gemv_amd.c
+++ b/frame/compat/bla_gemv_amd.c
@@ -322,7 +322,7 @@ void dgemv_blis_impl
     * If the matrix dimensions are within 8x8 then calculate the result
     * using DGEMV Reference kernel.
     */
-    if ( m0 < 8 && n0 < 8 )
+    if ( (m0 < 8 && n0 < 8) )
    {
        bli_dgemv_zen_ref
        (
--- a/gtestsuite/testsuite/level2/gemv/dgemv/dgemv_generic.cpp
+++ b/gtestsuite/testsuite/level2/gemv/dgemv/dgemv_generic.cpp
@@ -104,6 +104,54 @@ TEST_P( dgemvGeneric, API )
    test_gemv<T>( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, is_memory_test );
 }

+// Unit-tests aimed at the NO_TRANSPOSE m-biased kernels: m = 47, n from 1 to 44, unit-stride y; transa also sweeps 'c' and 't'.
+INSTANTIATE_TEST_SUITE_P(
+        dgemv_n_m,
+        dgemvGeneric,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS_LIKE
+            ,'r'
+#endif
+            ),                                                              // storage format
+            ::testing::Values('n', 'c', 't'),                               // transa
+            ::testing::Values('n'),                                         // conjx
+            ::testing::Values( 47 ),                                        // m
+            ::testing::Values( 1, 2, 3, 4, 5, 6, 7, 8, 16, 44 ),            // n
+            ::testing::Values( 0, 1, 2 ),                                   // alpha
+            ::testing::Values( 0, 1, 2 ),                                   // beta
+            ::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ),        // stride size for x
+            ::testing::Values(gtint_t(1) ),                                 // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(7) ),                     // increment to the leading dim of a
+            ::testing::Values(false, true)                                  // is_memory_test
+        ),
+        ::gemvGenericPrint<T>()
+    );
+
+// Unit-tests aimed at the NO_TRANSPOSE n-biased kernels: m = 95, n in {1, 2, 3, 4, 15}, unit-stride y; transa also sweeps 'c' and 't'.
+INSTANTIATE_TEST_SUITE_P(
+        dgemv_n_n,
+        dgemvGeneric,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS_LIKE
+            ,'r'
+#endif
+            ),                                                              // storage format
+            ::testing::Values('n', 'c', 't'),                               // transa
+            ::testing::Values('n'),                                         // conjx
+            ::testing::Values( 95 ),                                        // m
+            ::testing::Values( 1, 2, 3, 4, 15 ),                            // n
+            ::testing::Values( 0, 1, 2 ),                                   // alpha
+            ::testing::Values( 0, 1, 2 ),                                   // beta
+            ::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ),        // stride size for x
+            ::testing::Values(gtint_t(1) ),                                 // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(7) ),                     // increment to the leading dim of a
+            ::testing::Values(false, true)                                  // is_memory_test
+        ),
+        ::gemvGenericPrint<T>()
+    );
+
 // Black box testing.
 INSTANTIATE_TEST_SUITE_P(
        BlackboxSmall,
--- a/kernels/zen/2/bli_gemv_avx2.c
+++ b/kernels/zen/2/bli_gemv_avx2.c
@@ -0,0 +1,228 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/**
+ * bli_dgemv_n_avx2(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
+ * architectures and is based on the previous approach of using the fused
+ * kernels, namely AXPYF, to perform the GEMV operation.
+ * y is scaled by beta first (into a unit-stride pack buffer when incy != 1 and
+ * alpha != 0, otherwise in place) before AXPYF accumulates into it.
+ */
+void bli_dgemv_n_avx2
+     (
+       trans_t transa,
+       conj_t  conjx,
+       dim_t   m,
+       dim_t   n,
+       double* alpha,
+       double* a, inc_t rs_a, inc_t cs_a,
+       double* x, inc_t incx,
+       double* beta,
+       double* y, inc_t incy,
+       cntx_t* cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_4 );
+    double*  A1;
+    double*  x1;
+    dim_t   i;
+    dim_t   f, b_fuse;
+    dim_t   m0, n0;
+    inc_t   rs_at, cs_at;
+    conj_t  conja;
+
+    // Memory pool declarations for packing vector Y.
+    mem_t   mem_bufY;
+    rntm_t  rntm;
+    double* y_temp    = y;
+    inc_t   temp_incy = incy;
+
+    // Boolean to check if y vector is packed and memory needs to be freed.
+    bool is_y_temp_buf_created = FALSE;
+
+    // Update dimensions and strides based on op(A).
+    bli_set_dims_incs_with_trans( transa,
+                                  m, n, rs_a, cs_a,
+                                  &m0, &n0, &rs_at, &cs_at );
+
+    conja = bli_extract_conj( transa );
+
+    // Function pointer declaration for the functions that will be used.
+    daxpyf_ker_ft  axpyf_kr_ptr;        // DAXPYF
+    dscal2v_ker_ft scal2v_kr_ptr;       // DSCAL2V
+    dscalv_ker_ft  scalv_kr_ptr;        // DSCALV
+    dcopyv_ker_ft  copyv_kr_ptr;        // DCOPYV
+
+    // Setting the fuse factor based on bli_daxpyf_zen_int_8 kernel.
+    // SCAL2V comes from the context and is fetched only if y gets packed.
+    b_fuse        = 8;
+    axpyf_kr_ptr  = bli_daxpyf_zen_int_8;       // DAXPYF
+    scalv_kr_ptr  = bli_dscalv_zen_int10;       // DSCALV
+    copyv_kr_ptr  = bli_dcopyv_zen_int;         // DCOPYV
+
+    /*
+      If alpha is equal to zero, y is only scaled by beta and returned.
+      In this case, packing and unpacking y will be costly and it is
+      avoided.
+    */
+    if ( (incy != 1) && (!bli_deq0( *alpha )))
+    {
+        /*
+          Initialize mem pool buffer to NULL and size to 0
+          "buf" and "size" fields are assigned once memory
+          is allocated from the pool in bli_pba_acquire_m().
+          This will ensure bli_mem_is_alloc() will be passed on
+          an allocated memory if created or a NULL .
+        */
+        mem_bufY.pblk.buf = NULL;   mem_bufY.pblk.block_size = 0;
+        mem_bufY.buf_type = 0;      mem_bufY.size = 0;
+        mem_bufY.pool = NULL;
+
+        /* In order to get the buffer from pool via rntm access to memory broker
+        is needed. Following are initializations for rntm */
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_pba_rntm_set_pba( &rntm );
+
+        //calculate the size required for m0 double elements in vector Y.
+        size_t buffer_size = m0 * sizeof(double);
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dgemv_n_avx2(): get mem pool block\n" );
+        #endif
+
+        /* Acquire a Buffer(m0*size(double)) from the memory broker
+        and save the associated mem_t entry to mem_bufY. */
+        bli_pba_acquire_m
+        (
+          &rntm,
+          buffer_size,
+          BLIS_BUFFER_FOR_B_PANEL,
+          &mem_bufY
+        );
+
+        /* Continue packing Y if buffer memory is allocated. */
+        if ( bli_mem_is_alloc( &mem_bufY ) )
+        {
+            y_temp = bli_mem_buffer(&mem_bufY);
+
+            // Stride of vector y_temp
+            temp_incy = 1;
+
+            // Query the context if it is NULL. This will be necessary for Zen architectures
+            if(cntx == NULL) cntx = bli_gks_query_cntx();
+            scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx);
+
+            // Invoke the SCAL2V function using the function pointer
+            scal2v_kr_ptr
+            (
+              BLIS_NO_CONJUGATE,
+              m0,
+              beta,
+              y, incy,
+              y_temp, temp_incy,
+              cntx
+            );
+
+            // Packing succeeded: y_temp now holds beta * y at unit stride and
+            // must be copied back to y and released at the end.
+            is_y_temp_buf_created = TRUE;
+        }
+    }
+
+    // y was not packed (unit stride, alpha == 0, or no pack buffer acquired):
+    // scale it by beta in place so the beta scaling is never skipped.
+    if ( !is_y_temp_buf_created )
+    {
+        scalv_kr_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          m0,
+          beta,
+          y_temp, temp_incy,
+          cntx
+        );
+    }
+
+    if( bli_deq0( *alpha ) )
+    {
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
+        return;
+    }
+
+    for ( i = 0; i < n0; i += f )
+    {
+        f = bli_determine_blocksize_dim_f(i, n0, b_fuse);
+
+        A1 = a + (i * cs_at);
+        x1 = x + (i * incx);
+
+        axpyf_kr_ptr
+        (
+          conja,
+          conjx,
+          m0,
+          f,
+          alpha,
+          A1, rs_at, cs_at,
+          x1, incx,
+          y_temp, temp_incy,
+          cntx
+        );
+    }
+
+    // If y was packed into y_temp, copy the contents back to y and free memory.
+    if ( is_y_temp_buf_created )
+    {
+        // Copy the unit-strided result in y_temp back to the strided y via COPYV.
+        copyv_kr_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          m0,
+          y_temp, temp_incy,
+          y, incy,
+          cntx
+        );
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+        printf( "bli_dgemv_n_avx2(): releasing mem pool block\n" );
+#endif
+        // Return the buffer to pool
+        bli_pba_release( &rntm , &mem_bufY );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_4 );
+}
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -499,3 +499,17 @@ void bli_dgemv_zen_ref
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     );
+
+void bli_dgemv_n_avx2
+     (
+       trans_t transa,
+       conj_t  conjx,
+       dim_t   m,
+       dim_t   n,
+       double* alpha,
+       double* a, inc_t rs_a, inc_t cs_a,
+       double* x, inc_t incx,
+       double* beta,
+       double* y, inc_t incy,
+       cntx_t* cntx
+     );
--- a/kernels/zen4/2/bli_gemv_n_zen_int_avx512.c
+++ b/kernels/zen4/2/bli_gemv_n_zen_int_avx512.c
--- a/kernels/zen4/bli_kernels_zen4.h
+++ b/kernels/zen4/bli_kernels_zen4.h
@@ -109,6 +109,34 @@ DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_8_avx512 )
 DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_4_avx512 )
 DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_2_avx512 )

+// gemv (intrinsics)
+// dgemv_n kernels for handling op(A) = 'n', i.e., transa = 'n' cases.
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx8_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx7_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx6_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx5_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx4_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx3_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx2_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16mx1_avx512 )
+
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_32x8n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16x8n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_8x8n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_m_leftx8n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_32x4n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16x4n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_8x4n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_m_leftx4n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_32x2n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16x2n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_8x2n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_m_leftx2n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_32x1n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_16x1n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_8x1n_avx512 )
+GEMV_KER_PROT( double,  d, gemv_n_zen_int_m_leftx1n_avx512 )
+
 GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_l_zen_asm_16x14)
 GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_zen_asm_16x14)
 GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_l_zen4_asm_8x24)
@@ -358,3 +386,17 @@ void bli_dynamic_blkszs_zen4

 // function for resetting zmm registers after L3 apis
 void bli_zero_zmm();
+
+void bli_dgemv_n_avx512
+     (
+       trans_t transa,
+       conj_t  conjx,
+       dim_t   m,
+       dim_t   n,
+       double* alpha,
+       double* a, inc_t rs_a, inc_t cs_a,
+       double* x, inc_t incx,
+       double* beta,
+       double* y, inc_t incy,
+       cntx_t* cntx
+     );