Implemented Multithreading Support and Optimization of DGEMV API (#10)

- Implemented multithreading framework for the DGEMV API on Zen architectures. Architecture specific AOCL-dynamic logic determines the optimal number of threads for improved performance.

- The condition check for the value of beta is optimized by utilizing masked operations. The mask value is set based on the value of beta, and the masked operations are applied when the vector y is loaded or scaled with beta.

AMD-Internal: [CPUPL-6746]
This commit is contained in:
S, Hari Govind
2025-06-17 12:39:48 +05:30
committed by GitHub
parent 26e5c63781
commit e097346658
10 changed files with 1370 additions and 1722 deletions

View File

@@ -216,31 +216,13 @@ void bli_dgemv_unf_var1
inc_t lda = cs_a, inca = rs_a;
conj_t conja;
double *a_buf = a;
double *x_buf = x;
double *y_buf = y;
inc_t buf_incx = incx;
inc_t buf_incy = incy;
// 'bli_dgemv_unf_var1' is dot-based kernel. This kernel is called for the following cases:
//
// When op(A) = n and row-storage( lda = rs_a ), we compute dot product as y[i] = <A(i,:), x>, i = 0:m-1.
// gemv dot kernel always computes dot-product along the columns of A, we interchange m and n. Here m0 = n, n0 = m.
//
// op(A) = n -> lda = rs_a;
// inca = cs_a;
// m0 = n;
// n0 = m;
//
// when op(A) = t and col-storage( lda = cs_a ), we compute dot product as y[i] = <A(:, i), x>, i = 0:n-1. Anyways
// the kernel computes dot along the columns of A, we don't interchange m & n, so here m0 = m and n0 = n.
//
// op(A) = t -> lda = cs_a;
// inca = rs_a;
// m0 = m;
// n0 = n;
//
// Invoking the reference kernel to handle general stride.
if ( ( rs_a != 1 ) && ( cs_a != 1 ) )
{
@@ -261,10 +243,29 @@ void bli_dgemv_unf_var1
return;
}
// 'bli_dgemv_unf_var1' is dot-based kernel. This kernel is called for the following cases:
//
// When op(A) = n and row-storage( lda = rs_a ), we compute dot product as y[i] = <A(i,:), x>, i = 0:m-1.
// gemv dot kernel always computes dot-product along the columns of A, we interchange m and n. Here m0 = n, n0 = m.
//
// op(A) = n -> lda = rs_a;
// inca = cs_a;
// m0 = n;
// n0 = m;
//
// when op(A) = t and col-storage( lda = cs_a ), we compute dot product as y[i] = <A(:, i), x>, i = 0:n-1. Anyways
// the kernel computes dot along the columns of A, we don't interchange m & n, so here m0 = m and n0 = n.
//
// op(A) = t -> lda = cs_a;
// inca = rs_a;
// m0 = m;
// n0 = n;
//
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n0, &m0, &lda, &inca);
// Extract the conjugation from transa.
conja = bli_extract_conj(transa);
//memory pool declarations for packing vector X and Y.
@@ -288,21 +289,34 @@ void bli_dgemv_unf_var1
*/
arch_t id = bli_arch_query_id();
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
// Setting the threshold to invoke the fast-path
// The fast-path is intended to directly call the kernel
// in case the criteria for single threaded execution is met.
dim_t fast_path_thresh = 0;
#endif
switch (id)
{
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN5)
gemv_kr_ptr = bli_dgemv_t_zen_int_avx512;
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen5_asm_avx512; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 12000;
#endif
break;
#endif
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
gemv_kr_ptr = bli_dgemv_t_zen_int_avx512;
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen4_asm_avx512; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 11000;
#endif
break;
#endif
@@ -310,9 +324,12 @@ void bli_dgemv_unf_var1
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
gemv_kr_ptr = bli_dgemv_t_zen_int_avx2;
gemv_kr_ptr = bli_dgemv_t_zen_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 13000;
#endif
break;
default:
@@ -329,20 +346,22 @@ void bli_dgemv_unf_var1
PASTECH(d,dotxf_ker_ft) kfp_df;
// Query the context for the kernel function pointer and fusing factor.
// Query the context for the ddotxf kernel function pointer and fusing factor.
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
//
for ( i = 0; i < n0; i += f )
{
// Determine the blocksize for the current iteration.
f = bli_determine_blocksize_dim_f( i, n0, b_fuse );
A1 = a + ( i * lda ) + ( 0 * inca );
// Calculate the pointers to the current block of A, x, and y.
A1 = a_buf + ( i * lda ) + ( 0 * inca );
x1 = x_buf;
y1 = y + ( i * incy );
y1 = y_buf + ( i * incy );
// y1 = beta * y1 + alpha * A1 * x;
// kfp_df is a function pointer to the dotxf kernel
kfp_df
(
conja,
@@ -415,7 +434,7 @@ void bli_dgemv_unf_var1
// Using unit-stride for y_temp vector.
buf_incy = 1;
// Invoke the SCAL2V function using the function pointer.
// Invoke the COPYV function using the function pointer.
copyv_kr_ptr
(
BLIS_NO_CONJUGATE,
@@ -493,6 +512,7 @@ void bli_dgemv_unf_var1
// stride of vector x_buf =1
buf_incx = 1;
// Invoke the COPYV function using the function pointer.
copyv_kr_ptr
(
BLIS_NO_CONJUGATE,
@@ -502,27 +522,120 @@ void bli_dgemv_unf_var1
cntx
);
// Set x is packed as the memory allocation was successful
// and contents have been copied to a temp buffer.
is_x_temp_buf_created = TRUE;
}
}
// Calling the selected kernel for the API
// If the increments of x and y are unit stride, we can use the
// optimized kernel path. The optimized kernel does not support
// non-unit stride for x and y.
if ( buf_incx == 1 && buf_incy == 1 )
{
gemv_kr_ptr
(
conja,
conjx,
m0,
n0,
alpha,
a, inca, lda,
x_buf, buf_incx,
beta,
y_buf, buf_incy,
cntx
);
#if defined(BLIS_ENABLE_OPENMP)
// If the problem size is small, we can use a fast-path to avoid
// the overhead of threading.
if ( ((n0 * m0) <= fast_path_thresh) || ((n0 < 100) && (m0 < 100)) )
{
#endif
// Call the DGEMV kernel directly with the packed buffers.
gemv_kr_ptr
(
conja,
conjx,
m0,
n0,
alpha,
a_buf, inca, lda,
x_buf, buf_incx,
beta,
y_buf, buf_incy,
cntx
);
#if defined(BLIS_ENABLE_OPENMP)
}
else
{
// Initializing nt as 1 to avoid compiler warnings
dim_t nt = 1;
/*
For the given problem size and architecture, the function
returns the optimum number of threads with AOCL dynamic enabled
else it returns the number of threads requested by the user.
*/
bli_nthreads_l2
(
BLIS_GEMV_KER,
BLIS_DOUBLE,
BLIS_TRANSPOSE,
id,
n0,
m0,
&nt
);
_Pragma("omp parallel num_threads(nt)")
{
dim_t start, end;
thrinfo_t thread;
// The factor by which the size should be a multiple during thread partition.
// The main loop of the kernel can handle 8 elements at a time hence 8 is selected for block_size.
dim_t block_size = 8;
// Get the thread ID
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread );
// Get the actual number of threads spawned
bli_thrinfo_set_n_way( omp_get_num_threads(), &thread );
/*
Calculate the compute range (start and end) for the current thread
based on the actual number of threads spawned
*/
bli_thread_range_sub
(
&thread,
n0,
block_size,
FALSE,
&start,
&end
);
// Calculating the value of n for the particular thread
dim_t n_thread_local = end - start;
// Calculating thread specific pointers
double *a_thread_local = a_buf + (start * lda);
double *y_thread_local = y_buf + start;
double *x_thread_local = x_buf;
// Call the DGEMV kernel with the thread-local pointers.
gemv_kr_ptr
(
conja,
conjx,
m0,
n_thread_local,
alpha,
a_thread_local, inca, lda,
x_thread_local, buf_incx,
beta,
y_thread_local, buf_incy,
cntx
);
}
}
#endif
}
// If the increments of x and y are not unit stride, we call the reference kernel.
else
{
bli_dgemv_zen_ref

View File

@@ -2784,3 +2784,270 @@ void bli_nthreads_l1f
#endif
}
/*
  Functionality:
  --------------
  Selects the AOCL dynamic thread count for the L2 DGEMV API based on the
  architecture ID and the size of the input matrix.

  Function signature
  -------------------
  This function takes the following input:
  * 'arch_id'  - Architecture ID of the system (copy of BLIS global arch id)
  * 'm_elem'   - Number of rows in the matrix
  * 'n_elem'   - Number of columns in the matrix
  * 'variant'  - Transpose / Non-Transpose variant of the kernel
  * 'nt_ideal' - [out] Ideal number of threads

  Exception
  ----------
  1. For non-Zen architectures, *nt_ideal is set to -1. The expectation is
     that this is handled in the higher layer.
*/
BLIS_INLINE void aocl_dgemv_dynamic
     (
       arch_t  arch_id,
       dim_t   m_elem,
       dim_t   n_elem,
       trans_t variant,
       dim_t*  nt_ideal
     )
{
    // The tuning tables below are keyed on the total element count.
    const dim_t size = n_elem * m_elem;

    // Default: AOCL dynamic makes no recommendation. This covers the
    // non-transpose variant, unknown architectures, and sizes beyond the
    // last tuned threshold.
    *nt_ideal = -1;

    // Tuned thresholds currently exist only for the transpose case.
    if ( variant != BLIS_TRANSPOSE ) return;

    // Pick the AOCL dynamic logic based on the architecture ID. Each table
    // lists exclusive upper size bounds and the matching thread counts;
    // the first bound that exceeds 'size' decides the recommendation.
    switch ( arch_id )
    {
        case BLIS_ARCH_ZEN5:
        {
            static const dim_t bound[] = { 12000, 27500, 758000, 1580000,
                                           3390000, 10140000, 14600000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32, 64, 96 };

            for ( dim_t i = 0; i < 7; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        case BLIS_ARCH_ZEN4:
        {
            static const dim_t bound[] = { 11000, 34500, 707000, 1870000,
                                           4800000, 9000000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32, 64 };

            for ( dim_t i = 0; i < 6; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        case BLIS_ARCH_ZEN:
        case BLIS_ARCH_ZEN2:
        case BLIS_ARCH_ZEN3:
        {
            static const dim_t bound[] = { 13000, 17300, 300000, 640000,
                                           1700000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32 };

            for ( dim_t i = 0; i < 5; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        default:
            // Other architectures: leave *nt_ideal at -1 so the higher
            // layer falls back to the user-requested thread count.
            break;
    }
}
/*
  Functionality:
  --------------
  This function does the following:
  1. Reads the number of threads requested by the user from the local
     runtime (rntm) object.
  2. Acts as the gateway to the AOCL dynamic logic if AOCL dynamic is
     enabled, and alters the thread count accordingly.

  Function signature
  -------------------
  This function takes the following input:
  * 'ker_id'    - ID of kernel invoking this function
  * 'data_type' - Datatype of kernel
  * 'variant'   - Transpose / Non-Transpose variant of the kernel
  * 'arch_id'   - Architecture ID of the system (copy of BLIS global arch id)
  * 'm_elem'    - Number of rows in the matrix
  * 'n_elem'    - Number of columns in the matrix
  * 'nt_ideal'  - [out] Number of threads to spawn; with AOCL dynamic
                  enabled it is clamped to at least 1 so the caller's
                  OpenMP num_threads(nt) clause is always valid.

  Exception
  ----------
  None
*/
void bli_nthreads_l2
     (
       l2kr_t  ker_id,
       num_t   data_type,
       trans_t variant,
       arch_t  arch_id,
       dim_t   m_elem,
       dim_t   n_elem,
       dim_t*  nt_ideal
     )
{
#ifdef AOCL_DYNAMIC
    /*
      This code section dispatches the AOCL dynamic logic kernel for
      L2 APIs based on the kernel ID and the data type.
    */
    // Function pointer to the AOCL Dynamic logic kernel.
    void (*aocl_dynamic_func_l2)( arch_t, dim_t, dim_t, trans_t, dim_t* ) = NULL;

    // Pick the AOCL dynamic thread-decision kernel based on the kernel ID.
    switch ( ker_id )
    {
        case BLIS_GEMV_KER:
            if ( data_type == BLIS_DOUBLE )
            {
                // Function for DGEMV.
                aocl_dynamic_func_l2 = aocl_dgemv_dynamic;
            }
            else
            {
                *nt_ideal = -1;
            }
            break;
        default:
            /*
              For kernels that do not have AOCL dynamic logic,
              use the number of threads requested by the user.
            */
            *nt_ideal = -1;
    }

    /*
      For APIs that do not have AOCL dynamic logic,
      aocl_dynamic_func_l2 will be NULL.
    */
    if ( aocl_dynamic_func_l2 != NULL )
    {
        // Call the AOCL dynamic logic kernel.
        aocl_dynamic_func_l2
        (
          arch_id,
          m_elem,
          n_elem,
          variant,
          nt_ideal
        );

        if ( *nt_ideal == 1 )
        {
            // Return early when the number of threads is 1.
            return;
        }
    }
#endif

    rntm_t rntm_local;

    // Initialize a local runtime object with the global settings.
    bli_rntm_init_from_global( &rntm_local );

    // Query the total number of threads from the rntm_t object.
    dim_t nt_rntm = bli_rntm_num_threads( &rntm_local );

    if ( nt_rntm <= 0 )
    {
        // nt is less than one if BLIS manual setting of parallelism
        // has been used. Parallelism here will be product of values.
        nt_rntm = bli_rntm_calc_num_threads( &rntm_local );
    }

#ifdef AOCL_DYNAMIC
    // Calculate the actual number of threads that will be spawned.
    if ( *nt_ideal != -1 )
    {
        // This branch is taken for all Zen architectures: never exceed
        // the user-requested thread count.
        *nt_ideal = bli_min( nt_rntm, *nt_ideal );
    }
    else
    {
        /*
          For non-Zen architectures and very large sizes,
          spawn the actual number of threads requested.
        */
        *nt_ideal = nt_rntm;
    }

    /*
      When the number of elements to be processed is less
      than the number of threads, spawn n_elem threads instead.
    */
    if ( n_elem < *nt_ideal )
    {
        *nt_ideal = n_elem;
    }

    // Guard against a degenerate (zero-sized) dimension: the clamp above
    // would otherwise request zero threads, and an OpenMP
    // num_threads(0) clause in the caller is invalid.
    if ( *nt_ideal < 1 )
    {
        *nt_ideal = 1;
    }
#else
    // AOCL dynamic disabled: spawn the number of threads requested.
    *nt_ideal = nt_rntm;
#endif
}

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -137,6 +137,17 @@ void bli_nthreads_l1f
dim_t* nt_ideal
);
void bli_nthreads_l2
(
l2kr_t ker_id,
num_t data_type,
trans_t variant,
arch_t arch_id,
dim_t m_elem,
dim_t n_elem,
dim_t* nt_ideal
);
// Runtime object type (defined in bli_type_defs.h)
/*

View File

@@ -668,6 +668,13 @@ typedef enum
#define BLIS_NUM_LEVEL1F_KERS 5
// IDs for level-2 kernels that route through bli_nthreads_l2; used to
// select the matching AOCL dynamic thread-count logic for each kernel.
typedef enum
{
    BLIS_GEMV_KER = 0,
    BLIS_TRSV_KER
} l2kr_t;
#define BLIS_NUM_LEVEL2_KERS 2
typedef enum
{

View File

@@ -76,22 +76,22 @@
#define K_bli_zgemmsup_cd_zen4_asm_8x2 1
#define K_bli_zgemmsup_cd_zen4_asm_8x4 1
#define K_bli_dgemmsup_rv_zen4_asm_24x8m_new 1
#define K_bli_dgemv_t_zen_int_avx2 1
#define K_bli_dgemv_t_zen_int_16x7m_avx2 1
#define K_bli_dgemv_t_zen_int_16x6m_avx2 1
#define K_bli_dgemv_t_zen_int_16x5m_avx2 1
#define K_bli_dgemv_t_zen_int_16x4m_avx2 1
#define K_bli_dgemv_t_zen_int_16x3m_avx2 1
#define K_bli_dgemv_t_zen_int_16x2m_avx2 1
#define K_bli_dgemv_t_zen_int_16x1m_avx2 1
#define K_bli_dgemv_t_zen_int_avx512 1
#define K_bli_dgemv_t_zen_int_32x7m_avx512 1
#define K_bli_dgemv_t_zen_int_32x6m_avx512 1
#define K_bli_dgemv_t_zen_int_32x5m_avx512 1
#define K_bli_dgemv_t_zen_int_32x4m_avx512 1
#define K_bli_dgemv_t_zen_int_32x3m_avx512 1
#define K_bli_dgemv_t_zen_int_32x2m_avx512 1
#define K_bli_dgemv_t_zen_int_32x1m_avx512 1
#define K_bli_dgemv_t_zen_int 1
#define K_bli_dgemv_t_zen_int_16x7m 1
#define K_bli_dgemv_t_zen_int_16x6m 1
#define K_bli_dgemv_t_zen_int_16x5m 1
#define K_bli_dgemv_t_zen_int_16x4m 1
#define K_bli_dgemv_t_zen_int_16x3m 1
#define K_bli_dgemv_t_zen_int_16x2m 1
#define K_bli_dgemv_t_zen_int_16x1m 1
#define K_bli_dgemv_t_zen4_int 1
#define K_bli_dgemv_t_zen4_int_32x7m 1
#define K_bli_dgemv_t_zen4_int_32x6m 1
#define K_bli_dgemv_t_zen4_int_32x5m 1
#define K_bli_dgemv_t_zen4_int_32x4m 1
#define K_bli_dgemv_t_zen4_int_32x3m 1
#define K_bli_dgemv_t_zen4_int_32x2m 1
#define K_bli_dgemv_t_zen4_int_32x1m 1
#define K_bli_ztrsm_small_ZEN5 1
#define K_bli_dgemv_n_zen_int_16mx8_avx512 1
#define K_bli_dgemv_n_zen_int_16mx7_avx512 1

View File

@@ -112,12 +112,12 @@ TEST_P( dgemvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
// Unit-tests
#ifdef K_bli_dgemv_t_zen_int_avx2
#ifdef K_bli_dgemv_t_zen_int
INSTANTIATE_TEST_SUITE_P(
dgemv_t_primary_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_avx2),
::testing::Values(bli_dgemv_t_zen_int),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -133,7 +133,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Range( gtint_t(1), gtint_t(16), gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x (non-unit incx is handled by frame)
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -143,12 +143,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x7m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x7m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx7_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x7m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x7m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -164,7 +164,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(7)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x6m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x6m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx6_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x6m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x6m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -195,7 +195,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(6)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -205,12 +205,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x5m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x5m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx5_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x5m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x5m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -226,7 +226,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(5)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x4m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x4m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx4_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x4m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x4m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -257,7 +257,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(4)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -267,12 +267,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x3m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x3m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx3_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x3m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x3m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -288,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(3)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -298,12 +298,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x2m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x2m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx2_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x2m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x2m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -319,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(2)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -329,12 +329,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x1m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x1m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx1_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x1m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x1m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -350,7 +350,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -363,12 +363,12 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Unit-tests
#ifdef K_bli_dgemv_t_zen_int_avx512
#ifdef K_bli_dgemv_t_zen4_int
INSTANTIATE_TEST_SUITE_P(
dgemv_t_primary_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_avx512),
::testing::Values(bli_dgemv_t_zen4_int),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -384,7 +384,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Range( gtint_t(1), gtint_t(16), gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x (non-unit incx is handled by frame)
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -394,12 +394,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x7m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x7m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx7_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x7m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x7m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -415,7 +415,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(7)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -425,12 +425,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x6m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x6m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx6_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x6m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x6m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -446,7 +446,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(6)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -456,12 +456,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x5m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x5m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx5_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x5m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x5m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -477,7 +477,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(5)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -487,12 +487,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x4m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x4m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx4_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x4m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x4m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -508,7 +508,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(4)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -518,12 +518,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x3m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x3m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx3_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x3m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x3m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -539,7 +539,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(3)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -549,12 +549,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x2m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x2m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx2_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x2m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x2m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -570,7 +570,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(2)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -580,12 +580,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x1m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x1m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx1_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x1m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x1m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -601,7 +601,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a

File diff suppressed because it is too large Load Diff

View File

@@ -160,14 +160,14 @@ GEMV_KER_PROT( scomplex, c, gemv_zen_int_4x4 )
GEMV_KER_PROT( dcomplex, z, gemv_zen_int_4x4 )
// gemv (intrinsics)
GEMV_KER_PROT( double, d, gemv_t_zen_int_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x7m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x6m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x5m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x4m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x3m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x2m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x1m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x7m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x6m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x5m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x4m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x3m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x2m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x1m )
// her (intrinsics)
HER_KER_PROT( dcomplex, z, her_zen_int_var1 )

File diff suppressed because it is too large Load Diff

View File

@@ -146,14 +146,14 @@ GEMV_KER_PROT( double, d, gemv_n_zen_int_8x1n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx1n_avx512 )
// dgemv_t kernels for handling op(A) = 't', i.e., transa = 't' cases.
GEMV_KER_PROT( double, d, gemv_t_zen_int_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x7m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x6m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x5m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x4m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x3m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x2m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x1m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen4_int )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x7m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x6m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x5m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x4m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x3m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x2m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x1m )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_16x14)
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_16x14)