From 4fcc4e499d807259f510af1c1f750402107ecdaa Mon Sep 17 00:00:00 2001
From: Meghana Vankadari <Meghana.Vankadari@amd.com>
Date: Thu, 14 May 2020 11:04:05 +0530
Subject: [PATCH] Optimized DGEMV kernel and changed BLAS interface call

Details:
- Optimized daxpyf kernel with fuse_factor=5 and iter_unroll=2.
- Modified framework files of dgemv to remove dependency on cntx variable.
- Updated cntx_init file of zen2 to choose optimized kernels.
- Modified BLAS interface call for DGEMV to reduce framework overhread.
- Currently these changes are applicable for zen2 configuration.
  They will be enabled for zen family processors in future.
- Changed naming convention for new BLAS macros to indicate their use.
- Added new optimized kernel for axpyf under zen2 folder.
- Implemented basic GEMV kernel without using axpyv or axpyf.
  This kernel is chosen for small sizes.

Change-Id: I4278d37e494854879c71499b8b9da8c5dbe3bf5b
Signed-off-by: Meghana Vankadari <Meghana.Vankadari@amd.com>
AMD-Internal: [CPUPL-885]
---
 common.mk                               |   2 +
 config/zen2/bli_cntx_init_zen2.c        |   4 +-
 frame/2/bli_l2_ker.h                    |  50 +++
 frame/2/bli_l2_ker_prot.h               |  54 +++
 frame/2/gemv/bli_gemv_unf_var1.c        |  74 +++-
 frame/2/gemv/bli_gemv_unf_var2.c        |  81 ++++-
 frame/3/trsm/bli_trsm_front.c           |   9 +-
 frame/compat/bla_amax.c                 |   2 +-
 frame/compat/bla_axpy.c                 |   2 +-
 frame/compat/bla_copy.c                 |   2 +-
 frame/compat/bla_dot.c                  |   2 +-
 frame/compat/bla_gemv.c                 | 449 ++++++++++++++++++++----
 frame/compat/bla_scal.c                 |   2 +-
 frame/compat/bla_swap.c                 |   2 +-
 frame/include/bli_arch_config.h         |   8 +-
 frame/include/bli_gentfunc_macro_defs.h |  14 +-
 frame/include/blis.h                    |   3 +-
 kernels/zen2/.gitignore                 |   4 -
 kernels/zen2/1f/bli_axpyf_zen_int_5.c   | 326 +++++++++++++++++
 kernels/zen2/2/bli_gemv_zen_ref.c       | 103 ++++++
 kernels/zen2/bli_kernels_zen2.h         |  45 +++
 21 files changed, 1138 insertions(+), 100 deletions(-)
 create mode 100644 frame/2/bli_l2_ker.h
 create mode 100644 frame/2/bli_l2_ker_prot.h
 delete mode 100644 kernels/zen2/.gitignore
 create mode 100644 kernels/zen2/1f/bli_axpyf_zen_int_5.c
 create mode 100644 kernels/zen2/2/bli_gemv_zen_ref.c
 create mode 100644 kernels/zen2/bli_kernels_zen2.h

diff --git a/common.mk b/common.mk
index e1ec24a29..cc53eaad2 100644
--- a/common.mk
+++ b/common.mk
@@ -5,6 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2020, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -153,6 +154,7 @@ get-aocldtl-cflags-for     = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
 get-kernel-cflags-for    = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
                                    $(call load-var-for,CKVECFLAGS,$(1)) \
                                    $(call get-noopt-cflags-for,$(1)) \
+                                   $(COMPSIMDFLAGS) \
                                    $(BUILD_CPPFLAGS) \
                                    $(BUILD_SYMFLAGS) \
                             )
diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c
index 5dd8c041b..205ae774a 100644
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -69,7 +69,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  4,
 	  // axpyf
 	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -128,7 +128,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
 #endif
 
-	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     5,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );
 
 	// Update the context with the current architecture's register and cache
diff --git a/frame/2/bli_l2_ker.h b/frame/2/bli_l2_ker.h
new file mode 100644
index 000000000..ca6e1d247
--- /dev/null
+++ b/frame/2/bli_l2_ker.h
@@ -0,0 +1,50 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Define template prototypes for level-2 kernels.
+//
+
+// Note: Instead of defining function prototype macro templates and then
+// instantiating those macros to define the individual function prototypes,
+// we simply alias the official operations' prototypes as defined in
+// bli_l2_ker_prot.h.
+
+#undef  GENTPROT
+#define GENTPROT GEMV_KER_PROT
+
+INSERT_GENTPROT_BASIC0( gemv_ker_name )
+
diff --git a/frame/2/bli_l2_ker_prot.h b/frame/2/bli_l2_ker_prot.h
new file mode 100644
index 000000000..15888a55f
--- /dev/null
+++ b/frame/2/bli_l2_ker_prot.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Define template prototypes for level-1f kernels.
+//
+
+#define GEMV_KER_PROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+       dim_t            m, \
+       dim_t            n, \
+       ctype*  restrict alpha, \
+       ctype*  restrict a, inc_t lda, \
+       ctype*  restrict x, inc_t incx, \
+       ctype*  restrict beta, \
+       ctype*  restrict y, inc_t incy, \
+       cntx_t* restrict cntx  \
+     );
+
diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c
index e392e830e..cfbaa8fd4 100644
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      ) \
 { \
+	bli_init_once(); \
+\
+	if( cntx == NULL) cntx = bli_gks_query_cntx(); \
+\
 	const num_t dt = PASTEMAC(ch,type); \
 \
 	ctype*  A1; \
@@ -100,5 +105,72 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
+#ifdef BLIS_CONFIG_ZEN2
+void bli_dgemv_unf_var1
+     (
+       trans_t transa,
+       conj_t  conjx,
+       dim_t   m,
+       dim_t   n,
+       double*  alpha,
+       double*  a, inc_t rs_a, inc_t cs_a,
+       double*  x, inc_t incx,
+       double*  beta,
+       double*  y, inc_t incy,
+       cntx_t* cntx 
+     )
+{
 
+	double*  A1;
+	double*  x1;
+	double*  y1;
+	dim_t   i;
+	dim_t   b_fuse, f;
+	dim_t   n_elem, n_iter;
+	inc_t   rs_at, cs_at;
+	conj_t  conja;
+
+	bli_init_once();
+
+	if( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	bli_set_dims_incs_with_trans( transa,
+	                              m, n, rs_a, cs_a,
+	                              &n_iter, &n_elem, &rs_at, &cs_at );
+
+	conja = bli_extract_conj( transa );
+
+
+	/* Query the context for the kernel function pointer and fusing factor. */
+	b_fuse = 8;
+ 
+	for ( i = 0; i < n_iter; i += f )
+	{
+		f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
+
+		A1 = a + (i  )*rs_at + (0  )*cs_at;
+		x1 = x + (0  )*incy;
+		y1 = y + (i  )*incy;
+
+		/* y1 = beta * y1 + alpha * A1 * x; */
+		bli_ddotxf_zen_int_8
+		(
+		  conja,
+		  conjx,
+		  n_elem,
+		  f,
+		  alpha,
+		  A1,   cs_at, rs_at,
+		  x1,   incx,
+		  beta,
+		  y1,   incy,
+		  cntx 
+		);
+
+	}
+}
+
+INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var1 )
+#else
+INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
+#endif
diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c
index fe7702e4c..403bd4f78 100644
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
        cntx_t* cntx  \
      ) \
 { \
+	bli_init_once(); \
+\
+	if(cntx == NULL) cntx = bli_gks_query_cntx(); \
+\
 	const num_t dt = PASTEMAC(ch,type); \
 \
 	ctype*  zero       = PASTEMAC(ch,0); \
@@ -127,5 +132,79 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+#ifdef BLIS_CONFIG_ZEN2
 
+void bli_dgemv_unf_var2
+     (
+       trans_t transa,
+       conj_t  conjx,
+       dim_t   m,
+       dim_t   n,
+       double*  alpha,
+       double*  a, inc_t rs_a, inc_t cs_a,
+       double*  x, inc_t incx,
+       double*  beta,
+       double*  y, inc_t incy,
+       cntx_t* cntx 
+     )
+{
+
+	double*  A1;
+	double*  x1;
+	double*  y1;
+	dim_t   i;
+	dim_t   b_fuse, f;
+	dim_t   n_elem, n_iter;
+	inc_t   rs_at, cs_at;
+	conj_t  conja;
+
+	bli_set_dims_incs_with_trans( transa,
+	                              m, n, rs_a, cs_a,
+	                              &n_elem, &n_iter, &rs_at, &cs_at );
+
+	conja = bli_extract_conj( transa );
+
+	/* If beta is zero, use setv. Otherwise, scale by beta. */
+		/* y = beta * y; */
+	/* beta=0 case is hadled by scalv internally */
+
+		bli_dscalv_zen_int10
+		(
+		  BLIS_NO_CONJUGATE,
+		  n_elem,
+		  beta,
+		  y, incy,
+		  NULL
+		);
+
+	/* Query the context for the kernel function pointer and fusing factor. */
+	b_fuse = 5;
+
+	for ( i = 0; i < n_iter; i += f )
+	{
+		f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
+
+		A1 = a + (0  )*rs_at + (i  )*cs_at;
+		x1 = x + (i  )*incx;
+		y1 = y + (0  )*incy;
+
+		/* y = y + alpha * A1 * x1; */
+		bli_daxpyf_zen_int_5
+		(
+		  conja,
+		  conjx,
+		  n_elem,
+		  f,
+		  alpha,
+		  A1, rs_at, cs_at,
+		  x1, incx,
+		  y1, incy,
+		  NULL
+		);
+	}
+}
+
+INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var2 )
+#else
+INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
+#endif
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index eef982e0f..6685e756c 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -48,16 +48,11 @@ void bli_trsm_front
      )
 {
 	bli_init_once();
-	int i, j;
+
 	obj_t   a_local;
 	obj_t   b_local;
 	obj_t   c_local;
  
-//int m = bli_obj_length(*b);
-//int n = bli_obj_width(*b); 
-//float *L =  a->buffer;
- // float *B =  b->buffer;
-
 #ifdef PRINT_SMALL_TRSM_INFO
         printf("Side:: %c\n", side ? 'R' : 'L');
         if (bli_obj_datatype(*a) == BLIS_FLOAT)
diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c
index 33fe83b90..cad33e1bf 100644
--- a/frame/compat/bla_amax.c
+++ b/frame/compat/bla_amax.c
@@ -238,7 +238,7 @@ f77_int idamax_
     return f77_index;
 }
 
-INSERT_GENTFUNC_BLAS_ZEN2( amax, amaxv )
+INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
 #else
 INSERT_GENTFUNC_BLAS( amax, amaxv )
 #endif
diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c
index 0c0943419..ad1689898 100644
--- a/frame/compat/bla_axpy.c
+++ b/frame/compat/bla_axpy.c
@@ -224,7 +224,7 @@ void daxpy_
 //	bli_finalize_auto();
 }
 
-INSERT_GENTFUNC_BLAS_ZEN2( axpy, axpyv )
+INSERT_GENTFUNC_BLAS_CZ( axpy, axpyv )
 
 #else
 INSERT_GENTFUNC_BLAS( axpy, axpyv )
diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c
index 369476b19..d73796664 100644
--- a/frame/compat/bla_copy.c
+++ b/frame/compat/bla_copy.c
@@ -236,7 +236,7 @@ void dcopy_
 //    bli_finalize_auto();
 }
 
-INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
+INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
 #else
 INSERT_GENTFUNC_BLAS(copy, copyv)
 #endif
diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c
index ed97d89b1..eaf2022af 100644
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -247,7 +247,7 @@ double ddot_
 	return rho;
 }
 
-INSERT_GENTFUNCDOT_BLAS_ZEN2( dot, dotv )
+INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
 #else
 INSERT_GENTFUNCDOT_BLAS( dot, dotv )
 #endif
diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c
index 85c65dde4..9e117f537 100644
--- a/frame/compat/bla_gemv.c
+++ b/frame/compat/bla_gemv.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -53,90 +54,396 @@ void PASTEF77(ch,blasname) \
              ftype*    y, const f77_int* incy  \
      ) \
 { \
-	trans_t blis_transa; \
-	dim_t   m0, n0; \
-	dim_t   m_y, n_x; \
-	ftype*  x0; \
-	ftype*  y0; \
-	inc_t   incx0; \
-	inc_t   incy0; \
-	inc_t   rs_a, cs_a; \
+    trans_t blis_transa; \
+    dim_t   m0, n0; \
+    dim_t   m_y, n_x; \
+    ftype*  x0; \
+    ftype*  y0; \
+    inc_t   incx0; \
+    inc_t   incy0; \
+    inc_t   rs_a, cs_a; \
 \
-	/* Initialize BLIS. */ \
-	bli_init_auto(); \
+    /* Initialize BLIS. */ \
+    bli_init_auto(); \
 \
-	/* Perform BLAS parameter checking. */ \
-	PASTEBLACHK(blasname) \
-	( \
-	  MKSTR(ch), \
-	  MKSTR(blasname), \
-	  transa, \
-	  m, \
-	  n, \
-	  lda, \
-	  incx, \
-	  incy  \
-	); \
+    /* Perform BLAS parameter checking. */ \
+    PASTEBLACHK(blasname) \
+    ( \
+      MKSTR(ch), \
+      MKSTR(blasname), \
+      transa, \
+      m, \
+      n, \
+      lda, \
+      incx, \
+      incy  \
+    ); \
 \
-	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
-	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
 \
-	/* Convert/typecast negative values of m and n to zero. */ \
-	bli_convert_blas_dim1( *m, m0 ); \
-	bli_convert_blas_dim1( *n, n0 ); \
+    /* Convert/typecast negative values of m and n to zero. */ \
+    bli_convert_blas_dim1( *m, m0 ); \
+    bli_convert_blas_dim1( *n, n0 ); \
 \
-	/* Determine the dimensions of x and y so we can adjust the increments,
-	   if necessary.*/ \
-	bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
+    /* Determine the dimensions of x and y so we can adjust the increments,
+       if necessary.*/ \
+    bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
 \
-	/* BLAS handles cases where trans(A) has no columns, and x has no elements,
-	   in a peculiar way. In these situations, BLAS returns without performing
-	   any action, even though most sane interpretations of gemv would have the
-	   the operation reduce to y := beta * y. Here, we catch those cases that
-	   BLAS would normally mishandle and emulate the BLAS exactly so as to
-	   provide "bug-for-bug" compatibility. Note that this extreme level of
-	   compatibility would not be as much of an issue if it weren't for the
-	   fact that some BLAS test suites actually test for these cases. Also, it
-	   should be emphasized that BLIS, if called natively, does NOT exhibit
-	   this quirky behavior; it will scale y by beta, as one would expect. */ \
-	if ( m_y > 0 && n_x == 0 ) \
-	{ \
-		/* Finalize BLIS. */ \
-		bli_finalize_auto(); \
+    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
+       in a peculiar way. In these situations, BLAS returns without performing
+       any action, even though most sane interpretations of gemv would have the
+       the operation reduce to y := beta * y. Here, we catch those cases that
+       BLAS would normally mishandle and emulate the BLAS exactly so as to
+       provide "bug-for-bug" compatibility. Note that this extreme level of
+       compatibility would not be as much of an issue if it weren't for the
+       fact that some BLAS test suites actually test for these cases. Also, it
+       should be emphasized that BLIS, if called natively, does NOT exhibit
+       this quirky behavior; it will scale y by beta, as one would expect. */ \
+    if ( m_y > 0 && n_x == 0 ) \
+    { \
+        /* Finalize BLIS. */ \
+        bli_finalize_auto(); \
 \
-		return; \
-	} \
+        return; \
+    } \
 \
-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ \
-	bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
-	bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */ \
+    bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
+    bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
 \
-	/* Set the row and column strides of A. */ \
-	rs_a = 1; \
-	cs_a = *lda; \
+    /* Set the row and column strides of A. */ \
+    rs_a = 1; \
+    cs_a = *lda; \
 \
-	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
-	( \
-	  blis_transa, \
-	  BLIS_NO_CONJUGATE, \
-	  m0, \
-	  n0, \
-	  (ftype*)alpha, \
-	  (ftype*)a,  rs_a, cs_a, \
-	  x0, incx0, \
-	  (ftype*)beta, \
-	  y0, incy0, \
-	  NULL, \
-	  NULL  \
-	); \
+    /* Call BLIS interface. */ \
+    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+    ( \
+      blis_transa, \
+      BLIS_NO_CONJUGATE, \
+      m0, \
+      n0, \
+      (ftype*)alpha, \
+      (ftype*)a,  rs_a, cs_a, \
+      x0, incx0, \
+      (ftype*)beta, \
+      y0, incy0, \
+      NULL, \
+      NULL  \
+    ); \
 \
-	/* Finalize BLIS. */ \
-	bli_finalize_auto(); \
+    /* Finalize BLIS. */ \
+    bli_finalize_auto(); \
 }
 
+
 #ifdef BLIS_ENABLE_BLAS
+#ifdef BLIS_CONFIG_ZEN2
+void dgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const double*    alpha,
+       const double*    a, const f77_int* lda,
+       const double*    x, const f77_int* incx,
+       const double*    beta,
+             double*    y, const f77_int* incy
+     )
+{
+    trans_t blis_transa;
+    dim_t   m0, n0;
+    dim_t   m_y, n_x;
+    double*  x0;
+    double*  y0;
+    inc_t   incx0;
+    inc_t   incy0;
+    inc_t   rs_a, cs_a;
+
+    /* Perform BLAS parameter checking. */
+    PASTEBLACHK(gemv)
+    (
+      MKSTR(d),
+      MKSTR(gemv),
+      transa,
+      m,
+      n,
+      lda,
+      incx,
+      incy
+    );
+
+    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
+    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
+    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
+    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
+    else
+    {
+        // See comment for bli_param_map_netlib_to_blis_side() above.
+        //bli_check_error_code( BLIS_INVALID_TRANS );
+        blis_transa = BLIS_NO_TRANSPOSE;
+    }
+
+    /* Convert/typecast negative values of m and n to zero. */
+    if ( *m < 0 ) m0 = ( dim_t )0;
+    else              m0 = ( dim_t )(*m);
+
+    if ( *n < 0 ) n0 = ( dim_t )0; 
+    else              n0 = ( dim_t )(*n);
+
+    /* Determine the dimensions of x and y so we can adjust the increments,
+       if necessary.*/
+    if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
+    else                             { m_y = n0; n_x = m0; }
+
+    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
+       in a peculiar way. In these situations, BLAS returns without performing
+       any action, even though most sane interpretations of gemv would have the
+       the operation reduce to y := beta * y. Here, we catch those cases that
+       BLAS would normally mishandle and emulate the BLAS exactly so as to
+       provide "bug-for-bug" compatibility. Note that this extreme level of
+       compatibility would not be as much of an issue if it weren't for the
+       fact that some BLAS test suites actually test for these cases. Also, it
+       should be emphasized that BLIS, if called natively, does NOT exhibit
+       this quirky behavior; it will scale y by beta, as one would expect. */
+    if ( m_y > 0 && n_x == 0 )
+    {
+        /* Finalize BLIS. */
+//      bli_finalize_auto();
+
+        return;
+    }
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        x0    = ((double*)x) + (n_x-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((double*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((double*)y) + (m_y-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((double*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Set the row and column strides of A. */
+    rs_a = 1;
+    cs_a = *lda;
+
+    /* Call variants based on transpose value. */
+    if(bli_does_notrans(blis_transa))
+    {
+        if((m0 <1200) && (n0 < 1200))
+        {
+            bli_dgemv_zen_basic
+            (
+                m0,
+                n0,
+                (double*) alpha,
+                (double*)a, cs_a,
+                x0, incx0,
+                (double*) beta,
+                y0, incy0,
+                NULL
+            );
+        }
+        else
+        {
+            //variant_2 is chosen for column-storage
+            // and uses axpyf-based implementation
+            bli_dgemv_unf_var2
+            (
+                blis_transa,
+                BLIS_NO_CONJUGATE,
+                m0,
+                n0,
+               (double*)alpha,
+               (double*)a,  rs_a, cs_a,
+               x0, incx0,
+               (double*)beta,
+               y0, incy0,
+               NULL
+            );
+        }
+    }
+    else
+    {
+        //var_1 is chosen for row-storage
+        //and uses dotxf-based implementation
+        bli_dgemv_unf_var1
+        (
+        blis_transa,
+        BLIS_NO_CONJUGATE,
+        m0,
+        n0,
+        (double*)alpha,
+        (double*)a,  rs_a, cs_a,
+        x0, incx0,
+        (double*)beta,
+        y0, incy0,
+        NULL
+        );
+
+    }
+
+}
+
+void sgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const float*    alpha,
+       const float*    a, const f77_int* lda,
+       const float*    x, const f77_int* incx,
+       const float*    beta,
+             float*    y, const f77_int* incy
+     )
+{
+    trans_t blis_transa;
+    dim_t   m0, n0;
+    dim_t   m_y, n_x;
+    float*  x0;
+    float*  y0;
+    inc_t   incx0;
+    inc_t   incy0;
+    inc_t   rs_a, cs_a;
+
+    /* Perform BLAS parameter checking. */
+    PASTEBLACHK(gemv)
+    (
+      MKSTR(s),
+      MKSTR(gemv),
+      transa,
+      m,
+      n,
+      lda,
+      incx,
+      incy
+    );
+
+    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
+    if      ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
+    else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
+    else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
+    else
+    {
+        // See comment for bli_param_map_netlib_to_blis_side() above.
+        //bli_check_error_code( BLIS_INVALID_TRANS );
+        blis_transa = BLIS_NO_TRANSPOSE;
+    }
+
+    /* Convert/typecast negative values of m and n to zero. */
+    if ( *m < 0 ) m0 = ( dim_t )0;
+    else              m0 = ( dim_t )(*m);
+
+    if ( *n < 0 ) n0 = ( dim_t )0; 
+    else              n0 = ( dim_t )(*n);
+
+    /* Determine the dimensions of x and y so we can adjust the increments,
+       if necessary.*/
+    if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
+    else                             { m_y = n0; n_x = m0; }
+
+    /* BLAS handles cases where trans(A) has no columns, and x has no elements,
+       in a peculiar way. In these situations, BLAS returns without performing
+       any action, even though most sane interpretations of gemv would have the
+       the operation reduce to y := beta * y. Here, we catch those cases that
+       BLAS would normally mishandle and emulate the BLAS exactly so as to
+       provide "bug-for-bug" compatibility. Note that this extreme level of
+       compatibility would not be as much of an issue if it weren't for the
+       fact that some BLAS test suites actually test for these cases. Also, it
+       should be emphasized that BLIS, if called natively, does NOT exhibit
+       this quirky behavior; it will scale y by beta, as one would expect. */
+    if ( m_y > 0 && n_x == 0 )
+    {
+        /* Finalize BLIS. */
+//      bli_finalize_auto();
+
+        return;
+    }
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        x0    = ((float*)x) + (n_x-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((float*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    if ( *incy < 0 )
+    {
+        y0    = ((float*)y) + (m_y-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((float*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    /* Set the row and column strides of A. */
+    rs_a = 1;
+    cs_a = *lda;
+
+    /* Call variants based on transpose value. */
+    if(bli_does_notrans(blis_transa))
+    {
+        bli_sgemv_unf_var2
+        (
+        blis_transa,
+        BLIS_NO_CONJUGATE,
+        m0,
+        n0,
+        (float*)alpha,
+        (float*)a,  rs_a, cs_a,
+        x0, incx0,
+        (float*)beta,
+        y0, incy0,
+        NULL
+        );
+
+    }
+    else
+    {   
+        bli_sgemv_unf_var1
+        (
+        blis_transa,
+        BLIS_NO_CONJUGATE,
+        m0,
+        n0,
+        (float*)alpha,
+        (float*)a,  rs_a, cs_a,
+        x0, incx0,
+        (float*)beta,
+        y0, incy0,
+        NULL
+        );
+
+    }
+
+}
+
+INSERT_GENTFUNC_BLAS_CZ( gemv, gemv )
+#else
 INSERT_GENTFUNC_BLAS( gemv, gemv )
 #endif
-
+#endif
diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index 3b8dda884..848d42094 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -203,7 +203,7 @@ void dscal_
 //    bli_finalize_auto();
 }
 
-INSERT_GENTFUNCSCAL_BLAS_ZEN2( scal, scalv )
+INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
 #else
 INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
 #endif
diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c
index 3b589b212..8d8d40628 100644
--- a/frame/compat/bla_swap.c
+++ b/frame/compat/bla_swap.c
@@ -229,7 +229,7 @@ void dswap_
 //    bli_finalize_auto();
 }
 
-INSERT_GENTFUNC_BLAS_ZEN2( swap, swapv )
+INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
 
 #else
 INSERT_GENTFUNC_BLAS( swap, swapv )
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index 3aca1cba5..1dd722ed3 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -244,9 +244,9 @@ CNTX_INIT_PROTS( generic )
 
 // -- AMD64 architectures --
 
-//#ifdef BLIS_KERNELS_ZEN2
-//#include "bli_kernels_zen2.h"
-//#endif
+#ifdef BLIS_KERNELS_ZEN2
+#include "bli_kernels_zen2.h"
+#endif
 #ifdef BLIS_KERNELS_ZEN
 #include "bli_kernels_zen.h"
 #endif
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index 6ea378ee5..e2a06b332 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -57,7 +57,7 @@ GENTFUNC( scomplex, c, blasname, blisname ) \
 GENTFUNC( dcomplex, z, blasname, blisname )
 
 
-#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
+#define INSERT_GENTFUNC_BLAS_CZ( blasname, blisname ) \
 \
 GENTFUNC( scomplex, c, blasname, blisname ) \
 GENTFUNC( dcomplex, z, blasname, blisname )
@@ -82,7 +82,7 @@ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )
 
 // -- Basic one-operand macro with conjugation (used only for dot, ger) --
 
-#define INSERT_GENTFUNCDOT_BLAS_ZEN2( blasname, blisname ) \
+#define INSERT_GENTFUNCDOT_BLAS_CZ( blasname, blisname ) \
 \
 GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
 GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
@@ -125,7 +125,7 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
 // -- Extended two-operand macro (used only for scal) --
 
 
-#define INSERT_GENTFUNCSCAL_BLAS_ZEN2( blasname, blisname ) \
+#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \
 \
 GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
 GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
@@ -157,6 +157,14 @@ GENTFUNC( double,   d, tfuncname ) \
 GENTFUNC( scomplex, c, tfuncname ) \
 GENTFUNC( dcomplex, z, tfuncname )
 
+
+#define INSERT_GENTFUNC_BASIC0_SCZ( tfuncname ) \
+\
+GENTFUNC( float,    s, tfuncname ) \
+GENTFUNC( scomplex, c, tfuncname ) \
+GENTFUNC( dcomplex, z, tfuncname )
+
+
 // -- (one auxiliary argument) --
 
 #define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
diff --git a/frame/include/blis.h b/frame/include/blis.h
index 4725ee83f..3b4d85f9e 100644
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -87,6 +87,7 @@ extern "C" {
 #include "bli_l1v_ker_prot.h"
 #include "bli_l1f_ker_prot.h"
 #include "bli_l1m_ker_prot.h"
+#include "bli_l2_ker_prot.h"
 #include "bli_l3_ukr_prot.h"
 #include "bli_l3_sup_ker_prot.h"
 
diff --git a/kernels/zen2/.gitignore b/kernels/zen2/.gitignore
deleted file mode 100644
index 5e7d2734c..000000000
--- a/kernels/zen2/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c
new file mode 100644
index 000000000..36654f9c2
--- /dev/null
+++ b/kernels/zen2/1f/bli_axpyf_zen_int_5.c
@@ -0,0 +1,326 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+    __m256  v;
+    float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+    __m256d v;
+    double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_daxpyf_zen_int_5
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+    const dim_t      fuse_fac       = 5;
+
+    const dim_t      n_elem_per_reg = 4;
+    const dim_t      n_iter_unroll  = 2;
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+    double* restrict a2;
+    double* restrict a3;
+    double* restrict a4;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v, chi2v, chi3v;
+    v4df_t           chi4v;
+
+    v4df_t           a00v, a01v, a02v, a03v;
+    v4df_t           a04v;
+
+    v4df_t           a10v, a11v, a12v, a13v;
+    v4df_t           a14v;
+
+    v4df_t           y0v, y1v;
+
+    double           chi0, chi1, chi2, chi3;
+    double           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv.
+    if ( b_n != fuse_fac )
+    {
+#ifdef BLIS_CONFIG_ZEN2
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;
+            double* chi1 = x + (i  )*incx;
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
+            PASTEMAC(d,scals)( *alpha, alpha_chi1 );
+
+            bli_daxpyv_zen_int10
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#else
+        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;
+            double* chi1 = x + (i  )*incx;
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
+            PASTEMAC(d,scals)( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#endif
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+
+    // Scale each chi scalar by alpha.
+    PASTEMAC(d,scals)( *alpha, chi0 );
+    PASTEMAC(d,scals)( *alpha, chi1 );
+    PASTEMAC(d,scals)( *alpha, chi2 );
+    PASTEMAC(d,scals)( *alpha, chi3 );
+    PASTEMAC(d,scals)( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+    chi2v.v = _mm256_broadcast_sd( &chi2 );
+    chi3v.v = _mm256_broadcast_sd( &chi3 );
+    chi4v.v = _mm256_broadcast_sd( &chi4 );
+
+    // If there are vectorized iterations, perform them with vector
+    // instructions.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 7) < m; i += 8 )
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
+
+            // perform : y += alpha * x;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
+
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 3) < m; i += 4 )
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+
+
+            // perform : y += alpha * x;
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+    
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca; 
+            y0 += incy;
+        }
+
+    }
+}
+
diff --git a/kernels/zen2/2/bli_gemv_zen_ref.c b/kernels/zen2/2/bli_gemv_zen_ref.c
new file mode 100644
index 000000000..31138f2a8
--- /dev/null
+++ b/kernels/zen2/2/bli_gemv_zen_ref.c
@@ -0,0 +1,103 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* This routine assumes that the matrix is stored in column-major order
+ * and does not have transpose.
+ */
+void bli_dgemv_zen_ref_c
+     (
+      dim_t            m,
+      dim_t            n,
+      double* restrict alpha,
+      double* restrict a, inc_t lda,
+      double* restrict x, inc_t incx,
+      double* restrict beta,
+      double* restrict y, inc_t incy,
+      cntx_t* restrict cntx
+     )
+{
+
+	double* restrict x0 = x;
+	double* restrict y0 = y;
+	double* restrict a0 = a;
+
+	dim_t i,j;
+
+	if((incy == 1) && (incx == 1))
+	{
+
+	/* n==0 case will be handled in the framework,
+	   where it returns immediately.
+	 * For the first column of A, execute Y = alpha*A*X + beta*Y
+	 */
+		double x0_val = (x0[0]);
+		PRAGMA_SIMD
+		for(i = 0; i < m; i++)
+		{
+			(y0[i]) = (a0[i]) * (x0_val) * (*alpha) + y0[i] * (*beta);	
+		}
+		a0 += lda;
+
+	/* For remaining columns of A, execute,  Y = alpha*A*X + Y; */
+		for(j = 1; j < n; j++)
+		{
+			const double xp = (x0[j]);
+			PRAGMA_SIMD
+			for(i = 0; i < m; i++)
+			{
+				(y0[i]) += (a0[i]) * xp * (*alpha);	
+			}
+			a0 += lda;
+		}
+
+	}
+	else
+	{
+		for(j = 0; j < m; j++)
+			PASTEMAC(d,scals)(*beta, *(y0+j*incy))
+
+		for(j = 0; j < n; j++)
+		{
+			const double xp = *(x0+j*incx);
+			for(i = 0; i < m; i++)
+			{
+				*(y0 + i*incy) += (a0[j*lda+i]) * xp * (*alpha);	
+			}
+		}
+	}
+	return;
+}
diff --git a/kernels/zen2/bli_kernels_zen2.h b/kernels/zen2/bli_kernels_zen2.h
new file mode 100644
index 000000000..837eac344
--- /dev/null
+++ b/kernels/zen2/bli_kernels_zen2.h
@@ -0,0 +1,45 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// -- level-1f --
+AXPYF_KER_PROT( double,   d, axpyf_zen_int_5 )
+
+// -- level-2 --
+
+//gemv(scalar code)
+GEMV_KER_PROT( double,    d, gemv_zen_basic )
+
+