Removed Arch specific code from BLIS framework.

- Removed BLIS_CONFIG_EPYC macro
- The code dependent on this macro is handled in
  one of the three ways

  -- It is updated to work across platforms.
  -- Added in architecture/feature specific runtime checks.
  -- Duplicated in AMD specific files. The build system is updated to
     pick AMD specific files when the library is built for any of the
     zen architectures.

AMD-Internal: [CPUPL-1960]
Change-Id: I6f9f8018e41fa48eb43ae4245c9c2c361857f43b
This commit is contained in:
Dipal M Zambare
2021-12-20 09:43:13 +05:30
parent 79c6aa5643
commit f63f78d783
53 changed files with 11226 additions and 8028 deletions

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -212,6 +212,27 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \
# Generate object file paths for all of the portable framework source code.
MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH))
# AMD has optimized some of the framework files, these optimizations
# may not be compatible with other platforms.
#
# In order to keep main framework code independent of AMD changes,
# AMD has duplicated the files and updated them for example
# frame/compact/bla_gemm.c : generic framework file
# frame/compact/bla_gemm_amd.c : AMD optimized framework file
# Based on the architecture we choose the correct files
ifeq ($(MK_IS_ARCH_ZEN),yes)
# Building for an AMD (zen-family) platform: for every framework file
# that has an AMD-specific "_amd" counterpart, drop the generic object
# so that only the "_amd" object is compiled into the library.
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
# Map each "<name>_amd.o" back to its generic "<name>.o" twin; those
# twins are the objects to exclude from the build.
FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS))
MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS))
else
# Building for a non-AMD platform: drop all AMD-specific "_amd" objects
# and keep only the generic framework implementations.
MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS))
MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS))
endif
# Generate object file paths for all of the debug and trace logger sources.
MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH))
@@ -1338,4 +1359,3 @@ else
@echo "Uninstalling $(@F) from $(@D)/"
@- $(RM_F) $@
endif

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -204,5 +204,7 @@ MK_ENABLE_AOCL_DYNAMIC := @enable_aocl_dynamic@
# BLAS int size
MK_BLAS_INT_TYPE_SIZE := @blas_int_type_size@
MK_IS_ARCH_ZEN := @enable_aocl_zen@
# end of ifndef CONFIG_MK_INCLUDED conditional block
endif

View File

@@ -4,7 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -49,16 +49,6 @@ else
COPTFLAGS := -O3
endif
# This will add BLIS_CONFIG_EPYC for all framework files
# FIXME: framework files should not have architecture specific
# checks at least at compile time. Once the macro
# is defined it is applicable to every build in the
# Family including any non AMD configuration.
# However, it is still better to define it in makefiles
# instead of headers so we can have slightly more
# control on this.
COPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -46,25 +46,12 @@ AMD_CONFIG_FILE := amd_config.mk
AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
#
# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] -----------------------
#
@@ -86,10 +73,6 @@ else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -50,15 +50,7 @@ THIS_CONFIG := zen2
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -111,10 +103,6 @@ endif
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -50,15 +50,7 @@ THIS_CONFIG := zen3
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -132,10 +124,6 @@ endif # gcc
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -4,7 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -49,15 +49,7 @@ THIS_CONFIG := zen4
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Since we removed BLIS_CONFIG_EPYC from header file, we need to
# add it here at two places,
# CPPROCFLAGS = This will enable it for framework code
# This flag is used when configure is invoked with specific architecture
# CKOPTFLAGS = This will enable it for architecture specific kernels
# This flag is used for kernels associated with this architecture
# irrespective of the configuration it is built for.
CPPROCFLAGS := -DBLIS_CONFIG_EPYC
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -131,10 +123,6 @@ endif # gcc
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Add this after updating variables for reference kernels
# we don't want this defined for them
CKOPTFLAGS += -DBLIS_CONFIG_EPYC
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

3
configure vendored
View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -3370,6 +3370,7 @@ main()
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \
| sed -e "s/@complex_return@/${complex_return}/g" \
| sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \
> "${config_mk_out_path}"

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -104,357 +104,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double *A1;
double *y1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector X.
mem_t mem_bufX;
rntm_t rntm;
double *x_buf = x;
inc_t buf_incx = incx;
bli_init_once();
if (cntx == NULL)
cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at);
conja = bli_extract_conj(transa);
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
if (incx > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufX.pblk.buf = NULL;
mem_bufX.pblk.block_size = 0;
mem_bufX.buf_type = 0;
mem_bufX.size = 0;
mem_bufX.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global(&rntm);
bli_rntm_set_num_threads_only(1, &rntm);
bli_membrk_rntm_set_membrk(&rntm);
//calculate the size required for n_elem double elements in vector X.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): get mem pool block\n");
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufX.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufX);
/*Continue packing X if buffer memory is allocated*/
if ((bli_mem_is_alloc(&mem_bufX)))
{
x_buf = bli_mem_buffer(&mem_bufX);
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
for (dim_t x_index = 0; x_index < n_elem; x_index++)
{
*(x_buf + x_index) = *(x + (x_index * incx));
}
// stride of vector x_buf =1
buf_incx = 1;
}
}
dim_t fuse_factor = 8;
dim_t f_temp =0;
if (n < 4)
{
fuse_factor = 2;
} else if (n < 8)
{
fuse_factor = 4;
}
for (i = 0; i < n_iter; i += f)
{
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
//A = a + i * row_increment + 0 * column_increment
A1 = a + (i)*rs_at;
y1 = y + (i)*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
switch (f)
{
case 8:
bli_ddotxf_zen_int_8(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
break;
default:
if (f < 4)
{
bli_ddotxf_zen_int_2(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
else
{
bli_ddotxf_zen_int_4(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
}
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
if (f_temp < fuse_factor)
{
switch (fuse_factor)
{
case 8:
fuse_factor = 4;
break;
case 4:
fuse_factor = 2;
break;
}
}
}
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
#endif
// Return the buffer to pool
bli_membrk_release(&rntm, &mem_bufX);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
void bli_sgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
float* x1 ;
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
#endif

View File

@@ -0,0 +1,440 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
// Type-generic fallback implementation of gemv_unf_var1 (unfactored
// variant 1: row-oriented, built on the fused DOTXF kernel queried from
// the context). It computes y1 = beta * y1 + alpha * A1 * x one fused
// panel of rows at a time. This template is instantiated below only for
// the complex types (c,z); s and d have hand-written definitions.
// NOTE(review): the line "x1 = x + (0 )*incy;" uses incy where incx
// would be expected — harmless because the offset is 0, but it looks
// like a copy/paste slip; confirm upstream before relying on it.
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
\
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_iter, &n_elem, &rs_at, &cs_at ); \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (0 )*incy; \
y1 = y + (i )*incy; \
\
/* y1 = beta * y1 + alpha * A1 * x; */ \
kfp_df \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, cs_at, rs_at, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
\
} \
}
// bli_dgemv_unf_var1: double-precision GEMV, unfactored variant 1
// (dot-product based). Computes y := beta * y + alpha * op(A) * x
// using the AMD-optimized ddotxf kernels (fuse widths 8/4/2).
// On machines without AVX support it falls back to the DOTXF kernel
// supplied by the context. When incx > 1, x is packed into a
// unit-stride scratch buffer obtained from the BLIS memory pool so the
// optimized kernels can assume unit stride.
void bli_dgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double *A1;
double *y1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector X.
mem_t mem_bufX;
rntm_t rntm;
// x_buf/buf_incx alias x/incx unless X gets packed below.
double *x_buf = x;
inc_t buf_incx = incx;
bli_init_once();
if (cntx == NULL)
cntx = bli_gks_query_cntx();
// Resolve transa into effective iteration/element counts and strides.
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at);
conja = bli_extract_conj(transa);
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
// NOTE(review): offset 0 scaled by incy rather than incx — harmless
// here, but the stride name looks like a copy/paste slip.
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// Pack X into a unit-stride pool buffer when it is strided, so the
// zen ddotxf kernels below always see buf_incx == 1.
if (incx > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufX.pblk.buf = NULL;
mem_bufX.pblk.block_size = 0;
mem_bufX.buf_type = 0;
mem_bufX.size = 0;
mem_bufX.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global(&rntm);
bli_rntm_set_num_threads_only(1, &rntm);
bli_membrk_rntm_set_membrk(&rntm);
//calculate the size required for n_elem double elements in vector X.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): get mem pool block\n");
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufX.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufX);
/*Continue packing X if buffer memory is allocated*/
// If the pool allocation failed, x_buf/buf_incx silently keep
// aliasing the original strided x — the kernels still work.
if ((bli_mem_is_alloc(&mem_bufX)))
{
x_buf = bli_mem_buffer(&mem_bufX);
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
for (dim_t x_index = 0; x_index < n_elem; x_index++)
{
*(x_buf + x_index) = *(x + (x_index * incx));
}
// stride of vector x_buf =1
buf_incx = 1;
}
}
// Pick the widest fuse width (8) the problem allows; small problems
// start narrower so the dispatch below matches a real kernel width.
// NOTE(review): the thresholds test n while the loop runs over n_iter
// (they differ when transa indicates a transpose) — confirm intended.
dim_t fuse_factor = 8;
dim_t f_temp =0;
if (n < 4)
{
fuse_factor = 2;
} else if (n < 8)
{
fuse_factor = 4;
}
for (i = 0; i < n_iter; i += f)
{
f = bli_determine_blocksize_dim_f(i, n_iter, fuse_factor);
//A = a + i * row_increment + 0 * column_increment
A1 = a + (i)*rs_at;
y1 = y + (i)*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
// Dispatch to the ddotxf kernel matching the current fuse width.
switch (f)
{
case 8:
bli_ddotxf_zen_int_8(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
break;
default:
if (f < 4)
{
bli_ddotxf_zen_int_2(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
else
{
bli_ddotxf_zen_int_4(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx);
}
}
// If the next block would be smaller than the current fuse width,
// step the fuse width down (8 -> 4 -> 2) so the tail iterations
// dispatch to a kernel of matching width.
f_temp = bli_determine_blocksize_dim_f(i + f, n_iter, fuse_factor);
if (f_temp < fuse_factor)
{
switch (fuse_factor)
{
case 8:
fuse_factor = 4;
break;
case 4:
fuse_factor = 2;
break;
}
}
}
// Release the packing buffer, if one was actually acquired above.
if ((incx > 1) && bli_mem_is_alloc(&mem_bufX))
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf("bli_dgemv_unf_var1(): releasing mem pool block\n");
#endif
// Return the buffer to pool
bli_membrk_release(&rntm, &mem_bufX);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// bli_sgemv_unf_var1: single-precision GEMV, unfactored variant 1
// (dot-product based). Computes y := beta * y + alpha * op(A) * x.
// On AVX-capable machines it calls the zen sdotxf kernel with a
// hard-coded fuse width of 8; otherwise it uses the DOTXF kernel and
// fuse width supplied by the context. Unlike the double-precision
// variant above, no packing of a strided x is performed here.
void bli_sgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
// Resolve transa into effective iteration/element counts and strides.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
float* x1 ;
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
// NOTE(review): offset 0 scaled by incy rather than incx — harmless
// here, but the stride name looks like a copy/paste slip.
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
kfp_df
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
// AVX path: fuse width fixed at 8 to match bli_sdotxf_zen_int_8.
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -137,764 +137,4 @@ void PASTEMAC(ch,varname) \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
double* A1;
double* x1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector Y.
mem_t mem_bufY;
rntm_t rntm;
double *y_buf = y;
inc_t buf_incy = incy;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(d,type);
double* x1;
double* y1;
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
double* zero = PASTEMAC(d,0);
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_deq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
if (incy > 1)
{
/*
Initialize mem pool buffer to NULL and size to 0
"buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This will ensure bli_mem_is_alloc() will be passed on
an allocated memory if created or a NULL .
*/
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
mem_bufY.buf_type = 0; mem_bufY.size = 0;
mem_bufY.pool = NULL;
/* In order to get the buffer from pool via rntm access to memory broker
is needed. The following are initializations for rntm */
bli_rntm_init_from_global( &rntm );
bli_rntm_set_num_threads_only( 1, &rntm );
bli_membrk_rntm_set_membrk( &rntm );
//calculate the size required for n_elem double elements in vector Y.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
#endif
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufY.*/
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufY);
/*Continue packing Y if buffer memory is allocated*/
if ((bli_mem_is_alloc( &mem_bufY )))
{
y_buf = bli_mem_buffer(&mem_bufY);
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y_buf + y_index) = *(y + (y_index * incy)) ;
}
// stride of vector y_buf =1
buf_incy = 1;
}
}
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
/* y = y + alpha * A1 * x1; */
bli_daxpyf_zen_int_16x4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y_buf, buf_incy,
NULL
);
}
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
{
//store the result from unit strided y_buf to non-unit strided Y
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y + (y_index * incy)) = *(y_buf + y_index) ;
}
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
#endif
// Return the buffer to pool
bli_membrk_release(&rntm , &mem_bufY);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
void bli_sgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(s,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(s,eq0)( *beta ) )
{
float* zero = PASTEMAC(s,0);
/* y = 0; */
PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(s,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_seq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 6;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_saxpyf_zen_int_6
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// Double-complex GEMV, variant 2: y := beta * y + alpha * op(A) * x.
// Uses a fused 4x4 zen kernel when all strides are unit and no
// conjugation/transposition is requested; otherwise falls back to fused
// axpyf updates.  Non-zen architectures use context-derived kernels.
void bli_zgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
dcomplex* beta,
dcomplex* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
dcomplex* A1;
dcomplex* x1;
dcomplex* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// Derive the dimensions and strides of op(A) from transa.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
/* bli_zscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y,
incy,
cntx
);*/
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2, zen3 or zen4, otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
// Generic path: use the kernels registered in the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
const num_t dt = PASTEMAC(z,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(z,eq0)( *beta ) )
{
dcomplex* zero = PASTEMAC(z,0);
/* y = 0; */
PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(z,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// Zen path: scale (or zero) y first; beta == 0 is handled inside scalv.
bli_zscalv_ex
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
// alpha == 0 leaves y = beta * y; nothing further to do.
if( bli_zeq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
!bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
{
// This gemv code deals with the following conditions only
// 1. incx, incy, and row stride equal to one
// 2. Non conjugate A matrix and X vector
// 3. No Transpose for A Matrix
// Rest is taken care by the else part (axpyf implementation)
bli_zgemv_zen_int_4x4
(
conja,
conjx,
m,
n,
alpha,
a, rs_at, cs_at,
x, incx,
beta,
y, incy,
NULL
);
}
else
{
/* fusing factor */
b_fuse = 4;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_zaxpyf_zen_int_4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
NULL
);
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_cgemv_unf_var2(): single-precision complex GEMV, variant 2.

  Computes y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel
  handles the all-unit-stride, no-conjugation, no-transpose case;
  everything else goes through fused axpyf updates.  Non-zen
  architectures use context-derived reference kernels.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.  The
  commented-out legacy cscalv call was also removed as dead code.
*/
void bli_cgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       scomplex*  alpha,
       scomplex*  a, inc_t rs_a, inc_t cs_a,
       scomplex*  x, inc_t incx,
       scomplex*  beta,
       scomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    scomplex*  A1;
    scomplex*  x1;
    scomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // When dynamic dispatch is enabled, i.e. the library is built for the
    // amdzen configuration, this function is invoked on all architectures
    // including generic.  Invoke architecture-specific kernels only if we
    // are sure that we are running on zen, zen2, zen3 or zen4; otherwise
    // fall back to reference kernels (via framework and context).
    arch_t id = bli_arch_query_id();
    bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
                   (id == BLIS_ARCH_ZEN3) ||
                   (id == BLIS_ARCH_ZEN2) ||
                   (id == BLIS_ARCH_ZEN);

    if (bamdzen == 0)
    {
        /* Generic path: use the kernels registered in the context. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();

        const num_t dt = PASTEMAC(c,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(c,eq0)( *beta ) )
        {
            scomplex* zero = PASTEMAC(c,0);
            /* y = 0; */
            PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(c,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* Zen path: scale (or zero) y; beta == 0 is handled inside scalv. */
    bli_cscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_ceq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) &&
        !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non conjugate A matrix and X vector
        // 3. No Transpose for A Matrix
        // Rest is taken care by the else part (axpyf implementation)
        bli_cgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          NULL
        );
    }
    else
    {
        /* fusing factor. */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_caxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              NULL
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
#endif
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )

View File

@@ -0,0 +1,879 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define BLIS_DGEMV_VAR2_FUSE 4
// Generic (reference) implementation of gemv variant 2, instantiated per
// datatype via the BLIS type-macro system.  y := beta*y + alpha*op(A)*x
// is computed as a sequence of fused axpyf column-block updates, with the
// axpyf kernel and its fusing factor queried from the context.
// (Comments are kept outside the macro body: every macro line must end
// with a '\' continuation, so the body below is left byte-identical.)
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); \
\
bli_init_once(); \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
\
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_elem, &n_iter, &rs_at, &cs_at ); \
\
conja = bli_extract_conj( transa ); \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n_elem, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a + (0 )*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
y1 = y + (0 )*incy; \
\
/* y = y + alpha * A1 * x1; */ \
kfp_af \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, rs_at, cs_at, \
x1, incx, \
y1, incy, \
cntx \
); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); \
}
/*
  bli_dgemv_unf_var2(): double-precision real GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x.  On AVX-capable parts the
  zen scalv/axpyf kernels are used directly; when incy > 1, y is packed
  into a unit-stride scratch buffer from the memory broker so the fused
  axpyf kernel runs at unit stride, and the result is copied back.
  Non-AVX platforms use the kernels registered in the context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.
*/
void bli_dgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       double*  alpha,
       double*  a, inc_t rs_a, inc_t cs_a,
       double*  x, inc_t incx,
       double*  beta,
       double*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    double*  A1;
    double*  x1;
    dim_t   i;
    dim_t   f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // Memory pool declarations for packing vector Y.
    mem_t  mem_bufY;
    rntm_t rntm;
    double* y_buf   = y;     // defaults to the caller's y (unit-stride case)
    inc_t  buf_incy = incy;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(d,type);
        double* x1;
        double* y1;
        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(d,eq0)( *beta ) )
        {
            double* zero = PASTEMAC(d,0);
            /* y = 0; */
            PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(d,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af       = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* beta=0 case is handled by scalv internally */
    bli_dscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_deq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    if (incy > 1)
    {
        /*
          Initialize mem pool buffer to NULL and size to 0
          "buf" and "size" fields are assigned once memory
          is allocated from the pool in bli_membrk_acquire_m().
          This will ensure bli_mem_is_alloc() will be passed on
          an allocated memory if created or a NULL .
        */
        mem_bufY.pblk.buf = NULL;   mem_bufY.pblk.block_size = 0;
        mem_bufY.buf_type = 0;      mem_bufY.size = 0;
        mem_bufY.pool = NULL;

        /* In order to get the buffer from pool via rntm access to memory
           broker is needed. Following are initializations for rntm. */
        bli_rntm_init_from_global( &rntm );
        bli_rntm_set_num_threads_only( 1, &rntm );
        bli_membrk_rntm_set_membrk( &rntm );

        // Calculate the size required for n_elem double elements in vector Y.
        size_t buffer_size = n_elem * sizeof(double);

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
#endif

        /* Acquire a buffer (n_elem * size(double)) from the memory broker
           and save the associated mem_t entry to mem_bufY. */
        bli_membrk_acquire_m(&rntm,
                             buffer_size,
                             BLIS_BUFFER_FOR_B_PANEL,
                             &mem_bufY);

        /* Continue packing Y only if buffer memory was allocated; on
           failure we fall through with y_buf == y and buf_incy == incy. */
        if ((bli_mem_is_alloc( &mem_bufY )))
        {
            y_buf = bli_mem_buffer(&mem_bufY);

            // Pack Y vector with non-unit stride to a temp buffer y_buf
            // with unit stride.
            for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
            {
                *(y_buf + y_index) = *(y + (y_index * incy)) ;
            }
            // Stride of vector y_buf = 1.
            buf_incy = 1;
        }
    }

    for ( i = 0; i < n_iter; i += f )
    {
        f  = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );

        A1 = a + (0 )*rs_at + (i )*cs_at;
        x1 = x + (i )*incx;

        /* y = y + alpha * A1 * x1; */
        bli_daxpyf_zen_int_16x4
        (
          conja,
          conjx,
          n_elem,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y_buf, buf_incy,
          cntx
        );
    }

    if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
    {
        // Store the result from unit-strided y_buf to non-unit-strided Y.
        for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
        {
            *(y + (y_index * incy)) = *(y_buf + y_index) ;
        }

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
#endif

        // Return the buffer to pool.
        bli_membrk_release(&rntm , &mem_bufY);
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_sgemv_unf_var2(): single-precision real GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x via fused saxpyf updates on
  AVX-capable parts; non-AVX platforms use kernels registered in the
  context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.
*/
void bli_sgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       float*  alpha,
       float*  a, inc_t rs_a, inc_t cs_a,
       float*  x, inc_t incx,
       float*  beta,
       float*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    float*  A1;
    float*  x1;
    float*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(s,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(s,eq0)( *beta ) )
        {
            float* zero = PASTEMAC(s,0);
            /* y = 0; */
            PASTEMAC2(s,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(s,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* If beta is zero, use setv. Otherwise, scale by beta. */
    /* y = beta * y; */
    /* beta=0 case is handled by scalv internally */
    bli_sscalv_zen_int10
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_seq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* Fusing factor matching the zen saxpyf kernel below. */
    b_fuse = 6;

    for ( i = 0; i < n_iter; i += f )
    {
        f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
        A1 = a + (0 )*rs_at + (i )*cs_at;
        x1 = x + (i )*incx;
        y1 = y + (0 )*incy;

        /* y = y + alpha * A1 * x1; */
        bli_saxpyf_zen_int_6
        (
          conja,
          conjx,
          n_elem,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y1, incy,
          cntx
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// Double-complex GEMV, variant 2 (AMD-optimized file):
// y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel handles the
// all-unit-stride, no-conjugation, no-transpose case; otherwise fused
// zaxpyf updates are used.  Non-AVX platforms use context-derived kernels.
void bli_zgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
dcomplex* beta,
dcomplex* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
dcomplex* A1;
dcomplex* x1;
dcomplex* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// For AMD these APIs are invoked skipping intermediate framework layers,
// hence we need to ensure that cntx is set here.
bli_init_once();
if(cntx == NULL) cntx = bli_gks_query_cntx();
// Derive the dimensions and strides of op(A) from transa.
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
/* bli_zscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y,
incy,
cntx
);*/
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
const num_t dt = PASTEMAC(z,type);
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(z,eq0)( *beta ) )
{
dcomplex* zero = PASTEMAC(z,0);
/* y = 0; */
PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(z,axpyf_ker_ft) kfp_af;
/* Query the context for the kernel function pointer and fusing factor. */
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
kfp_af
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// AVX path: scale (or zero) y first; beta == 0 is handled inside scalv.
bli_zscalv_ex
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
cntx,
NULL
);
// alpha == 0 leaves y = beta * y; nothing further to do.
if( bli_zeq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// for non-unit incx, incy and rs_at and conjugate will be added in the next patch
if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
!bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
{
// This gemv code deals with the following conditions only
// 1. incx, incy, and row stride equal to one
// 2. Non conjugate A matrix and X vector
// 3. No Transpose for A Matrix
// Rest is taken care by the else part (axpyf implementation)
bli_zgemv_zen_int_4x4
(
conja,
conjx,
m,
n,
alpha,
a, rs_at, cs_at,
x, incx,
beta,
y, incy,
cntx
);
}
else
{
/* fusing factor */
b_fuse = 4;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_zaxpyf_zen_int_4
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
cntx
);
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
/*
  bli_cgemv_unf_var2(): single-precision complex GEMV, variant 2
  (AMD-optimized file).

  Computes y := beta * y + alpha * op(A) * x.  A fused 4x4 zen kernel
  handles the all-unit-stride, no-conjugation, no-transpose case;
  everything else goes through fused caxpyf updates.  Non-AVX platforms
  use kernels registered in the context.

  Fix: the trace-exit macro invocation in the alpha == 0 early-return
  path was missing its terminating semicolon; every other call site in
  this file terminates it, so it is added here for consistency.  The
  commented-out legacy cscalv call was also removed as dead code.
*/
void bli_cgemv_unf_var2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       scomplex*  alpha,
       scomplex*  a, inc_t rs_a, inc_t cs_a,
       scomplex*  x, inc_t incx,
       scomplex*  beta,
       scomplex*  y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
    scomplex*  A1;
    scomplex*  x1;
    scomplex*  y1;
    dim_t   i;
    dim_t   b_fuse, f;
    dim_t   n_elem, n_iter;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // For AMD these APIs are invoked skipping intermediate framework
    // layers, hence we need to ensure that cntx is set here.
    bli_init_once();
    if(cntx == NULL) cntx = bli_gks_query_cntx();

    /* Derive the dimensions and strides of op(A) from transa. */
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &n_elem, &n_iter, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == FALSE)
    {
        const num_t dt = PASTEMAC(c,type);

        /* If beta is zero, use setv. Otherwise, scale by beta. */
        if ( PASTEMAC(c,eq0)( *beta ) )
        {
            scomplex* zero = PASTEMAC(c,0);
            /* y = 0; */
            PASTEMAC2(c,setv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              zero,
              y, incy,
              cntx,
              NULL
            );
        }
        else
        {
            /* y = beta * y; */
            PASTEMAC2(c,scalv,BLIS_TAPI_EX_SUF)
            (
              BLIS_NO_CONJUGATE,
              n_elem,
              beta,
              y, incy,
              cntx,
              NULL
            );
        }

        PASTECH(c,axpyf_ker_ft) kfp_af;

        /* Query the context for the kernel function pointer and fusing factor. */
        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );

            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            kfp_af
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    /* AVX path: scale (or zero) y; beta == 0 is handled inside scalv. */
    bli_cscalv_ex
    (
      BLIS_NO_CONJUGATE,
      n_elem,
      beta,
      y, incy,
      cntx,
      NULL
    );

    /* alpha == 0 leaves y = beta * y; nothing further to do. */
    if( bli_ceq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
        return;
    }

    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
    if( ( (incx == 1) && (incy == 1) && (rs_at == 1) ) &&
        !bli_is_conj(conja) && !bli_is_conj(conjx) &&
        !bli_is_trans(transa))
    {
        // This gemv code deals with the following conditions only
        // 1. incx, incy, and row stride equal to one
        // 2. Non conjugate A matrix and X vector
        // 3. No Transpose for A Matrix
        // Rest is taken care by the else part (axpyf implementation)
        bli_cgemv_zen_int_4x4
        (
          conja,
          conjx,
          m,
          n,
          alpha,
          a, rs_at, cs_at,
          x, incx,
          beta,
          y, incy,
          cntx
        );
    }
    else
    {
        /* fusing factor. */
        b_fuse = 4;

        for ( i = 0; i < n_iter; i += f )
        {
            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
            A1 = a + (0 )*rs_at + (i )*cs_at;
            x1 = x + (i )*incx;
            y1 = y + (0 )*incy;

            /* y = y + alpha * A1 * x1; */
            bli_caxpyf_zen_int_4
            (
              conja,
              conjx,
              n_elem,
              f,
              alpha,
              A1, rs_at, cs_at,
              x1, incx,
              y1, incy,
              cntx
            );
        }
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -216,207 +216,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
// Forward declaration of the zen helper that applies the diagonal-block
// (A11) portion of a dhemv update for an 8x8 block with unit strides.
// NOTE(review): the stride parameters are declared in the order
// (cs_a, rs_a), while the visible call site in bli_dhemv_unf_var1 passes
// (rs_at, cs_at) — confirm the intended parameter/argument mapping
// against the kernel definition.
void bli_post_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
// Double-precision real HEMV (symmetric in the real domain), unfused
// variant 1: y := beta * y + alpha * A * x for triangular-stored A.
// Processes A in fused column panels via a dotxaxpyf kernel, then
// finishes each diagonal block either with an 8x8 zen helper (unit-stride
// fast path) or with a scalar cleanup loop.
void bli_dhemv_unf_var1
(
uplo_t uplo,
conj_t conja,
conj_t conjx,
conj_t conjh,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* one = PASTEMAC(d,1);
double* zero = PASTEMAC(d,0);
double* A10;
double* A11;
double* a10t;
double* alpha11;
double* a21;
double* x0;
double* x1;
double* chi11;
double* y0;
double* y1;
double* y01;
double* psi11;
double* y21;
double conjx_chi11;
double alpha_chi11;
double alpha11_temp;
dim_t i, k, j;
dim_t b_fuse, f;
dim_t n_behind;
dim_t f_ahead, f_behind;
inc_t rs_at, cs_at;
// NOTE(review): conj0/conj1 stay BLIS_NO_CONJUGATE (0) regardless of
// conja/conjh — a no-op for real-domain double, unlike the generic
// macro which derives them from conja; confirm if complex variants
// ever route here.
conj_t conj0 = 0, conj1 = 0;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A. */
if ( bli_is_lower( uplo ) )
{
rs_at = rs_a;
cs_at = cs_a;
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
/* Query the context for the kernel function pointer and fusing
* factor. */
/* Assign kernel function pointer and fusing factor. */
// On zen parts use the hand-written fused kernel directly; otherwise
// take the kernel and fusing factor from the context.
arch_t id = bli_arch_query_id();
bool bamdzen = ((id == BLIS_ARCH_ZEN4) ||(id == BLIS_ARCH_ZEN3)
|| (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
if (bamdzen)
{
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_dotxaxpyf_ker =
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
b_fuse =
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
}
// Walk the diagonal in fused panels of (at most) b_fuse columns.
for ( i = 0; i < m; i += f )
{
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
n_behind = i;
A10 = a + (i )*rs_at + (0 )*cs_at;
A11 = a + (i )*rs_at + (i )*cs_at;
x0 = x + (0 )*incx;
x1 = x + (i )*incx;
y0 = y + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = y1 + alpha * A10 * x0; (dotxf) */
/* y0 = y0 + alpha * A10' * x1; (axpyf) */
kfp_dotxaxpyf_ker
(
conj0,
conj1,
conjx,
conjx,
n_behind,
f,
alpha,
A10, cs_at, rs_at,
x0, incx,
x1, incx,
one,
y1, incy,
y0, incy,
cntx
);
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
{
/*this helper function handles unit stride only*/
// NOTE(review): the helper's prototype declares its stride
// parameters as (cs_a, rs_a) but (rs_at, cs_at) is passed here —
// verify the intended order against the kernel definition.
bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
}
else
{
// Scalar cleanup for the f x f diagonal block: for each column k,
// update the strictly-above part (y01), the diagonal (psi11), and
// the strictly-below part (y21).
for ( k = 0; k < f; ++k )
{
f_behind = k;
f_ahead = f - k - 1;
a10t = A11 + (k )*rs_at + (0 )*cs_at;
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
chi11 = x1 + (k )*incx;
y01 = y1 + (0 )*incy;
psi11 = y1 + (k )*incy;
y21 = y1 + (k+1)*incy;
/* y01 = y01 + alpha * a10t' * chi11; */
PASTEMAC(d,copycjs)( conjx, *chi11,
conjx_chi11 );
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
alpha_chi11 );
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,axpys)( alpha_chi11,
*(a10t + j*cs_at),
*(y01 + j*incy) );
PASTEMAC(d,copycjs)( conja, *alpha11,
alpha11_temp );
/* psi11 = psi11 + alpha * alpha11 * chi11; */
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
*psi11 );
/* y21 = y21 + alpha * a21 * chi11; */
for ( j = 0; j < f_ahead; ++j )
{
PASTEMAC(d,axpys)( alpha_chi11,
*(a21 + j*rs_at),
*(y21 + j*incy) );
}
}
}
}
}
GENTFUNC(float, s, hemv_unf_var1)
GENTFUNC(scomplex, c, hemv_unf_var1)
GENTFUNC(dcomplex, z, hemv_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( hemv_unf_var1 )
#endif

View File

@@ -0,0 +1,418 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 1 of hemv/symv.
 *
 * conjh carries the conjugation component of the Hermitian transpose:
 * hemv passes conjugation (which also triggers the seti0s() of the
 * diagonal element below), symv does not.  The algorithm is written for
 * the lower-triangular case; the upper-triangular case is handled by
 * swapping the row/column strides of A and toggling conj parameters.
 *
 * Instantiated for s, c and z at the bottom of this file; the 'd'
 * datatype is provided by the hand-written bli_dhemv_unf_var1 below.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conja, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  one  = PASTEMAC(ch,1); \
    ctype*  zero = PASTEMAC(ch,0); \
    ctype*  A10; \
    ctype*  A11; \
    ctype*  a10t; \
    ctype*  alpha11; \
    ctype*  a21; \
    ctype*  x0; \
    ctype*  x1; \
    ctype*  chi11; \
    ctype*  y0; \
    ctype*  y1; \
    ctype*  y01; \
    ctype*  psi11; \
    ctype*  y21; \
    ctype   conjx_chi11; \
    ctype   alpha_chi11; \
    ctype   alpha11_temp; \
    dim_t   i, k, j; \
    dim_t   b_fuse, f; \
    dim_t   n_behind; \
    dim_t   f_ahead, f_behind; \
    inc_t   rs_at, cs_at; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_at = rs_a; \
        cs_at = cs_a; \
\
        conj0 = conja; \
        conj1 = bli_apply_conj( conjh, conja ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_at = cs_a; \
        cs_at = rs_a; \
\
        conj0 = bli_apply_conj( conjh, conja ); \
        conj1 = conja; \
    } \
\
    /* If beta is zero, use setv. Otherwise, scale by beta. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          zero, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
    else \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          beta, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
\
    PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
    for ( i = 0; i < m; i += f ) \
    { \
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
        n_behind = i; \
        A10      = a + (i  )*rs_at + (0  )*cs_at; \
        A11      = a + (i  )*rs_at + (i  )*cs_at; \
        x0       = x + (0  )*incx; \
        x1       = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        y1       = y + (i  )*incy; \
\
        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */ \
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */ \
        kfp_xf \
        ( \
          conj0, \
          conj1, \
          conjx, \
          conjx, \
          n_behind, \
          f, \
          alpha, \
          A10, cs_at, rs_at, \
          x0,  incx, \
          x1,  incx, \
          one, \
          y1,  incy, \
          y0,  incy, \
          cntx \
        ); \
\
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */ \
        for ( k = 0; k < f; ++k ) \
        { \
            f_behind = k; \
            f_ahead  = f - k - 1; \
            a10t     = A11 + (k  )*rs_at + (0  )*cs_at; \
            alpha11  = A11 + (k  )*rs_at + (k  )*cs_at; \
            a21      = A11 + (k+1)*rs_at + (k  )*cs_at; \
            chi11    = x1  + (k  )*incx; \
            y01      = y1  + (0  )*incy; \
            psi11    = y1  + (k  )*incy; \
            y21      = y1  + (k+1)*incy; \
\
            /* y01 = y01 + alpha * a10t' * chi11; */ \
            PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
            PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
            if ( bli_is_conj( conj1 ) ) \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
\
            /* For hemv, explicitly set the imaginary component of alpha11 to
               zero. */ \
            PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
            if ( bli_is_conj( conjh ) ) \
                PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
            /* psi11 = psi11 + alpha * alpha11 * chi11; */ \
            PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
            /* y21 = y21 + alpha * a21 * chi11; */ \
            if ( bli_is_conj( conj0 ) ) \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
        } \
    } \
}
/*
 * AMD (zen) helper: applies the f == 8 diagonal-block update of hemv
 * variant 1 for double-precision data.  The caller below guards the call
 * with (f == 8 && incx == 1 && incy == 1 && cs_at == 1) and passes its
 * rs_at as this function's cs_a (the leading dimension) and its unit
 * cs_at as rs_a — i.e. rs_a is expected to be 1 (unit stride).
 * NOTE(review): defined in an AMD-specific kernel file not visible here;
 * confirm the stride convention against that definition.
 */
void bli_post_hemv_8x8
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   cs_a,
       dim_t   rs_a
     );
/*
 * Double-precision (real) specialization of unfused hemv/symv variant 1.
 *
 * Computes y := beta*y + alpha * A * x with A symmetric.  For real data
 * the conja/conjx/conjh conjugations are no-ops, so conj0/conj1 are fixed
 * to 0 below.  The sub-diagonal panel A10 is applied with a fused
 * dotxf+axpyf (dotxaxpyf) kernel; the f x f diagonal block A11 is applied
 * either by the AMD 8x8 helper (unit strides, f == 8) or by a scalar
 * fallback loop.  This function overrides the type-generic template for
 * 'd'; the s/c/z instantiations are generated by GENTFUNC below.
 */
void bli_dhemv_unf_var1
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double* alpha,
       double* a, inc_t rs_a, inc_t cs_a,
       double* x, inc_t incx,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);
    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A10;
    double*  A11;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x0;
    double*  x1;
    double*  chi11;
    double*  y0;
    double*  y1;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_behind;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    /* Real data: conjugation is a no-op, so both kernel conj arguments
       stay at 0 (no conjugate) regardless of uplo. */
    conj_t   conj0 = 0, conj1 = 0;
    /* The algorithm will be expressed in terms of the lower triangular
     * case;the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }
    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }
    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
    /* Query the context for the kernel function pointer and fusing
     * factor. */
    /* Assign kernel function pointer and fusing factor. */
    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == TRUE)
    {
        /* AVX path: hard-wire the zen fused kernel and its fusing
           factor of 8 (also the trigger for the 8x8 helper below). */
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        /* cntx may legally arrive as NULL; fall back to the global
           kernel structure before querying. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }
    for ( i = 0; i < m; i += f )
    {
        f        = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_behind = i;
        A10      = a + (i  )*rs_at + (0  )*cs_at;
        A11      = a + (i  )*rs_at + (i  )*cs_at;
        x0       = x + (0  )*incx;
        x1       = x + (i  )*incx;
        y0       = y + (0  )*incy;
        y1       = y + (i  )*incy;
        /* y1 = y1 + alpha * A10  * x0;  (dotxf) */
        /* y0 = y0 + alpha * A10' * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_behind,
          f,
          alpha,
          A10, cs_at, rs_at,
          x0,  incx,
          x1,  incx,
          one,
          y1,  incy,
          y0,  incy,
          cntx
        );
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (cs_at == 1))
        {
            /*this helper function handles unit stride only*/
            /* rs_at is passed as the helper's leading dimension (cs_a)
               and the unit cs_at as its rs_a; see prototype above. */
            bli_post_hemv_8x8(A11, x1, y1, alpha, rs_at, cs_at);
        }
        else
        {
            /* Scalar fallback: rank-1-style update of the diagonal
               block, one column k at a time. */
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;
                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx, *chi11,
                                     conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                for ( j = 0; j < f_behind; ++j )
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a10t + j*cs_at),
                                       *(y01 + j*incy) );
                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );
                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );
                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
    }
}
/* Instantiate the reference template for the remaining datatypes; 'd' is
   provided by the hand-written bli_dhemv_unf_var1 above. */
GENTFUNC(float, s, hemv_unf_var1)
GENTFUNC(scomplex, c, hemv_unf_var1)
GENTFUNC(dcomplex, z, hemv_unf_var1)

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -216,210 +216,6 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_pre_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var3
(
uplo_t uplo,
conj_t conja,
conj_t conjx,
conj_t conjh,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* one = PASTEMAC(d,1);
double* zero = PASTEMAC(d,0);
double* A11;
double* A21;
double* a10t;
double* alpha11;
double* a21;
double* x1;
double* x2;
double* chi11;
double* y1;
double* y2;
double* y01;
double* psi11;
double* y21;
double conjx_chi11;
double alpha_chi11;
double alpha11_temp;
dim_t i, k, j;
dim_t b_fuse, f;
dim_t n_ahead;
dim_t f_ahead, f_behind;
inc_t rs_at, cs_at;
conj_t conj0 = 0, conj1 = 0;
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters. */
if ( bli_is_lower( uplo ) )
{
rs_at = rs_a;
cs_at = cs_a;
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
}
/* If beta is zero, use setv. Otherwise, scale by beta. */
if ( PASTEMAC(d,eq0)( *beta ) )
{
/* y = 0; */
PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
zero,
y, incy,
cntx,
NULL
);
}
else
{
/* y = beta * y; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
beta,
y, incy,
cntx,
NULL
);
}
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
arch_t id = bli_arch_query_id();
bool bamdzen = ((id == BLIS_ARCH_ZEN4) || (id == BLIS_ARCH_ZEN3)
|| (id == BLIS_ARCH_ZEN2) || (id == BLIS_ARCH_ZEN));
if (bamdzen)
{
kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_dotxaxpyf_ker =
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
b_fuse =
bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
}
for ( i = 0; i < m; i += f )
{
f = bli_determine_blocksize_dim_f( i, m, b_fuse );
n_ahead = m - i - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
y1 = y + (i )*incy;
y2 = y + (i+f)*incy;
/* y1 = y1 + alpha * A11 * x1; (variant 4) */
if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
{
/*this helper function handles unit stride only*/
bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
}
else
{
for ( k = 0; k < f; ++k )
{
f_behind = k;
f_ahead = f - k - 1;
a10t = A11 + (k )*rs_at + (0 )*cs_at;
alpha11 = A11 + (k )*rs_at + (k )*cs_at;
a21 = A11 + (k+1)*rs_at + (k )*cs_at;
chi11 = x1 + (k )*incx;
y01 = y1 + (0 )*incy;
psi11 = y1 + (k )*incy;
y21 = y1 + (k+1)*incy;
/* y01 = y01 + alpha * a10t' * chi11; */
PASTEMAC(d,copycjs)( conjx,
*chi11, conjx_chi11 );
PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
alpha_chi11 );
{
for ( j = 0; j < f_behind; ++j )
{
PASTEMAC(d,axpys)
( alpha_chi11,
*(a10t + j*cs_at),
*(y01 + j*incy) );
}
}
PASTEMAC(d,copycjs)( conja, *alpha11,
alpha11_temp );
/* psi11 = psi11 + alpha * alpha11 * chi11; */
PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
*psi11 );
/* y21 = y21 + alpha * a21 * chi11; */
for ( j = 0; j < f_ahead; ++j )
{
PASTEMAC(d,axpys)( alpha_chi11,
*(a21 + j*rs_at),
*(y21 + j*incy) );
}
}
}
/* y1 = y1 + alpha * A21' * x2; (dotxf) */
/* y2 = y2 + alpha * A21 * x1; (axpyf) */
kfp_dotxaxpyf_ker
(
conj0,
conj1,
conjx,
conjx,
n_ahead,
f,
alpha,
A21, rs_at, cs_at,
x2, incx,
x1, incx,
one,
y1, incy,
y2, incy,
cntx
);
}
}
GENTFUNC(float, s, hemv_unf_var3)
GENTFUNC(scomplex, c, hemv_unf_var3)
GENTFUNC(dcomplex, z, hemv_unf_var3)
#else
INSERT_GENTFUNC_BASIC0( hemv_unf_var3 )
#endif

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 3 of hemv/symv: processes the diagonal
 * block A11 first, then the panel *below* it (A21) with the fused
 * dotxaxpyf kernel — the mirror image of variant 1, which handles the
 * panel behind (A10).  conjh carries the conjugation component of the
 * Hermitian transpose (hemv conjugates, symv does not).  Written for the
 * lower-triangular case; the upper case swaps strides and toggles conj
 * parameters.  Instantiated for s, c and z at the bottom of this file;
 * 'd' is provided by the hand-written bli_dhemv_unf_var3 below.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conja, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  one  = PASTEMAC(ch,1); \
    ctype*  zero = PASTEMAC(ch,0); \
    ctype*  A11; \
    ctype*  A21; \
    ctype*  a10t; \
    ctype*  alpha11; \
    ctype*  a21; \
    ctype*  x1; \
    ctype*  x2; \
    ctype*  chi11; \
    ctype*  y1; \
    ctype*  y2; \
    ctype*  y01; \
    ctype*  psi11; \
    ctype*  y21; \
    ctype   conjx_chi11; \
    ctype   alpha_chi11; \
    ctype   alpha11_temp; \
    dim_t   i, k, j; \
    dim_t   b_fuse, f; \
    dim_t   n_ahead; \
    dim_t   f_ahead, f_behind; \
    inc_t   rs_at, cs_at; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_at = rs_a; \
        cs_at = cs_a; \
\
        conj0 = bli_apply_conj( conjh, conja ); \
        conj1 = conja; \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_at = cs_a; \
        cs_at = rs_a; \
\
        conj0 = conja; \
        conj1 = bli_apply_conj( conjh, conja ); \
    } \
\
    /* If beta is zero, use setv. Otherwise, scale by beta. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          zero, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
    else \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          m, \
          beta, \
          y, incy, \
          cntx, \
          NULL \
        ); \
    } \
\
    PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
    for ( i = 0; i < m; i += f ) \
    { \
        f       = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
        n_ahead = m - i - f; \
        A11     = a + (i  )*rs_at + (i  )*cs_at; \
        A21     = a + (i+f)*rs_at + (i  )*cs_at; \
        x1      = x + (i  )*incx; \
        x2      = x + (i+f)*incx; \
        y1      = y + (i  )*incy; \
        y2      = y + (i+f)*incy; \
\
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */ \
        for ( k = 0; k < f; ++k ) \
        { \
            f_behind = k; \
            f_ahead  = f - k - 1; \
            a10t     = A11 + (k  )*rs_at + (0  )*cs_at; \
            alpha11  = A11 + (k  )*rs_at + (k  )*cs_at; \
            a21      = A11 + (k+1)*rs_at + (k  )*cs_at; \
            chi11    = x1  + (k  )*incx; \
            y01      = y1  + (0  )*incy; \
            psi11    = y1  + (k  )*incy; \
            y21      = y1  + (k+1)*incy; \
\
            /* y01 = y01 + alpha * a10t' * chi11; */ \
            PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
            PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
            if ( bli_is_conj( conj0 ) ) \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_behind; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
            } \
\
            /* For hemv, explicitly set the imaginary component of alpha11 to
               zero. */ \
            PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
            if ( bli_is_conj( conjh ) ) \
                PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
            /* psi11 = psi11 + alpha * alpha11 * chi11; */ \
            PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
            /* y21 = y21 + alpha * a21 * chi11; */ \
            if ( bli_is_conj( conj1 ) ) \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
            else \
            { \
                for ( j = 0; j < f_ahead; ++j ) \
                    PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
            } \
        } \
\
        /* y1 = y1 + alpha * A21' * x2;  (dotxf) */ \
        /* y2 = y2 + alpha * A21  * x1;  (axpyf) */ \
        kfp_xf \
        ( \
          conj0, \
          conj1, \
          conjx, \
          conjx, \
          n_ahead, \
          f, \
          alpha, \
          A21, rs_at, cs_at, \
          x2,  incx, \
          x1,  incx, \
          one, \
          y1,  incy, \
          y2,  incy, \
          cntx \
        ); \
    } \
}
/*
 * AMD (zen) helper: applies the f == 8 diagonal-block update of hemv
 * variant 3 for double-precision data.  The caller below guards the call
 * with (f == 8 && incx == 1 && incy == 1 && rs_at == 1) and passes cs_at
 * as this function's cs_a (the leading dimension) and the unit rs_at as
 * rs_a.  NOTE(review): defined in an AMD-specific kernel file not visible
 * here; confirm the stride convention against that definition.
 */
void bli_pre_hemv_8x8
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   cs_a,
       dim_t   rs_a
     );
/*
 * Double-precision (real) specialization of unfused hemv/symv variant 3.
 *
 * Computes y := beta*y + alpha * A * x with A symmetric, processing the
 * diagonal block A11 first (via the AMD 8x8 helper when f == 8 with unit
 * strides, else a scalar loop) and then the panel below it (A21) with a
 * fused dotxf+axpyf (dotxaxpyf) kernel.  For real data the conjugation
 * parameters are no-ops, so conj0/conj1 are fixed to 0.  This function
 * overrides the type-generic template for 'd'; the s/c/z instantiations
 * are generated by GENTFUNC below.
 */
void bli_dhemv_unf_var3
     (
       uplo_t  uplo,
       conj_t  conja,
       conj_t  conjx,
       conj_t  conjh,
       dim_t   m,
       double* alpha,
       double* a, inc_t rs_a, inc_t cs_a,
       double* x, inc_t incx,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    const num_t dt = PASTEMAC(d,type);
    double*  one  = PASTEMAC(d,1);
    double*  zero = PASTEMAC(d,0);
    double*  A11;
    double*  A21;
    double*  a10t;
    double*  alpha11;
    double*  a21;
    double*  x1;
    double*  x2;
    double*  chi11;
    double*  y1;
    double*  y2;
    double*  y01;
    double*  psi11;
    double*  y21;
    double   conjx_chi11;
    double   alpha_chi11;
    double   alpha11_temp;
    dim_t    i, k, j;
    dim_t    b_fuse, f;
    dim_t    n_ahead;
    dim_t    f_ahead, f_behind;
    inc_t    rs_at, cs_at;
    /* Real data: conjugation is a no-op, so both kernel conj arguments
       stay at 0 (no conjugate) regardless of uplo. */
    conj_t   conj0 = 0, conj1 = 0;
    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters. */
    if ( bli_is_lower( uplo ) )
    {
        rs_at = rs_a;
        cs_at = cs_a;
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_at = cs_a;
        cs_at = rs_a;
    }
    /* If beta is zero, use setv. Otherwise, scale by beta. */
    if ( PASTEMAC(d,eq0)( *beta ) )
    {
        /* y = 0; */
        PASTEMAC2(d,setv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          zero,
          y, incy,
          cntx,
          NULL
        );
    }
    else
    {
        /* y = beta * y; */
        PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          m,
          beta,
          y, incy,
          cntx,
          NULL
        );
    }
    PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if (bli_cpuid_is_avx_supported() == TRUE)
    {
        /* AVX path: hard-wire the zen fused kernel and its fusing
           factor of 8 (also the trigger for the 8x8 helper below). */
        kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
        b_fuse = 8;
    }
    else
    {
        /* cntx may legally arrive as NULL; fall back to the global
           kernel structure before querying. */
        if ( cntx == NULL ) cntx = bli_gks_query_cntx();
        kfp_dotxaxpyf_ker =
            bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx);
        b_fuse =
            bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx );
    }
    for ( i = 0; i < m; i += f )
    {
        f       = bli_determine_blocksize_dim_f( i, m, b_fuse );
        n_ahead = m - i - f;
        A11     = a + (i  )*rs_at + (i  )*cs_at;
        A21     = a + (i+f)*rs_at + (i  )*cs_at;
        x1      = x + (i  )*incx;
        x2      = x + (i+f)*incx;
        y1      = y + (i  )*incy;
        y2      = y + (i+f)*incy;
        /* y1 = y1 + alpha * A11 * x1;  (variant 4) */
        if((f == 8) && (incx == 1) && (incy == 1) && (rs_at == 1))
        {
            /*this helper function handles unit stride only*/
            /* cs_at is passed as the helper's leading dimension (cs_a)
               and the unit rs_at as its rs_a; see prototype above. */
            bli_pre_hemv_8x8(A11, x1, y1, alpha, cs_at, rs_at);
        }
        else
        {
            /* Scalar fallback: rank-1-style update of the diagonal
               block, one column k at a time. */
            for ( k = 0; k < f; ++k )
            {
                f_behind = k;
                f_ahead  = f - k - 1;
                a10t     = A11 + (k  )*rs_at + (0  )*cs_at;
                alpha11  = A11 + (k  )*rs_at + (k  )*cs_at;
                a21      = A11 + (k+1)*rs_at + (k  )*cs_at;
                chi11    = x1  + (k  )*incx;
                y01      = y1  + (0  )*incy;
                psi11    = y1  + (k  )*incy;
                y21      = y1  + (k+1)*incy;
                /* y01 = y01 + alpha * a10t' * chi11; */
                PASTEMAC(d,copycjs)( conjx,
                                     *chi11, conjx_chi11 );
                PASTEMAC(d,scal2s)( *alpha, conjx_chi11,
                                    alpha_chi11 );
                {
                    for ( j = 0; j < f_behind; ++j )
                    {
                        PASTEMAC(d,axpys)
                        ( alpha_chi11,
                          *(a10t + j*cs_at),
                          *(y01 + j*incy) );
                    }
                }
                PASTEMAC(d,copycjs)( conja, *alpha11,
                                     alpha11_temp );
                /* psi11 = psi11 + alpha * alpha11 * chi11; */
                PASTEMAC(d,axpys)( alpha_chi11, alpha11_temp,
                                   *psi11 );
                /* y21 = y21 + alpha * a21 * chi11; */
                for ( j = 0; j < f_ahead; ++j )
                {
                    PASTEMAC(d,axpys)( alpha_chi11,
                                       *(a21 + j*rs_at),
                                       *(y21 + j*incy) );
                }
            }
        }
        /* y1 = y1 + alpha * A21' * x2;  (dotxf) */
        /* y2 = y2 + alpha * A21  * x1;  (axpyf) */
        kfp_dotxaxpyf_ker
        (
          conj0,
          conj1,
          conjx,
          conjx,
          n_ahead,
          f,
          alpha,
          A21, rs_at, cs_at,
          x2,  incx,
          x1,  incx,
          one,
          y1,  incy,
          y2,  incy,
          cntx
        );
    }
}
/* Instantiate the reference template for the remaining datatypes; 'd' is
   provided by the hand-written bli_dhemv_unf_var3 above. */
GENTFUNC(float, s, hemv_unf_var3)
GENTFUNC(scomplex, c, hemv_unf_var3)
GENTFUNC(dcomplex, z, hemv_unf_var3)

View File

@@ -158,217 +158,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#endif

View File

@@ -0,0 +1,369 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Type-generic unfused variant 1 of her2/syr2: the rank-2 update
 * C := C + alpha * x * y' + conj(alpha) * y * x' on the stored triangle.
 * conjh carries the conjugation component of the Hermitian transpose
 * (her2 conjugates and zeroes the imaginary part of the diagonal; syr2
 * does not).  Written for the lower-triangular case; the upper case swaps
 * strides and toggles conj parameters.  Each row i updates c10t (the row
 * behind the diagonal) via a fused axpy2v kernel plus a scalar diagonal
 * update.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  x0; \
    ctype*  chi1; \
    ctype*  y0; \
    ctype*  psi1; \
    ctype*  c10t; \
    ctype*  gamma11; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_chi1; \
    ctype   alpha1_psi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjx0_chi1; \
    ctype   conjy1_psi1; \
    ctype   conjy0_psi1; \
    dim_t   i; \
    dim_t   n_behind; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
\
    /* The algorithm will be expressed in terms of the lower triangular case;
       the upper triangular case is supported by swapping the row and column
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being invoked
           as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the Hermitian
       transpose, if applicable) to conjx and/or conjy as needed to arrive at
       the effective conjugation for the vector subproblems. */ \
    conj0 = bli_apply_conj( conjh, conjy ); \
    conj1 = bli_apply_conj( conjh, conjx ); \
\
    PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_behind = i; \
        x0       = x + (0  )*incx; \
        chi1     = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        psi1     = y + (i  )*incy; \
        c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \
        PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \
        PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
\
        /* c10t = c10t + alpha * chi1 * y0'; */ \
        /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
        kfp_2v \
        ( \
          conj0, \
          conj1, \
          n_behind, \
          &alpha0_chi1, \
          &alpha1_psi1, \
          y0, incy, \
          x0, incx, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
/**
 * Declaration of the AMD (zen) helper that computes double-precision her2
 * for the transposed case.  It handles the triangular part of the matrix
 * together with the remaining computation in an optimized way to gain
 * performance.  a points into the triangular matrix, x and y are the
 * input vectors.  At the call site below, m is passed as n_behind + 1
 * (the current row length) and lda as the column stride; the caller
 * guards on unit incx/incy/rs_ct and advances by 4 rows per call —
 * NOTE(review): confirm the 4-row batch behavior against the kernel
 * definition, which is not visible in this file.
 */
void bli_dher2_trans_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   m,
       dim_t   lda
     );
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)

View File

@@ -166,192 +166,5 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var4
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
double* chi1;
double* x2;
double* psi1;
double* y2;
double* gamma11;
double* c21;
double alpha0;
double alpha0_psi1;
double alpha1_chi1;
double alpha0_chi1_psi1;
dim_t i;
dim_t n_ahead;
inc_t rs_ct, cs_ct;
const num_t dt = PASTEMAC(d,type);
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
PASTEMAC(d,copys)( *alpha, alpha0 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if((incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
if((n_ahead >= 3))
{
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
i+= 4;
}
else
{
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i)
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#endif

View File

@@ -0,0 +1,354 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conjx, \
conj_t conjy, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* chi1; \
ctype* x2; \
ctype* psi1; \
ctype* y2; \
ctype* gamma11; \
ctype* c21; \
ctype alpha0; \
ctype alpha1; \
ctype alpha0_psi1; \
ctype alpha1_chi1; \
ctype alpha0_chi1_psi1; \
ctype conjy0_psi1; \
ctype conjx1_chi1; \
ctype conjx0_chi1; \
dim_t i; \
dim_t n_ahead; \
inc_t rs_ct, cs_ct; \
conj_t conj0, conj1; \
conj_t conjh_conjx; \
conj_t conjh_conjy; \
\
/* Eliminate unused variable warnings. */ \
( void )conjh_conjx; \
( void )conjh_conjy; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_ct = rs_c; \
cs_ct = cs_c; \
\
PASTEMAC(ch,copys)( *alpha, alpha0 ); \
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_ct = cs_c; \
cs_ct = rs_c; \
\
/* Toggle conjugation of conjx/conjy, but only if we are being invoked
as her2; for syr2, conjx/conjy are unchanged. */ \
conjx = bli_apply_conj( conjh, conjx ); \
conjy = bli_apply_conj( conjh, conjy ); \
\
PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
PASTEMAC(ch,copys)( *alpha, alpha1 ); \
} \
\
/* Apply conjh (which carries the conjugation component of the Hermitian
transpose, if applicable) to conjx and/or conjy as needed to arrive at
the effective conjugation for the vector subproblems. */ \
conj0 = conjx; \
conj1 = conjy; \
conjh_conjx = bli_apply_conj( conjh, conjx ); \
conjh_conjy = bli_apply_conj( conjh, conjy ); \
\
PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
\
/* Query the context for the kernel function pointer. */ \
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
n_ahead = m - i - 1; \
chi1 = x + (i )*incx; \
x2 = x + (i+1)*incx; \
psi1 = y + (i )*incy; \
y2 = y + (i+1)*incy; \
gamma11 = c + (i )*rs_ct + (i )*cs_ct; \
c21 = c + (i+1)*rs_ct + (i )*cs_ct; \
\
/* Apply conjx and/or conjy to chi1 and/or psi1. */ \
PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
\
/* Compute scalars for vector subproblems. */ \
PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
\
/* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
already been conjugated, if needed, by conjx and conjy. */ \
PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
\
/* c21 = c21 + alpha * x2 * conj(psi1); */ \
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
kfp_2v \
( \
conj0, \
conj1, \
n_ahead, \
&alpha0_psi1, \
&alpha1_chi1, \
x2, incx, \
y2, incy, \
c21, rs_ct, \
cntx \
); \
\
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
+ conj(alpha) * psi1 * conj(chi1); */ \
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
/* For her2, explicitly set the imaginary component of gamma11 to
zero. */ \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( *gamma11 ); \
} \
}
/**
 * Prototype of the AMD zen fused her2 kernel used by the unit-stride
 * fast path of bli_dher2_unf_var4 below.  The call site advances four
 * columns per invocation, so the kernel is expected to update four
 * consecutive columns of the stored triangle per call -- NOTE(review):
 * confirm against the kernel definition in the zen kernel set.
 *
 *   a     - address of the diagonal element starting the block
 *   x, y  - the input vectors
 *   alpha - scaling factor
 *   m     - length of the first updated column (including the diagonal)
 *   lda   - column stride of a
 */
void bli_dher2_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t m,
       dim_t lda
     );

/**
 * Double-precision her2/syr2, unfused variant 4 (column-wise traversal):
 *
 *   C := C + alpha * x * y' + conj(alpha) * y * x'
 *
 * referencing only the triangle of C selected by uplo.  For real data
 * conjugation is a no-op, so a single scalar alpha0 serves both rank-1
 * contributions and no conjx/conjy toggling is needed.
 *
 * When x, y and the columns of C are all unit stride, groups of four
 * columns are handed to the fused kernel bli_dher2_zen_int_4; the
 * remaining columns, and the general-stride case, use the axpy2v
 * kernel queried from the context.
 */
void bli_dher2_unf_var4
     (
       uplo_t uplo,
       conj_t conjx,
       conj_t conjy,
       conj_t conjh,
       dim_t m,
       double* alpha,
       double* x, inc_t incx,
       double* y, inc_t incy,
       double* c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx
     )
{
	double* chi1;
	double* x2;
	double* psi1;
	double* y2;
	double* gamma11;
	double* c21;
	double alpha0;
	double alpha0_psi1;
	double alpha1_chi1;
	double alpha0_chi1_psi1;
	dim_t i;
	dim_t n_ahead;
	inc_t rs_ct, cs_ct;
	const num_t dt = PASTEMAC(d,type);

	/* The algorithm will be expressed in terms of the lower triangular
	 * case; the upper triangular case is supported by swapping the row
	 * and column strides of A and toggling some conj parameters.
	 */
	if ( bli_is_lower( uplo ) )
	{
		rs_ct = rs_c;
		cs_ct = cs_c;
		PASTEMAC(d,copys)( *alpha, alpha0 );
	}
	else /* if ( bli_is_upper( uplo ) ) */
	{
		rs_ct = cs_c;
		cs_ct = rs_c;
		/* For real data conj() is the identity, so no conjx/conjy
		 * toggling is required here; alpha is copied unchanged.
		 */
		PASTEMAC(d,copys)( *alpha, alpha0 );
	}
	/* Apply conjh (which carries the conjugation component of the
	 * Hermitian transpose, if applicable) to conjx and/or conjy as
	 * needed to arrive at the effective conjugation for the vector
	 * subproblems.
	 */
	PASTECH(d,axpy2v_ker_ft) kfp_2v;
	/* Query the context for the axpy2v kernel function pointer. */
	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
	if((incx == 1) && (incy == 1) && (rs_ct == 1))
	{
		/* Unit-stride fast path: process four columns per iteration
		 * with the fused zen kernel where possible.
		 */
		for ( i = 0; i < m; )
		{
			n_ahead = m - i - 1;
			chi1 = x + (i ) * incx;
			x2 = x + (i+1) * incx;
			psi1 = y + (i ) * incy;
			y2 = y + (i+1) * incy;
			gamma11 = c + (i ) + (i )*cs_ct;
			c21 = c + (i+1) + (i )*cs_ct;
			/* n_ahead >= 3 guarantees i+3 <= m-1, i.e. at least four
			 * columns remain, so the fused kernel (which advances i by
			 * four) stays within bounds.
			 */
			if((n_ahead >= 3))
			{
				bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
				i+= 4;
			}
			else
			{
				/* Compute scalars for vector subproblems. */
				PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
				PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
				/* Compute alpha * chi1 * conj(psi1); conjugation is a
				 * no-op for real data.
				 */
				PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
				                    alpha0_chi1_psi1 );
				/* c21 = c21 + alpha * x2 * conj(psi1); */
				/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
				kfp_2v
				(
				  conjx,
				  conjy,
				  n_ahead,
				  &alpha0_psi1,
				  &alpha1_chi1,
				  x2, incx,
				  y2, incy,
				  c21, rs_ct,
				  cntx
				);
				/* gamma11 receives both rank-1 contributions, which
				 * coincide for real data.
				 */
				PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
				PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
				i+=1;
			}
		}
	}
	else
	{
		/* General-stride path: one column of the triangle per
		 * iteration, using the context's axpy2v kernel.
		 */
		for ( i = 0; i < m; ++i)
		{
			n_ahead = m - i - 1;
			chi1 = x + (i ) * incx;
			x2 = x + (i+1) * incx;
			psi1 = y + (i ) * incy;
			y2 = y + (i+1) * incy;
			gamma11 = c + (i ) + (i )*cs_ct;
			c21 = c + (i+1) + (i )*cs_ct;
			/* Compute scalars for vector subproblems. */
			PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
			PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
			/* Compute alpha * chi1 * conj(psi1); conjugation is a
			 * no-op for real data.
			 */
			PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
			                    alpha0_chi1_psi1 );
			/* c21 = c21 + alpha * x2 * conj(psi1); */
			/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
			kfp_2v
			(
			  conjx,
			  conjy,
			  n_ahead,
			  &alpha0_psi1,
			  &alpha1_chi1,
			  x2, incx,
			  y2, incy,
			  c21, rs_ct,
			  cntx
			);
			PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
			PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
		}
	}
}
/* Instantiate the generic template (defined above) for the remaining
   datatypes; the double instance is the hand-written function above. */
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -231,421 +231,4 @@ void PASTEMAC(ch,varname) \
} \
}
#ifdef BLIS_CONFIG_EPYC
void bli_dtrsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* one = PASTEMAC(d,1);
double* minus_one = PASTEMAC(d,m1);
double* A10;
double* A11;
double* A12;
double* a10t;
double* alpha11;
double* a12t;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,dotxf_ker_ft) kfp_df;
/* Assign kernel function pointer and fusing factor. */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_df = bli_ddotxf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(d,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
void bli_strsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* one = PASTEMAC(s,1);
float* minus_one = PASTEMAC(s,m1);
float* A10;
float* A11;
float* A12;
float* a10t;
float* alpha11;
float* a12t;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s,dotxf_ker_ft) kfp_df;
/* Assign kernel function pointer and fusing factor. */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_df = bli_sdotxf_zen_int_8;
b_fuse = 8;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(s,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )
#else
INSERT_GENTFUNC_BASIC0( trsv_unf_var1 )
#endif

View File

@@ -0,0 +1,638 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* minus_one = PASTEMAC(ch,m1); \
ctype* A10; \
ctype* A11; \
ctype* A12; \
ctype* a10t; \
ctype* alpha11; \
ctype* a12t; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha11_conj; \
ctype rho1; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_behind, f_behind; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* x = alpha * x; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
alpha, \
x, incx, \
cntx, \
NULL \
); \
\
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A12 = a + (i )*rs_at + (i+f)*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x1 = x1 - A12 * x2; */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / triu( A11 ); */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_behind = k; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a12t = A11 + (l )*rs_at + (l+1)*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* chi11 = chi11 - a12t * x21; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A10 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x1 = x1 - A10 * x0; */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A10, cs_at, rs_at, \
x0, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / tril( A11 ); */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_behind = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a10t = A11 + (l )*rs_at + (0 )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* chi11 = chi11 - a10t * x01; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
}
// bli_dtrsv_unf_var1: double-precision triangular solve (TRSV), unblocked
// fused variant 1 (dotxf-based). Solves op(A) * x = alpha * x in place,
// where A is an m x m triangular matrix (strides rs_a/cs_a), op() is
// selected by transa, and diaga says whether the diagonal is unit.
// The solve proceeds in panels of up to b_fuse rows; the bulk of the
// update is done by a fused dotxf kernel and the small triangular tip is
// solved with scalar dot/sub/invscal macros.
void bli_dtrsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* one = PASTEMAC(d,1);
double* minus_one = PASTEMAC(d,m1);
double* A10;
double* A11;
double* A12;
double* a10t;
double* alpha11;
double* a12t;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
)
;
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo so the
// remainder of the routine only deals with a "no transpose" view.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
// AVX path: hand-written zen dotxf kernel with fixed fusing factor 8.
kfp_df = bli_ddotxf_zen_int_8;
b_fuse = 8;
}
else
{
// Generic path: query the context (lazily initialized) for the
// architecture's dotxf kernel and its default fusing blocksize.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(d,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: march panels from the bottom of the matrix up.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
// Back-substitution within the f x f triangular tip, last row first.
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: march panels from the top of the matrix down.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
// Forward substitution within the f x f triangular tip.
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(d,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(d,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(d,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
// bli_strsv_unf_var1: single-precision counterpart of bli_dtrsv_unf_var1.
// Solves op(A) * x = alpha * x in place using a fused dotxf kernel for the
// panel update and scalar macros for the triangular tip. Structure is
// identical to the double-precision routine; only the datatype differs.
void bli_strsv_unf_var1
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* one = PASTEMAC(s,1);
float* minus_one = PASTEMAC(s,m1);
float* A10;
float* A11;
float* A12;
float* a10t;
float* alpha11;
float* a12t;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float rho1;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_behind, f_behind;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
// AVX path: hand-written zen dotxf kernel, fixed fusing factor 8.
kfp_df = bli_sdotxf_zen_int_8;
b_fuse = 8;
}
else
{
// Generic path: take kernel and fusing blocksize from the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
num_t dt = PASTEMAC(s,type);
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: back substitution, panels from the bottom up.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_behind = iter;
A11 = a + (i )*rs_at + (i )*cs_at;
A12 = a + (i )*rs_at + (i+f)*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 - A12 * x2; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A12, cs_at, rs_at,
x2, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_behind = k;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a12t = A11 + (l )*rs_at + (l+1)*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 - a12t * x21; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: forward substitution, panels from the top down.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_behind = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A10 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 - A10 * x0; */
kfp_df
(
conja,
BLIS_NO_CONJUGATE,
n_behind,
f,
minus_one,
A10, cs_at, rs_at,
x0, incx,
one,
x1, incx,
cntx
);
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_behind = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a10t = A11 + (l )*rs_at + (0 )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 - a10t * x01; */
PASTEMAC(s,set0s)( rho1 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
else
{
for ( j = 0; j < f_behind; ++j )
PASTEMAC(s,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 );
}
PASTEMAC(s,subs)( rho1, *chi11 );
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s,invscals)( alpha11_conj, *chi11 );
}
}
}
}
}
// Instantiate the scomplex (c) and dcomplex (z) variants of trsv_unf_var1
// from the type-generic macro template defined earlier in this file; the
// s and d variants are hand-written above.
INSERT_GENTFUNC_BASIC0_CZ( trsv_unf_var1 )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -228,805 +228,5 @@ void PASTEMAC(ch,varname) \
} \
} \
}
#ifdef BLIS_CONFIG_EPYC
// bli_dtrsv_unf_var2: double-precision TRSV, unblocked fused variant 2
// (axpyf-based). Solves op(A) * x = alpha * x in place: the f x f
// triangular tip is solved with scalar macros, then the remaining part of
// x is updated with a fused axpyf kernel.
void bli_dtrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
cntx_t* cntx
)
{
double* minus_one = PASTEMAC(d,m1);
double* A01;
double* A11;
double* A21;
double* a01;
double* alpha11;
double* a21;
double* x0;
double* x1;
double* x2;
double* x01;
double* chi11;
double* x21;
double alpha11_conj;
double minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if ( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(d,axpyf_ker_ft) kfp_af;
/* Assign kernel function pointer and fusing factor. */
// NOTE(review): unlike var1 (which gates on bli_cpuid_is_avx_supported()),
// this variant selects the zen kernels by an explicit arch-id list --
// confirm the two dispatch styles are intentionally different.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
// Zen path: hand-written axpyf kernel with fixed fusing factor 4.
kfp_af = bli_daxpyf_zen_int_16x4;
b_fuse = 4;
}
else
{
// Generic path: take kernel and fusing blocksize from the context.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: panels from the bottom up; after solving the tip,
// propagate x1 into the rows ahead (above) via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: panels from the top down; propagate x1 into the
// rows ahead (below) via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(d,copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(d,invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(d,neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(d,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_strsv_unf_var2: single-precision TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the double-precision routine above;
// only the datatype, zen kernel, and fusing factor differ.
void bli_strsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
float* alpha,
float* a, inc_t rs_a, inc_t cs_a,
float* x, inc_t incx,
cntx_t* cntx
)
{
float* minus_one = PASTEMAC(s, m1);
float* A01;
float* A11;
float* A21;
float* a01;
float* alpha11;
float* a21;
float* x0;
float* x1;
float* x2;
float* x01;
float* chi11;
float* x21;
float alpha11_conj;
float minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(s, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(s, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_saxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_FLOAT, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_FLOAT, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(s, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(s, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(s, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(s, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_ztrsv_unf_var2: double-complex TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the real-domain routines above; the
// conj flag extracted from transa selects conjugating scalar macros.
void bli_ztrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
dcomplex* alpha,
dcomplex* a, inc_t rs_a, inc_t cs_a,
dcomplex* x, inc_t incx,
cntx_t* cntx
)
{
dcomplex* minus_one = PASTEMAC(z, m1);
dcomplex* A01;
dcomplex* A11;
dcomplex* A21;
dcomplex* a01;
dcomplex* alpha11;
dcomplex* a21;
dcomplex* x0;
dcomplex* x1;
dcomplex* x2;
dcomplex* x01;
dcomplex* chi11;
dcomplex* x21;
dcomplex alpha11_conj;
dcomplex minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(z, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(z, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_zaxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(z, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(z, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(z, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(z, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
// bli_ctrsv_unf_var2: single-complex TRSV, unblocked fused variant 2
// (axpyf-based). Same algorithm as the other datatype instances above.
void bli_ctrsv_unf_var2
(
uplo_t uploa,
trans_t transa,
diag_t diaga,
dim_t m,
scomplex* alpha,
scomplex* a, inc_t rs_a, inc_t cs_a,
scomplex* x, inc_t incx,
cntx_t* cntx
)
{
scomplex* minus_one = PASTEMAC(c, m1);
scomplex* A01;
scomplex* A11;
scomplex* A21;
scomplex* a01;
scomplex* alpha11;
scomplex* a21;
scomplex* x0;
scomplex* x1;
scomplex* x2;
scomplex* x01;
scomplex* chi11;
scomplex* x21;
scomplex alpha11_conj;
scomplex minus_chi11;
dim_t iter, i, k, j, l;
dim_t b_fuse, f;
dim_t n_ahead, f_ahead;
inc_t rs_at, cs_at;
uplo_t uploa_trans;
conj_t conja;
/* x = alpha * x; */
PASTEMAC2(c, scalv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
m,
alpha,
x, incx,
cntx,
NULL
);
if( bli_does_notrans( transa ) )
{
rs_at = rs_a;
cs_at = cs_a;
uploa_trans = uploa;
}
else /* if ( bli_does_trans( transa ) ) */
{
// Fold the transpose into swapped strides and a toggled uplo.
rs_at = cs_a;
cs_at = rs_a;
uploa_trans = bli_uplo_toggled( uploa );
}
conja = bli_extract_conj( transa );
PASTECH(c, axpyf_ker_ft) kfp_af;
/* Assign function pointer and fusing factor. */
// Zen architectures take the hand-written axpyf kernel; all others fall
// back to the kernel registered in the context.
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
kfp_af = bli_caxpyf_zen_int_5;
b_fuse = 5;
}
else
{
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
kfp_af = bli_cntx_get_l1f_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYF_KER, cntx );
b_fuse = bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_AF, cntx );
}
/* We reduce all of the possible cases down to just lower/upper. */
if ( bli_is_upper( uploa_trans ) )
{
// Upper triangular: solve the tip, then update rows above via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_b( iter, m, b_fuse );
i = m - iter - f;
n_ahead = i;
A11 = a + (i )*rs_at + (i )*cs_at;
A01 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x0 = x + (0 )*incx;
/* x1 = x1 / triu( A11 ); */
for ( k = 0; k < f; ++k )
{
l = f - k - 1;
f_ahead = l;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a01 = A11 + (0 )*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x01 = x1 + (0 )*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
}
/* x01 = x01 - chi11 * a01; */
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) );
}
}
/* x0 = x0 - A01 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A01, rs_at, cs_at,
x1, incx,
x0, incx,
cntx
);
}
}
else /* if ( bli_is_lower( uploa_trans ) ) */
{
// Lower triangular: solve the tip, then update rows below via axpyf.
for ( iter = 0; iter < m; iter += f )
{
f = bli_determine_blocksize_dim_f( iter, m, b_fuse );
i = iter;
n_ahead = m - iter - f;
A11 = a + (i )*rs_at + (i )*cs_at;
A21 = a + (i+f)*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
x2 = x + (i+f)*incx;
/* x1 = x1 / tril( A11 ); */
for ( k = 0; k < f; ++k )
{
l = k;
f_ahead = f - k - 1;
alpha11 = A11 + (l )*rs_at + (l )*cs_at;
a21 = A11 + (l+1)*rs_at + (l )*cs_at;
chi11 = x1 + (l )*incx;
x21 = x1 + (l+1)*incx;
/* chi11 = chi11 / alpha11; */
if ( bli_is_nonunit_diag( diaga ) )
{
PASTEMAC(c, copycjs)( conja, *alpha11, alpha11_conj );
PASTEMAC(c, invscals)( alpha11_conj, *chi11 );
}
/* x21 = x21 - chi11 * a21; */
PASTEMAC(c, neg2s)( *chi11, minus_chi11 );
if ( bli_is_conj( conja ) )
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
else
{
for ( j = 0; j < f_ahead; ++j )
PASTEMAC(c, axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) );
}
}
/* x2 = x2 - A21 * x1; */
kfp_af
(
conja,
BLIS_NO_CONJUGATE,
n_ahead,
f,
minus_one,
A21, rs_at, cs_at,
x1, incx,
x2, incx,
cntx
);
}
}
}
#else
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
#endif
// NOTE(review): this instantiation appears to duplicate the one inside the
// #else branch directly above (and would conflict with the hand-written
// s/d/c/z functions when BLIS_CONFIG_EPYC is defined). This looks like a
// diff-rendering artifact or leftover -- confirm against the committed file.
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )

File diff suppressed because it is too large Load Diff

View File

@@ -48,120 +48,6 @@ err_t bli_gemmsup_int
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
#ifdef BLIS_CONFIG_EPYC
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( a );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
dim_t jc_new;
dim_t ic_new;
//bli_gemmsup_ref_var2
//bli_gemmsup_ref_var1
#if 0
bli_gemmsup_ref_var1n
#else
#endif
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
#endif
// Don't use the small/unpacked implementation if one of the matrices
// uses general stride.
if ( stor_id == BLIS_XXX ) {
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
return BLIS_FAILURE;
}
if ( is_rrr_rrc_rcr_crr )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
// - Currently only row-preferential kernels are only supported.
// calculate number of micropanels in m and n dimensions and
// recalculate the automatic thread factorization based on these number of micropanels
const dim_t mu = m / MR;
const dim_t nu = n / NR;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/*Enable packing for B matrix for higher sizes*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_b( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
else
{
// This branch handles:
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
// - Currently only row-preferential kernels are only supported.
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/* Enable packing for B matrix for higher sizes. Note that pack A
* becomes pack B inside var2m because this is transpose case*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_a( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return BLIS_SUCCESS;
#else // #ifdef BLIS_CONFIG_EPYC
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
// Don't use the small/unpacked implementation if one of the matrices
@@ -335,8 +221,6 @@ err_t bli_gemmsup_int
// Return success so that the caller knows that we computed the solution.
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return BLIS_SUCCESS;
#endif
}
// -----------------------------------------------------------------------------
@@ -401,15 +285,9 @@ err_t bli_gemmtsup_int
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
#ifdef BLIS_CONFIG_EPYC
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = TRUE;// var1n is not implemented for GEMMT
#else
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
#endif
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
@@ -472,14 +350,10 @@ err_t bli_gemmtsup_int
// Decide which algorithm to use (block-panel var2m or panel-block
// var1n) based on the number of micropanels in the m and n dimensions.
// Also, recalculate the automatic thread factorization.
#ifdef BLIS_CONFIG_EPYC
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = TRUE; //var1n is not implemented for gemmt
#else
if ( mu >= nu ) use_bp = TRUE;
else /* if ( mu < nu ) */ use_bp = FALSE;
#endif
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.

View File

@@ -0,0 +1,352 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-21, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// bli_gemmsup_int: AMD-specific internal back-end for the small/unpacked
// (SUP) GEMM path. Computes C := beta*C + alpha*A*B via
// bli_gemmsup_ref_var2m, choosing the non-transposed or transposed variant
// from the combined storage of C, A, and B, recomputing the automatic
// thread factorization in units of micropanels, and enabling B-packing for
// large single-threaded float problems.
//
// Parameters: alpha/beta are scalars; a, b, c are the operand objects;
// cntx supplies blocksizes; rntm carries threading/packing settings (may
// be modified here); thread is this thread's thrinfo_t node.
// Returns BLIS_SUCCESS on completion, or BLIS_FAILURE when any operand
// uses general stride (the SUP path does not support it).
err_t bli_gemmsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( a );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
dim_t jc_new;
dim_t ic_new;
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
// Row-major-ish storage combinations are "primary" for the
// row-preferential kernels used here; all others are handled by
// logically transposing the operation.
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
#ifdef TRACEVAR
if ( bli_thread_am_ochief( thread ) )
printf( "bli_l3_sup_int(): var2m primary\n" );
#endif
// Don't use the small/unpacked implementation if one of the matrices
// uses general stride.
if ( stor_id == BLIS_XXX ) {
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
return BLIS_FAILURE;
}
if ( is_rrr_rrc_rcr_crr )
{
// This branch handles:
// - rrr rrc rcr crr for row-preferential kernels
// - rcc crc ccr ccc for column-preferential kernels
// - Currently only row-preferential kernels are only supported.
// calculate number of micropanels in m and n dimensions and
// recalculate the automatic thread factorization based on these number of micropanels
const dim_t mu = m / MR;
const dim_t nu = n / NR;
// If the parallel thread factorization was automatic, we update it
// with a new factorization based on the matrix dimensions in units
// of micropanels.
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/*Enable packing for B matrix for higher sizes*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_b( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
else
{
// This branch handles:
// - rrr rrc rcr crr for column-preferential kernels
// - rcc crc ccr ccc for row-preferential kernels
// - Currently only row-preferential kernels are only supported.
const dim_t mu = n / MR; // the n becomes m after a transposition
const dim_t nu = m / NR; // the m becomes n after a transposition
if ( auto_factor )
{
// In the block-panel algorithm, the m dimension is parallelized
// with ic_nt and the n dimension is parallelized with jc_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
// Update the ways of parallelism for the jc and ic loops, and then
// update the current thread's root thrinfo_t node according to the
// new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
bli_l3_sup_thrinfo_update_root( rntm, thread );
}
/* Enable packing for B matrix for higher sizes. Note that pack A
* becomes pack B inside var2m because this is transpose case*/
if(bli_is_float(dt) && (n_threads==1)) {
if((m > 240) && (k > 240) && (n > 240))
bli_rntm_set_pack_a( 1, rntm );
}
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
alpha, a, b, beta, c,
stor_id, cntx, rntm, thread );
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return BLIS_SUCCESS;
}
// -----------------------------------------------------------------------------
// bli_gemmtsup_int(): internal dispatcher for the small/unpacked (sup)
// GEMMT path.
//
// The function:
//  - rejects general-stride storage (the sup path does not support it);
//  - selects the "primary" orientation based on whether the gemmsup
//    microkernel prefers row or column storage of C;
//  - recomputes the automatic thread factorization in units of
//    micropanels; and
//  - invokes the block-panel reference macrokernel (var2m), with an
//    implicit transposition in the non-primary case.
//
// The panel-block variant (var1n) is not implemented for GEMMT, so
// use_bp is always TRUE; the var1n branches are retained for when that
// variant becomes available.
//
// Returns BLIS_SUCCESS when the operation was computed, or BLIS_FAILURE
// when the sup path cannot handle the problem (caller falls back).
err_t bli_gemmtsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
	// AOCL_DTL_LOG_GEMMT_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);

	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) {
		// Fixed typo in the trace message: "stide" -> "stride".
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stride.");
		return BLIS_FAILURE;
	}

	const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                  stor_id == BLIS_RRC ||
	                                  stor_id == BLIS_RCR ||
	                                  stor_id == BLIS_CRR );
	const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t dt       = bli_obj_dt( c );
	const bool  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	// The "primary" case is the one whose storage combination matches the
	// microkernel's preferred orientation; the other case is handled by
	// transposing the whole operation below.
	const bool  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                    : is_rcc_crc_ccr_ccc );

	const dim_t m = bli_obj_length( c );
	// C is square for gemmt, so n is taken equal to m.
	const dim_t n = m;

	const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

	const bool  auto_factor = bli_rntm_auto_factor( rntm );
	const dim_t n_threads   = bli_rntm_num_threads( rntm );

	bool  use_bp = TRUE;
	dim_t jc_new;
	dim_t ic_new;

	if ( is_primary )
	{
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels
		const dim_t mu = m / MR;
		const dim_t nu = n / NR;

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if ( mu >= nu )              use_bp = TRUE;
		else /* if ( mu < nu ) */    use_bp = TRUE; // var1n is not implemented for GEMMT

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var2m primary\n" );
			#endif
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var1n primary\n" );
			#endif
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of nc up to be a multiple of mr.
		}
	}
	else
	{
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels
		const dim_t mu = n / MR; // the n becomes m after a transposition
		const dim_t nu = m / NR; // the m becomes n after a transposition

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if ( mu >= nu )              use_bp = TRUE;
		else /* if ( mu < nu ) */    use_bp = TRUE; // var1n is not implemented for GEMMT

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var2m non-primary\n" );
			#endif
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
				printf( "bli_l3_sup_int(): var1n non-primary\n" );
			#endif
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of mc up to be a multiple of nr.
		}
	}

	// Return success so that the caller knows that we computed the solution.
	// (Added the terminating semicolon for consistency with the trace-exit
	// call at every other return site.)
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
	return BLIS_SUCCESS;
}

View File

@@ -177,19 +177,6 @@ void bli_gemm_front
dim_t m_dim_local = bli_obj_length( &c_local );
dim_t n_dim_local = bli_obj_width( &c_local );
dim_t k_dim_local = bli_obj_width( &a_local );
#ifdef BLIS_CONFIG_EPYC
// Regression observed in sgemm native path in cases where m >= 4 * n
// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
// the issue.
if( bli_obj_is_float( &c_local ) &&
( n_dim_local >= 1024 ) &&
( k_dim_local >= 1024 ) &&
( m_dim_local >= ( 4 * n_dim_local ) ) )
{
m_dim_local *= 2;
}
#endif
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any

View File

@@ -0,0 +1,413 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// bli_gemm_front(): front-end for the GEMM operation. Performs parameter
// checks and early exits, optional small-matrix dispatch, mixed-datatype
// setup, the storage-preference transposition optimization, and threading
// setup, then hands off to bli_gemm_int() via the thread decorator.
//
// NOTE(review): this appears to be the AMD-specific duplicate of the
// generic bli_gemm_front() -- it inlines unconditionally the sgemm
// thread-factorization workaround that the generic file had guarded with
// BLIS_CONFIG_EPYC (removed in the hunk above). Confirm against the
// build system's *_amd.c file selection.
void bli_gemm_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
	bli_init_once();

	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	// NOTE(review): this return (and the scalm return below) exits without
	// AOCL_DTL_TRACE_EXIT, leaving the trace nesting unbalanced -- confirm
	// whether that is intended.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
	// and return early.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
	     bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return;
	}

#ifdef BLIS_ENABLE_SMALL_MATRIX
	// Only handle small problems separately for homogeneous datatypes.
	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
	     bli_obj_dt( a ) == bli_obj_dt( c ) &&
	     bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
	{
		// If the small-matrix path accepts the problem, it computes the
		// full result and we are done.
		err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
		if ( status == BLIS_SUCCESS )
		{
			AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
			return;
		}
	}
#endif

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

#ifdef BLIS_ENABLE_GEMM_MD
	cntx_t cntx_local;

	// If any of the storage datatypes differ, or if the computation precision
	// differs from the storage precision of C, utilize the mixed datatype
	// code path.
	// NOTE: If we ever want to support the caller setting the computation
	// domain explicitly, we will need to check the computation dt against the
	// storage dt of C (instead of the computation precision against the
	// storage precision of C).
	if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
	     bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
	     bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
	{
		// Handle mixed datatype cases in bli_gemm_md(), which may modify
		// the objects or the context. (If the context is modified, cntx
		// is adjusted to point to cntx_local.)
		bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
	}
	//else // homogeneous datatypes
#endif

	// Load the pack schemas from the context and embed them into the objects
	// for A and B. (Native contexts are initialized with the correct pack
	// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
	// have made a copy and modified the schemas, so reading them from the
	// context should be a safe bet at this point.) This is a sort of hack for
	// communicating the desired pack schemas to bli_gemm_cntl_create() (via
	// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
	// to subsequently access the schemas from the control tree, which
	// hopefully reduces some confusion, particularly in bli_packm_init().
	const pack_t schema_a = bli_cntx_schema_a_block( cntx );
	const pack_t schema_b = bli_cntx_schema_b_panel( cntx );

	bli_obj_set_pack_schema( schema_a, &a_local );
	bli_obj_set_pack_schema( schema_b, &b_local );

	// Next, we handle the possibility of needing to typecast alpha to the
	// computation datatype and/or beta to the storage datatype of C.

	// Attach alpha to B, and in the process typecast alpha to the target
	// datatype of the matrix (which in this case is equal to the computation
	// datatype).
	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );

	// Attach beta to C, and in the process typecast beta to the target
	// datatype of the matrix (which in this case is equal to the storage
	// datatype of C).
	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local );

	// Change the alpha and beta pointers to BLIS_ONE since the values have
	// now been typecast and attached to the matrices above.
	alpha = &BLIS_ONE;
	beta  = &BLIS_ONE;

#ifdef BLIS_ENABLE_GEMM_MD
	// Don't perform the following optimization for ccr or crc cases, as
	// those cases are sensitive to the ukernel storage preference (ie:
	// transposing the operation would break them).
	if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
	     !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_local, &b_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );

		// We must also swap the pack schemas, which were set by bli_gemm_md()
		// or the inlined code above.
		bli_obj_swap_pack_schemas( &a_local, &b_local );
	}

	dim_t m_dim_local = bli_obj_length( &c_local );
	dim_t n_dim_local = bli_obj_width( &c_local );
	dim_t k_dim_local = bli_obj_width( &a_local );

	// Regression observed in sgemm native path in cases where m >= 4 * n
	// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit
	// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
	// the issue: inflate the m dimension seen by the thread partitioner
	// (the actual problem dimensions are unchanged).
	if( bli_obj_is_float( &c_local ) &&
	    ( n_dim_local >= 1024 ) &&
	    ( k_dim_local >= 1024 ) &&
	    ( m_dim_local >= ( 4 * n_dim_local ) ) )
	{
		m_dim_local *= 2;
	}

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  m_dim_local,
	  n_dim_local,
	  k_dim_local,
	  rntm
	);

	obj_t* cp    = &c_local;
	obj_t* betap = beta;

#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
	// If any of the following conditions are met, create a temporary matrix
	// conformal to C into which we will accumulate the matrix product:
	// - the storage precision of C differs from the computation precision;
	// - the domains are mixed as crr;
	// - the storage format of C does not match the preferred orientation
	//   of the ccr or crc cases.
	// Then, after the computation is complete, this matrix will be copied
	// or accumulated back to C.
	const bool is_ccr_mismatch =
	           ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
	             !bli_obj_is_col_stored( &c_local ) );
	const bool is_crc_mismatch =
	           ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
	             !bli_obj_is_row_stored( &c_local ) );

	obj_t ct;
	bool  use_ct = FALSE;

	// FGVZ: Consider adding another guard here that only creates and uses a
	// temporary matrix for accumulation if k < c * kc, where c is some small
	// constant like 2. And don't forget to use the same conditional for the
	// castm() and free() at the end.
	if (
	     bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
	     bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
	     is_ccr_mismatch ||
	     is_crc_mismatch
	   )
	{
		use_ct = TRUE;
	}

	// If we need a temporary matrix conformal to C for whatever reason,
	// we create it and prepare to use it now.
	if ( use_ct )
	{
		const dim_t m = bli_obj_length( &c_local );
		const dim_t n = bli_obj_width( &c_local );
		inc_t rs = bli_obj_row_stride( &c_local );
		inc_t cs = bli_obj_col_stride( &c_local );

		num_t dt_ct = bli_obj_domain( &c_local ) |
		              bli_obj_comp_prec( &c_local );

		// When performing the crr case, accumulate to a contiguously-stored
		// real matrix so we do not have to repeatedly update C with general
		// stride.
		if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
			dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );

		// When performing the mismatched ccr or crc cases, now is the time
		// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
		// microkernel can output directly to C (instead of using a temporary
		// microtile).
		if      ( is_ccr_mismatch ) { rs = 1; cs = m; }
		else if ( is_crc_mismatch ) { rs = n; cs = 1; }

		bli_obj_create( dt_ct, m, n, rs, cs, &ct );

		const num_t dt_exec = bli_obj_exec_dt( &c_local );
		const num_t dt_comp = bli_obj_comp_dt( &c_local );

		bli_obj_set_target_dt( dt_ct, &ct );
		bli_obj_set_exec_dt( dt_exec, &ct );
		bli_obj_set_comp_dt( dt_comp, &ct );

		// A naive approach would cast C to the computation datatype,
		// compute with beta, and then cast the result back to the
		// user-provided output matrix. However, we employ a different
		// approach that halves the number of memops on C (or its
		// typecast temporary) by writing the A*B product directly to
		// temporary storage, and then using xpbym to scale the
		// output matrix by beta and accumulate/cast the A*B product.
		//bli_castm( &c_local, &ct );
		betap = &BLIS_ZERO;

		cp = &ct;
	}
#endif
#endif

	// Invoke the internal back-end via the thread handler.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_local,
	  &b_local,
	  betap,
	  cp,
	  cntx,
	  rntm,
	  cntl
	);

#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
	// If we created a temporary matrix conformal to C for whatever reason,
	// we copy/accumulate the result back to C and then release the object.
	if ( use_ct )
	{
		obj_t beta_local;

		bli_obj_scalar_detach( &c_local, &beta_local );

		//bli_castnzm( &ct, &c_local );
		bli_xpbym( &ct, &beta_local, &c_local );

		bli_obj_free( &ct );
	}
#endif
#endif

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// -----------------------------------------------------------------------------
#if 0
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
{
const bool a_is_real = bli_obj_is_real( a );
const bool a_is_comp = bli_obj_is_complex( a );
const bool b_is_real = bli_obj_is_real( b );
const bool b_is_comp = bli_obj_is_complex( b );
const bool c_is_real = bli_obj_is_real( c );
const bool c_is_comp = bli_obj_is_complex( c );
const bool a_is_single = bli_obj_is_single_prec( a );
const bool a_is_double = bli_obj_is_double_prec( a );
const bool b_is_single = bli_obj_is_single_prec( b );
const bool b_is_double = bli_obj_is_double_prec( b );
const bool c_is_single = bli_obj_is_single_prec( c );
const bool c_is_double = bli_obj_is_double_prec( c );
const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
bli_obj_domain( c ) != bli_obj_domain( b );
( void )a_is_real; ( void )a_is_comp;
( void )b_is_real; ( void )b_is_comp;
( void )c_is_real; ( void )c_is_comp;
( void )a_is_single; ( void )a_is_double;
( void )b_is_single; ( void )b_is_double;
( void )c_is_single; ( void )c_is_double;
( void )comp_single; ( void )comp_double;
if (
//( c_is_comp && a_is_comp && b_is_real ) ||
//( c_is_comp && a_is_real && b_is_comp ) ||
//( c_is_real && a_is_comp && b_is_comp ) ||
//( c_is_comp && a_is_real && b_is_real ) ||
//( c_is_real && a_is_comp && b_is_real ) ||
//( c_is_real && a_is_real && b_is_comp ) ||
//FALSE
TRUE
)
{
if (
( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
( c_is_single && a_is_single && b_is_single && comp_single ) ||
( c_is_single && a_is_single && b_is_single && comp_double ) ||
( c_is_single && a_is_single && b_is_double ) ||
( c_is_single && a_is_double && b_is_single ) ||
( c_is_double && a_is_single && b_is_single ) ||
( c_is_single && a_is_double && b_is_double ) ||
( c_is_double && a_is_single && b_is_double ) ||
( c_is_double && a_is_double && b_is_single ) ||
( c_is_double && a_is_double && b_is_double && comp_single ) ||
( c_is_double && a_is_double && b_is_double && comp_double ) ||
( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
FALSE
)
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
else
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
}
else
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
return;
}
#else
#if 0
// If any of the storage datatypes differ, or if the execution precision
// differs from the storage precision of C, utilize the mixed datatype
// code path.
// NOTE: We could check the exec dt against the storage dt of C, but for
// now we don't support the caller setting the execution domain
// explicitly.
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
{
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
return;
}
#endif
#endif

View File

@@ -501,6 +501,25 @@ bool bli_cpuid_is_bulldozer
return TRUE;
}
// Return TRUE if the processor reports all of the CPU features required
// by the AVX code paths, FALSE otherwise. Despite the "avx" in the name,
// FMA3 and AVX2 support are required as well.
bool bli_cpuid_is_avx_supported( void )
{
	uint32_t family, model, features;

	// Execute the CPUID instruction and decode its results into a family
	// id, a model id, and a feature bit field. (The return value encodes
	// the vendor, which is not needed here.)
	bli_cpuid_query( &family, &model, &features );

	// All three feature bits must be present.
	const uint32_t required = FEATURE_AVX  |
	                          FEATURE_FMA3 |
	                          FEATURE_AVX2;

	return bli_cpuid_has_features( features, required );
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
arch_t bli_cpuid_query_id( void )

View File

@@ -133,7 +133,7 @@ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
void get_cpu_name( char *cpu_name );
int vpu_count( void );
bool bli_cpuid_is_avx_supported(void);
enum
{
@@ -160,6 +160,8 @@ enum
FEATURE_AVX512VL = 0x4000
};
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -98,217 +98,5 @@ f77_int PASTEF772(i,chx,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
// isamax_(): BLAS-compatible ISAMAX wrapper (single-precision real).
// Returns the 1-based index of the element of x with the largest absolute
// value, or 0 for an empty/invalid vector (netlib semantics). Dispatches
// to the hand-written zen kernel on AMD zen architectures and to the
// context-derived reference path otherwise.
f77_int isamax_
     (
       const f77_int* n,
       const float*   x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);

	dim_t   n0;
	float*  x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((float*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((float*)x);
		incx0 = ( inc_t )(*incx);
	}

	// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
	// This function is invoked on all architectures including generic.
	// Invoke architecture specific kernels only if we are sure that we are running on
	// zen, zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework
	// and context).
	arch_t id = bli_arch_query_id();
	bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
	               (id == BLIS_ARCH_ZEN3) ||
	               (id == BLIS_ARCH_ZEN2) ||
	               (id == BLIS_ARCH_ZEN);

	if (bamdzen)
	{
		/* Call BLIS kernel */
		bli_samaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_samaxv_ex()).
		PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
// idamax_(): BLAS-compatible IDAMAX wrapper (double-precision real).
// Returns the 1-based index of the element of x with the largest absolute
// value, or 0 for an empty/invalid vector (netlib semantics). Dispatches
// to the hand-written zen kernel on AMD zen architectures and to the
// context-derived reference path otherwise.
f77_int idamax_
     (
       const f77_int* n,
       const double*  x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);

	dim_t   n0;
	double* x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((double*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((double*)x);
		incx0 = ( inc_t )(*incx);
	}

	// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
	// This function is invoked on all architectures including generic.
	// Invoke architecture specific kernels only if we are sure that we are running on
	// zen, zen2, zen3 or zen4; otherwise fall back to reference kernels (via framework
	// and context).
	arch_t id = bli_arch_query_id();
	bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
	               (id == BLIS_ARCH_ZEN3) ||
	               (id == BLIS_ARCH_ZEN2) ||
	               (id == BLIS_ARCH_ZEN);

	if (bamdzen)
	{
		/* Call BLIS kernel */
		bli_damaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_damaxv_ex()).
		PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
#else
INSERT_GENTFUNC_BLAS( amax, amaxv )
#endif
#endif

295
frame/compat/bla_amax_amd.c Normal file
View File

@@ -0,0 +1,295 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC template for the BLAS i?amax routines: for a given element
// type (ftype_x) and type character (chx), it expands to a Fortran-style
// wrapper i<chx><blasname>_ (e.g. isamax_, idamax_) that converts BLAS
// calling conventions (1-based result index, negative increments, and a
// possibly different integer width) into a call to the BLIS expanded
// typed API bli_?amaxv_ex() and returns the 1-based index.
#undef GENTFUNC
#define GENTFUNC( ftype_x, chx, blasname, blisname ) \
\
f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(chx), *n, *incx) \
\
	dim_t    n0; \
	ftype_x* x0; \
	inc_t    incx0; \
	gint_t   bli_index; \
	f77_int  f77_index; \
\
	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */ \
	if ( *n < 1 || *incx <= 0 ) { \
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "iamax_: vector empty") \
		return 0; \
	}\
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Convert/typecast negative values of n to zero. */ \
	bli_convert_blas_dim1( *n, n0 ); \
\
	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */ \
	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
	/* Call BLIS interface. */ \
	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  n0, \
	  x0, incx0, \
	  &bli_index, \
	  NULL, \
	  NULL \
	); \
\
	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */ \
	f77_index = bli_index + 1; \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
	return f77_index; \
}
#ifdef BLIS_ENABLE_BLAS
// isamax_(): BLAS-compatible ISAMAX wrapper (single-precision real),
// AMD-optimized variant. Returns the 1-based index of the element of x
// with the largest absolute value, or 0 for an empty/invalid vector
// (netlib semantics). Uses a runtime AVX-capability check instead of a
// compile-time architecture macro to select the optimized kernel.
f77_int isamax_
     (
       const f77_int* n,
       const float*   x, const f77_int* incx
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
	AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx);

	dim_t   n0;
	float*  x0;
	inc_t   incx0;
	gint_t  bli_index;
	f77_int f77_index;

	/* If the vector is empty, return an index of zero. This early check
	   is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
	   return 0, which ends up getting incremented to 1 (below) before
	   being returned, which is not what we want. */
	if ( *n < 1 || *incx <= 0 ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "isamax_: vector empty");
		return 0;
	}

	/* Initialize BLIS. */
	// bli_init_auto();
	// NOTE(review): initialization is skipped here (presumably performed
	// elsewhere by the library entry points) -- confirm.

	/* Convert/typecast negative values of n to zero. */
	if ( *n < 0 ) n0 = ( dim_t )0;
	else          n0 = ( dim_t )(*n);

	/* If the input increments are negative, adjust the pointers so we can
	   use positive increments instead. */
	if ( *incx < 0 )
	{
		/* The semantics of negative stride in BLAS are that the vector
		   operand be traversed in reverse order. (Another way to think
		   of this is that negative strides effectively reverse the order
		   of the vector, but without any explicit data movements.) This
		   is also how BLIS interprets negative strides. The differences
		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
		   top-most or left-most) element of the vector, even when the
		   stride is negative. By contrast, in BLIS, negative strides are
		   used *relative* to the vector address as it is given. Thus, in
		   BLIS, if this backwards traversal is desired, the caller *must*
		   pass in the address to the (n-1)th (i.e., the bottom-most or
		   right-most) element along with a negative stride. */
		x0    = ((float*)x) + (n0-1)*(-*incx);
		incx0 = ( inc_t )(*incx);
	}
	else
	{
		x0    = ((float*)x);
		incx0 = ( inc_t )(*incx);
	}

	// This function is invoked on all architectures including generic.
	// Non-AVX platforms will use the kernels derived from the context.
	if (bli_cpuid_is_avx_supported() == TRUE)
	{
		/* Call BLIS kernel */
		bli_samaxv_zen_int
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL
		);
	}
	else
	{
		// Reference path: expanded typed API call (bli_samaxv_ex()).
		PASTEMAC2(s,amaxv,BLIS_TAPI_EX_SUF)
		(
		  n0,
		  x0, incx0,
		  &bli_index,
		  NULL,
		  NULL
		);
	}

	/* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
	   index. Also, if the BLAS integer size differs from the BLIS
	   integer size, that typecast occurs here. */
	f77_index = bli_index + 1;

	/* Finalize BLIS. */
	// bli_finalize_auto();

	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
	return f77_index;
}
f77_int idamax_
     (
       const f77_int* n,
       const double* x, const f77_int* incx
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
    AOCL_DTL_LOG_AMAX_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx);

    gint_t bli_index;
    f77_int f77_index;

    /* If the vector is empty, return an index of zero. This early check
       is needed to emulate netlib BLAS. Without it, bli_?amaxv() would
       return 0, which would get incremented to 1 (below) before being
       returned, which is not what we want. */
    if ( *n < 1 || *incx <= 0 ) {
        AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
        return 0;
    }

    /* The early return above guarantees *n >= 1 and *incx >= 1, so the
       usual negative-n clamping and negative-stride pointer adjustment
       performed by other BLAS wrappers are unreachable here; just
       typecast the values directly. */
    dim_t   n0    = ( dim_t )(*n);
    double* x0    = ( double* )x;
    inc_t   incx0 = ( inc_t )(*incx);

    /* BLIS initialization/finalization is intentionally skipped on this
       fast path. */
    // bli_init_auto();

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AVX-optimized BLIS kernel directly. */
        bli_damaxv_zen_int
        (
          n0,
          x0, incx0,
          &bli_index,
          NULL
        );
    }
    else
    {
        /* Fall back to the expert interface, which selects a kernel
           from the runtime context. */
        PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          &bli_index,
          NULL,
          NULL
        );
    }

    /* Convert the zero-based BLIS (C) index to a one-based BLAS (Fortran)
       index. If the BLAS integer size differs from the BLIS integer
       size, that typecast also occurs here. */
    f77_index = bli_index + 1;

    // bli_finalize_auto();
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    return f77_index;
}
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -87,411 +87,6 @@ void PASTEF77(ch,blasname) \
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void saxpy_
(
const f77_int* n,
const float* alpha,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
bli_saxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
void daxpy_
(
const f77_int* n,
const double* alpha,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
bli_daxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// bli_finalize_auto();
}
/* BLAS caxpy wrapper: y := alpha*x + y for single-precision complex vectors. */
void caxpy_
     (
       const f77_int* n,
       const scomplex* alpha,
       const scomplex* x, const f77_int* incx,
       scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    scomplex* xp = ( scomplex* )x;
    scomplex* yp = ( scomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    /* With dynamic dispatch (the 'amdzen' configuration) this routine
       runs on every architecture, including generic. Use the
       hand-written Zen kernel only on a known AMD Zen core; otherwise
       go through the framework, which derives kernels from the
       context. */
    const arch_t id = bli_arch_query_id();
    const bool on_zen = ( id == BLIS_ARCH_ZEN4 ) || ( id == BLIS_ARCH_ZEN3 ) ||
                        ( id == BLIS_ARCH_ZEN2 ) || ( id == BLIS_ARCH_ZEN );

    if ( on_zen )
    {
        bli_caxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
/* BLAS zaxpy wrapper: y := alpha*x + y for double-precision complex vectors. */
void zaxpy_
     (
       const f77_int* n,
       const dcomplex* alpha,
       const dcomplex* x, const f77_int* incx,
       dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    dcomplex* xp = ( dcomplex* )x;
    dcomplex* yp = ( dcomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    /* With dynamic dispatch (the 'amdzen' configuration) this routine
       runs on every architecture, including generic. Use the
       hand-written Zen kernel only on a known AMD Zen core; otherwise
       go through the framework, which derives kernels from the
       context. */
    const arch_t id = bli_arch_query_id();
    const bool on_zen = ( id == BLIS_ARCH_ZEN4 ) || ( id == BLIS_ARCH_ZEN3 ) ||
                        ( id == BLIS_ARCH_ZEN2 ) || ( id == BLIS_ARCH_ZEN );

    if ( on_zen )
    {
        bli_zaxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
#else
INSERT_GENTFUNC_BLAS( axpy, axpyv )
#endif
#endif

462
frame/compat/bla_axpy_amd.c Normal file
View File

@@ -0,0 +1,462 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC: template for the generic BLAS-to-BLIS ?axpy wrapper,
// instantiated once per datatype (ftype = C type, ch = type character,
// blasname = BLAS routine stem, blisname = BLIS operation name).
// Each expansion: clamps negative n to zero, converts BLAS
// negative-stride conventions to BLIS pointer/stride form via
// bli_convert_blas_incv, and invokes the expert-interface axpyv
// (which derives kernels from the runtime context).
// NOTE(review): kept byte-identical — comment additions only; the
// continuation backslashes inside the macro are load-bearing.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* alpha, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, (void*)alpha, *incx, *incy) \
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
(ftype*)alpha, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void saxpy_
(
const f77_int* n,
const float* alpha,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_saxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(float*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
void daxpy_
(
const f77_int* n,
const double* alpha,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_daxpyv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
(double*)alpha,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// bli_finalize_auto();
}
/* BLAS caxpy wrapper (AMD build): y := alpha*x + y, single-precision complex. */
void caxpy_
     (
       const f77_int* n,
       const scomplex* alpha,
       const scomplex* x, const f77_int* incx,
       scomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, (scomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    scomplex* xp = ( scomplex* )x;
    scomplex* yp = ( scomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        bli_caxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(c,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( scomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
/* BLAS zaxpy wrapper (AMD build): y := alpha*x + y, double-precision complex. */
void zaxpy_
     (
       const f77_int* n,
       const dcomplex* alpha,
       const dcomplex* x, const f77_int* incx,
       dcomplex* y, const f77_int* incy
     )
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, (dcomplex*)alpha, *incx, *incy)

    /* Initialization is intentionally skipped on this fast path. */
    // bli_init_auto();

    /* Per BLAS convention, a negative length is treated as zero. */
    const dim_t len = ( *n < 0 ) ? ( dim_t )0 : ( dim_t )(*n);

    /* BLAS callers always pass the address of the first (top-most)
       element, even for a negative stride; BLIS instead applies strides
       relative to the pointer it receives. So for a negative stride,
       advance the pointer to the last traversed element before handing
       it to BLIS. */
    const inc_t xinc = ( inc_t )(*incx);
    const inc_t yinc = ( inc_t )(*incy);
    dcomplex* xp = ( dcomplex* )x;
    dcomplex* yp = ( dcomplex* )y;
    if ( *incx < 0 ) xp += ( len - 1 ) * ( -*incx );
    if ( *incy < 0 ) yp += ( len - 1 ) * ( -*incy );

    // This function is invoked on all architectures including generic.
    // Non-AVX platforms will use the kernels derived from the context.
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        bli_zaxpyv_zen_int5
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL
        );
    }
    else
    {
        PASTEMAC2(z,axpyv,BLIS_TAPI_EX_SUF)
        (
          BLIS_NO_CONJUGATE,
          len,
          ( dcomplex* )alpha,
          xp, xinc,
          yp, yinc,
          NULL,
          NULL
        );
    }

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
    /* Finalization intentionally skipped, mirroring the skipped init. */
    // bli_finalize_auto();
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -88,217 +88,5 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void scopy_
(
const f77_int* n,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (float*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
void dcopy_
(
const f77_int* n,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (double*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
#else
INSERT_GENTFUNC_BLAS(copy, copyv)
#endif
#endif

285
frame/compat/bla_copy_amd.c Normal file
View File

@@ -0,0 +1,285 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC: macro template that instantiates one BLAS-to-BLIS ?copy
// wrapper per datatype.  Each expansion defines PASTEF77(ch,blasname)
// (e.g. ccopy_ / zcopy_), which:
//   1. logs entry via the AOCL DTL trace macros,
//   2. clamps a negative BLAS length n to zero (bli_convert_blas_dim1),
//   3. rebases x/y for negative BLAS increments (bli_convert_blas_incv),
//   4. forwards to the BLIS copyv expert interface (PASTEMAC2).
// In this AMD-specific file the template is instantiated only for the
// complex types (see INSERT_GENTFUNC_BLAS_CZ below); scopy_/dcopy_ are
// written out by hand to add AVX kernel dispatch.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy) \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \
bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
(\
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void scopy_
(
const f77_int* n,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (float*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(s, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
void dcopy_
(
const f77_int* n,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_COPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy)
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (double*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else
{
PASTEMAC2(d, copyv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -90,681 +90,11 @@ ftype PASTEF772(ch,blasname,chc) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
float sdot_
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
float rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
double ddot_
(
const f77_int* n,
const double* x, const f77_int* incx,
const double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else
INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
#endif
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
#ifdef BLIS_CONFIG_EPYC
scomplex cdotu_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
dcomplex zdotu_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
scomplex cdotc_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
dcomplex zdotc_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen)
{
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#endif
#else
// For the "intel" complex return type, use a hidden parameter to return the result
#undef GENTFUNCDOT
@@ -819,8 +149,8 @@ void PASTEF772(ch,blasname,chc) \
}
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#endif
#endif
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif // BLIS_ENABLE_BLAS
// -- "Black sheep" dot product function definitions --
@@ -894,4 +224,4 @@ double PASTEF77(d,sdot)
return rho;
}
#endif
#endif // BLIS_ENABLE_BLAS

841
frame/compat/bla_dot_amd.c Normal file
View File

@@ -0,0 +1,841 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCDOT: macro template that instantiates one BLAS-to-BLIS ?dot
// wrapper per (datatype, conjugation) pair.  Each expansion defines
// PASTEF772(ch,blasname,chc) (e.g. cdotu_/cdotc_), which:
//   1. logs entry via the AOCL DTL trace macros,
//   2. clamps a negative BLAS length n to zero (bli_convert_blas_dim1),
//   3. rebases x/y for negative BLAS increments (bli_convert_blas_incv),
//   4. forwards to the BLIS dotv expert interface with blis_conjx applied
//      to x, and returns the accumulated result rho by value.
// NOTE(review): the instantiation site is outside this view — presumably
// the INSERT_GENTFUNCDOT_* macros later in this file; verify there.
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
ftype PASTEF772(ch,blasname,chc) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
\
return rho; \
}
#ifdef BLIS_ENABLE_BLAS
float sdot_
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
float rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((float*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
double ddot_
(
const f77_int* n,
const double* x, const f77_int* incx,
const double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((double*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
scomplex cdotu_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
/* BLAS-to-BLIS bridge for ZDOTU: rho = x^T * y (no conjugation) for
   double-precision complex vectors.  Returns the result by value, i.e.
   the "gnu" complex-return convention (contrast with the hidden-parameter
   variant under BLIS_DISABLE_COMPLEX_RETURN_INTEL).  Dispatches to an
   AVX-optimized Zen kernel when the CPU supports AVX, otherwise to the
   generic BLIS typed API. */
dcomplex zdotu_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
/* Initialize BLIS.  (Deliberately skipped here; initialization is
   handled elsewhere, so the auto-init call is left commented out.) */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The difference
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
/* Same base-pointer adjustment for y. */
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel (AVX-optimized Zen dotv). */
bli_zdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface (typed expert API; context resolved internally). */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS.  (Skipped to mirror the skipped initialization above.) */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
scomplex cdotc_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *n, *incx, *incy);
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
/* BLAS-to-BLIS bridge for ZDOTC: rho = conj(x)^T * y for double-precision
   complex vectors, returned by value ("gnu" complex-return convention).
   Only difference from zdotu_ is that x is conjugated (BLIS_CONJUGATE). */
dcomplex zdotc_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *n, *incx, *incy);
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS.  (Deliberately skipped; see commented call.) */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The difference
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
/* Same base-pointer adjustment for y. */
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE)
{
/* Call BLIS kernel (AVX-optimized Zen dotv; x is conjugated). */
bli_zdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface (typed expert API). */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
}
/* Finalize BLIS.  (Skipped to mirror the skipped initialization.) */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#else // BLIS_DISABLE_COMPLEX_RETURN_INTEL
// For the "intel" complex return type, use a hidden parameter to return the result
/* Generator for the "intel" complex-return flavor of the ?dot functions:
   instead of returning the complex result by value, the result is written
   through a hidden leading pointer parameter (rhop).  Instantiated below
   for cdotc/cdotu/zdotc/zdotu via INSERT_GENTFUNCDOT_BLAS. */
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
void PASTEF772(ch,blasname,chc) \
( \
ftype* rhop, \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *n, *incx, *incy); \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
bli_finalize_auto(); \
\
*rhop = rho; \
}
/* Expand the template for all four complex dot variants. */
INSERT_GENTFUNCDOT_BLAS( dot, dotv )
#endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
// -- "Black sheep" dot product function definitions --
// Input vectors stored in single precision, computed in double precision,
// with result returned in single precision.
/* SDSDOT: dot product of two single-precision vectors, accumulated in
   double precision with scalar *sb added, result truncated back to
   single precision.  Delegates the accumulation to dsdot. */
float PASTEF77(sd,sdot)
(
const f77_int* n,
const float* sb,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
/* Seed the double-precision accumulator with *sb, add the
   double-precision dot product, then narrow to float on return. */
double acc = ( double )(*sb);
acc += PASTEF77(d,sdot)
(
n,
x, incx,
y, incy
);
return ( float )acc;
}
// Input vectors stored in single precision, computed in double precision,
// with result returned in double precision.
double PASTEF77(d,sdot)
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t i;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
/* Initialization of BLIS is not required. */
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
rho = 0.0;
for ( i = 0; i < n0; i++ )
{
float* chi1 = x0 + (i )*incx0;
float* psi1 = y0 + (i )*incy0;
bli_ddots( (( double )(*chi1)),
(( double )(*psi1)), rho );
}
/* Finalization of BLIS is not required, because initialization was
not required. */
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return rho;
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -300,512 +300,7 @@ void PASTEF77(ch,blasname) \
#endif
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
/* BLAS dgemm entry point (AMD/EPYC build).  Dispatch order:
   1. non-Zen CPU          -> generic object API (reference path)
   2. k == 1, no transposes -> specialized rank-1 update kernel
   3. n == 1 or m == 1      -> gemv fast paths
   4. multithreaded         -> parallel sup/native object API
   5. small matrices (ST)   -> bli_dgemm_small / _small_At
   6. sup path, else native gemm as final fallback. */
void dgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const double* alpha,
const double* a, const f77_int* lda,
const double* b, const f77_int* ldb,
const double* beta,
double* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(d),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1(*m, m0);
bli_convert_blas_dim1(*n, n0);
bli_convert_blas_dim1(*k, k0);
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// When dynamic dispatch is enabled i.e. library is built for amdzen configuration.
// This function is invoked on all architectures including generic.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (!bamdzen)
{
// This code is duplicated below, however we don't want to move it out of
// this IF block as it will affect the performance on Zen architectures.
// Also this is a temporary fix which will be replaced later.
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
/* Fast path: k == 1 with both operands untransposed is a rank-1 update. */
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
{
bli_dgemm_ref_k1_nn( m0, n0, k0,
(double*)alpha,
(double*)a, *lda,
(double*)b, *ldb,
(double*)beta,
c, *ldc
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS */
bli_finalize_auto();
return;
}
/* Fast path: a single column (n == 1) degenerates gemm to gemv. */
if (n0 == 1)
{
if (bli_is_notrans(blis_transa))
{
bli_dgemv_unf_var2(
BLIS_NO_TRANSPOSE,
bli_extract_conj(blis_transb),
m0, k0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var1(
blis_transa,
bli_extract_conj(blis_transb),
k0, m0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): this early return skips AOCL_DTL_TRACE_EXIT and
   bli_finalize_auto(), unlike the other exit paths — confirm intent. */
return;
}
/* Fast path: a single row (m == 1) degenerates gemm to gemv on b. */
else if (m0 == 1)
{
if (bli_is_notrans(blis_transb))
{
bli_dgemv_unf_var1(
blis_transb,
bli_extract_conj(blis_transa),
n0, k0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var2(
blis_transb,
bli_extract_conj(blis_transa),
k0, n0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): same skipped trace-exit/finalize as the n0 == 1 path. */
return;
}
/* General path: wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
//cntx_t* cntx = bli_gks_query_cntx();
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
//
#ifdef AOCL_DYNAMIC
if (nt && ((n0 > 10 ) || (k0 > 10)) )
#else
if (nt)
#endif
{
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#ifdef BLIS_ENABLE_SMALL_MATRIX
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
((n0 <= 10) && (k0 <=10)) )
{
err_t status;
if (bli_is_notrans(blis_transa))
{
status = bli_dgemm_small( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
else
{
/* Transposed-A variant of the small-matrix kernel. */
status = bli_dgemm_small_At ( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
/* Try the sup (skinny/unpacked) path; fall through on failure. */
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
/* NOTE(review): early return without trace-exit/finalize here too. */
return;
}
// fall back on native path when dgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */
/* ( */
/* &alphao, */
/* &ao, */
/* &bo, */
/* &betao, */
/* &co, */
/* NULL, */
/* NULL */
/* ); */
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
} // end of dgemm_
/* BLAS zgemm entry point.  Dispatch order:
   1. multithreaded        -> parallel sup/native object API
   2. (optional) 3m_sqp induced method for selected shapes
   3. single-instance mode -> sup path
   4. native gemm as the final fallback.
   Fixes relative to previous revision: bli_finalize_auto() and the DTL
   trace-exit were unreachable after the unconditional `return;` on the
   native fallback path, and the sup/induced early returns skipped
   finalization entirely — every exit now balances bli_init_auto(). */
void zgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
/* Wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DCOMPLEX;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// Default-instance performance tuning is done in zgemm.
// Single-instance tuning is selected via an environment variable.
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
if ( nt )
{
// Will call parallelized zgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#if ENABLE_INDUCED_METHOD
/* 3m_sqp is optimal for certain matrix shapes.
Initial study shows that it works well for square sizes and sizes close to square shape.
* Usage of 3m_sqp is restricted to sizes where it is found efficient compared to native, sup and other induced methods.
* Further investigation is necessary to make the usage choices more generic. */
bool sqp_on = false;
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
{
sqp_on = true;
}
// current range of sizes used for 3m_sqp to be expanded after evaluation.
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
&& ( k0 == 1120 ) ) //to be tuned further.
{
sqp_on = true;
}
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
{
//sqp algo is found better for n > 40
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif//ENABLE_INDUCED_METHOD
// native tuning resulted in better numbers compared to sup in constrained multi-instance
// sup has been enabled for single instance cases.
if(single_instance==1)
{
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if(status==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of zgemm_
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
#else
INSERT_GENTFUNC_BLAS( gemm,gemm )
#endif
// Observed a regression in dgemm with this function addition.
// Disabling temporarily.

894
frame/compat/bla_gemm_amd.c Normal file
View File

@@ -0,0 +1,894 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#define ENABLE_INDUCED_METHOD 0
#ifdef BLIS_BLAS3_CALLS_TAPI
/* Generator for simple gemm wrappers that call straight into the BLIS
   typed API (compiled when BLIS_BLAS3_CALLS_TAPI is defined).
   Fix: the AOCL_DTL_LOG_GEMM_STATS line was missing its trailing
   backslash, which terminated the #define early and left the remaining
   lines (trace-exit, finalize, closing brace) as stray top-level tokens
   — a compile error. */
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
inc_t rs_a, cs_a; \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
rs_a = 1; \
cs_a = *lda; \
rs_b = 1; \
cs_b = *ldb; \
rs_c = 1; \
cs_c = *ldc; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
blis_transb, \
m0, \
n0, \
k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, rs_b, cs_b, \
(ftype*)beta, \
(ftype*)c, rs_c, cs_c, \
NULL, \
NULL \
); \
\
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#else
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
\
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
\
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
if( n0 == 1 ) \
{ \
if(bli_is_notrans(blis_transa)) \
{ \
PASTEMAC(ch,gemv_unf_var2)( \
BLIS_NO_TRANSPOSE, \
bli_extract_conj(blis_transb), \
m0, k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a,\
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
(ftype*) beta, \
c, rs_c, \
NULL \
); \
} \
else \
{ \
PASTEMAC(ch,gemv_unf_var1)( \
blis_transa, \
bli_extract_conj(blis_transb), \
k0, m0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, bli_is_notrans(blis_transb)?rs_b:cs_b, \
(ftype*)beta, \
c, rs_c, \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
else if( m0 == 1 ) \
{ \
if(bli_is_notrans(blis_transb)) \
{ \
PASTEMAC(ch,gemv_unf_var1)( \
blis_transb, \
bli_extract_conj(blis_transa), \
n0, k0, \
(ftype*)alpha, \
(ftype*)b, cs_b, rs_b, \
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
(ftype*)beta, \
c, cs_c, \
NULL \
); \
} \
else \
{ \
PASTEMAC(ch,gemv_unf_var2)( \
blis_transb, \
bli_extract_conj(blis_transa), \
k0, n0, \
(ftype*)alpha, \
(ftype*)b, cs_b, rs_b, \
(ftype*)a, bli_is_notrans(blis_transa)?cs_a:rs_a, \
(ftype*)beta, \
c, cs_c, \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
\
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
\
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
\
bli_obj_set_conjtrans( blis_transa, &ao ); \
bli_obj_set_conjtrans( blis_transb, &bo ); \
\
PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
NULL, \
NULL \
); \
\
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
/* BLAS dgemm entry point (AMD-optimized copy in bla_gemm_amd.c).
   Dispatch order:
   1. no AVX support        -> generic object API (reference path)
   2. k == 1, no transposes -> specialized rank-1 update kernel
   3. n == 1 or m == 1      -> gemv fast paths
   4. multithreaded         -> parallel sup/native object API
   5. small matrices (ST)   -> bli_dgemm_small / _small_At
   6. sup path, else native gemm as final fallback.
   Fix relative to previous revision: the gemv fast paths and the
   sup-success path returned without AOCL_DTL_TRACE_EXIT and without
   bli_finalize_auto(), leaving the bli_init_auto() call unbalanced;
   every exit path now traces and finalizes. */
void dgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const double* alpha,
const double* a, const f77_int* lda,
const double* b, const f77_int* ldb,
const double* beta,
double* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(d),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1(*m, m0);
bli_convert_blas_dim1(*n, n0);
bli_convert_blas_dim1(*k, k0);
/* Set the row and column strides of the matrix operands
   (column-major storage, per BLAS). */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
// This code is duplicated below, however we don't want to move it out of
// this IF block as it will affect the performance on Zen architectures.
// Also this is a temporary fix which will be replaced later.
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double *)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double *)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double *)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double *)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double *)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
/* Fast path: k == 1 with both operands untransposed is a rank-1 update. */
if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
{
bli_dgemm_ref_k1_nn( m0, n0, k0,
(double*)alpha,
(double*)a, *lda,
(double*)b, *ldb,
(double*)beta,
c, *ldc
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS */
bli_finalize_auto();
return;
}
/* Fast path: a single column of C degenerates gemm to gemv. */
if (n0 == 1)
{
if (bli_is_notrans(blis_transa))
{
bli_dgemv_unf_var2(
BLIS_NO_TRANSPOSE,
bli_extract_conj(blis_transb),
m0, k0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var1(
blis_transa,
bli_extract_conj(blis_transb),
k0, m0,
(double*)alpha,
(double*)a, rs_a, cs_a,
(double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
(double*)beta,
c, rs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
/* Fast path: a single row of C degenerates gemm to gemv on b. */
else if (m0 == 1)
{
if (bli_is_notrans(blis_transb))
{
bli_dgemv_unf_var1(
blis_transb,
bli_extract_conj(blis_transa),
n0, k0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
else
{
bli_dgemv_unf_var2(
blis_transb,
bli_extract_conj(blis_transa),
k0, n0,
(double*)alpha,
(double*)b, cs_b, rs_b,
(double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
(double*)beta,
c, cs_c,
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
/* General path: wrap operands in BLIS objects for the object API. */
const num_t dt = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans(blis_transa, m0, k0, &m0_a, &n0_a);
bli_set_dims_with_trans(blis_transb, k0, n0, &m0_b, &n0_b);
bli_obj_init_finish_1x1(dt, (double*)alpha, &alphao);
bli_obj_init_finish_1x1(dt, (double*)beta, &betao);
bli_obj_init_finish(dt, m0_a, n0_a, (double*)a, rs_a, cs_a, &ao);
bli_obj_init_finish(dt, m0_b, n0_b, (double*)b, rs_b, cs_b, &bo);
bli_obj_init_finish(dt, m0, n0, (double*)c, rs_c, cs_c, &co);
bli_obj_set_conjtrans(blis_transa, &ao);
bli_obj_set_conjtrans(blis_transb, &bo);
//cntx_t* cntx = bli_gks_query_cntx();
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
// if m0 is large and (n0 & k0) < 10 - SMALL GEMM - ST is better
//
#ifdef AOCL_DYNAMIC
if (nt && ((n0 > 10 ) || (k0 > 10)) )
#else
if (nt)
#endif
{
// Will call parallelized dgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#ifdef BLIS_ENABLE_SMALL_MATRIX
//if( ((m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) && (n0 > 2))
if( ( ( (m0 + n0 -k0) < 2000) && ((m0 + k0-n0) < 2000) && ((n0 + k0-m0) < 2000) ) ||
((n0 <= 10) && (k0 <=10)) )
{
err_t status;
if (bli_is_notrans(blis_transa))
{
status = bli_dgemm_small( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
else
{
/* Transposed-A variant of the small-matrix kernel. */
status = bli_dgemm_small_At ( &alphao,
&ao,
&bo,
&betao,
&co,
NULL, //cntx,
NULL
);
}
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
/* Try the sup (skinny/unpacked) path; fall through on failure. */
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if (status == BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS (was previously skipped on this early return). */
bli_finalize_auto();
return;
}
// fall back on native path when dgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
} // end of dgemm_
// BLAS-compatible zgemm_: C := beta*C + alpha*op(A)*op(B) for double-complex
// operands in column-major storage.
//
// Dispatch order:
//   1. multi-threaded invocation -> expert object API (sup + native inside),
//   2. (optional) 3m_sqp induced method for tuned shapes,
//   3. sup path, but only when BLIS_SINGLE_INSTANCE=1,
//   4. native path as the final fallback.
//
// Fix vs. previous revision: the native-fallback tail contained an
// unconditional `return;` that made the trailing AOCL_DTL_TRACE_EXIT and
// bli_finalize_auto() unreachable, leaving the bli_init_auto() above
// unbalanced on that path. The dead return (and duplicated trace-exit)
// has been removed so finalization actually runs.
void zgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands. */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
const num_t dt = BLIS_DCOMPLEX;
/* Wrap scalars and matrices in BLIS objects for the object APIs below. */
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
bli_obj_init_finish( dt, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// default instance peformance tuning is done in zgemm.
// Single instance tuning is done based on env set.
dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
//dim_t nt = bli_thread_get_num_threads(); // get number of threads
bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
if ( nt )
{
// Will call parallelized zgemm code - sup & native
PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
(
&alphao,
&ao,
&bo,
&betao,
&co,
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#if ENABLE_INDUCED_METHOD
/* 3m_sqp is optimal for certain matrix shapes.
Initial study that it works well for square sizes and sizes closer to square shape.
* Usage of 3m_sqp is restricted to sizes, where it is found efficient compared to native, sup and other induced method.
* Further investigation is necessary to make the usage choices more generic. */
bool sqp_on = false;
if( (m0 == n0 ) && ( n0 == k0 ) && ( m0 == 128 ) )
{
sqp_on = true;
}
// current range of sizes used for 3m_sqp to be expaned after evaluation.
if( ( m0 >= 4200) && ( m0 <= 4600 ) && ( ( n0 >= 326 ) || (n0 <= 1600 ) )
&& ( k0 == 1120 ) ) //to be tuned further.
{
sqp_on = true;
}
if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) )
{
//sqp algo is found better for n > 40
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
#endif//ENABLE_INDUCED_METHOD
// native tuning resulted in better numbers compared to sup in constrained multi-instance
// sup has been enabled for single instance cases.
if(single_instance==1)
{
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if(status==BLIS_SUCCESS)
{
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. (Previously unreachable due to a stray `return;`.) */
bli_finalize_auto();
}// end of zgemm_
// Instantiate the generic BLAS gemm wrappers for the remaining types
// (s and c) via the GENTFUNC template defined earlier in this file.
INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
// Observed a regression in dgemm with this function addition.
// Disabling temporarily.
#if 0
// Mixed-domain gemm: A is real double, B and C are double-complex.
// Compiled out (see note above); when enabled it routes straight to the
// native path with no sup/small-matrix dispatch.
void dzgemm_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const dcomplex* alpha,
const double* a, const f77_int* lda,
const dcomplex* b, const f77_int* ldb,
const dcomplex* beta,
dcomplex* c, const f77_int* ldc
)
{
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
(
MKSTR(z),
MKSTR(gemm),
transa,
transb,
m,
n,
k,
lda,
ldb,
ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
/* Typecast BLAS integers to BLIS integers. */
bli_convert_blas_dim1( *m, m0 );
bli_convert_blas_dim1( *n, n0 );
bli_convert_blas_dim1( *k, k0 );
/* Set the row and column strides of the matrix operands. */
const inc_t rs_a = 1;
const inc_t cs_a = *lda;
const inc_t rs_b = 1;
const inc_t cs_b = *ldb;
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// B/C use the complex datatype; A is typed as real double (dt_a).
const num_t dt = BLIS_DCOMPLEX;
const num_t dt_a = BLIS_DOUBLE;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t ao = BLIS_OBJECT_INITIALIZER;
obj_t bo = BLIS_OBJECT_INITIALIZER;
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
obj_t co = BLIS_OBJECT_INITIALIZER;
dim_t m0_a, n0_a;
dim_t m0_b, n0_b;
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
bli_obj_init_finish_1x1( dt, (dcomplex*)alpha, &alphao );
bli_obj_init_finish_1x1( dt, (dcomplex*)beta, &betao );
// NOTE(review): `a` is const double* and `ao` is typed dt_a (BLIS_DOUBLE);
// the (dcomplex*) cast below looks wrong — harmless since the buffer is
// stored as void*, but (double*) would be correct. Confirm before
// re-enabling this function.
bli_obj_init_finish( dt_a, m0_a, n0_a, (dcomplex*)a, rs_a, cs_a, &ao );
bli_obj_init_finish( dt, m0_b, n0_b, (dcomplex*)b, rs_b, cs_b, &bo );
bli_obj_init_finish( dt, m0, n0, (dcomplex*)c, rs_c, cs_c, &co );
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of dzgemm_
#endif
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -147,856 +147,5 @@ void PASTEF77(ch,blasname) \
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
// BLAS-compatible dgemv_: y := beta*y + alpha*op(A)*x, double precision,
// column-major A. Normalizes BLAS arguments (trans char, negative dims,
// negative increments), emulates BLAS's quirky early-return cases, then
// dispatches: reference/context path on non-Zen hardware, otherwise the
// hand-optimized unblocked variants (var2 for no-transpose, var1 otherwise).
void dgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const double* alpha,
const double* a, const f77_int* lda,
const double* x, const f77_int* incx,
const double* beta,
double* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(d),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((double*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
//variant_2 is chosen for column-storage
// and uses axpyf-based implementation
bli_dgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
else
{
//var_1 is chosen for row-storage
//and uses dotxf-based implementation
bli_dgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible sgemv_: y := beta*y + alpha*op(A)*x, single precision.
// Mirrors dgemv_ above: argument normalization, BLAS bug-for-bug early
// returns, then Zen-family runtime dispatch between optimized unblocked
// variants and the reference/context path.
void sgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const float* alpha,
const float* a, const f77_int* lda,
const float* x, const f77_int* incx,
const float* beta,
float* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(s),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((float*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
// axpyf-based variant for the no-transpose (column-access) case.
bli_sgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
else
{
// dotxf-based variant for the transpose (row-access) case.
bli_sgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible cgemv_: y := beta*y + alpha*op(A)*x, single-precision
// complex. Same structure as dgemv_/sgemv_, plus a fast path when the
// result vector has a single element (m_y == 1): the gemv collapses to a
// dot product, computed via an optimized dotv kernel on Zen hardware.
void cgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const scomplex* alpha,
const scomplex* a, const f77_int* lda,
const scomplex* x, const f77_int* incx,
const scomplex* beta,
scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(c),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((scomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
// Single-element result: reduce gemv to a dot product,
// then y[0] = beta*y[0] + alpha*rho by hand.
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
scomplex rho;
if (bamdzen)
{
bli_cdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
scomplex yval = *y0;
if(!bli_ceq0(*beta))
{
bli_cscals( *beta, yval );
}
else
{
// beta == 0: overwrite rather than scale (avoids NaN propagation).
bli_csetsc( 0.0, 0.0, &yval);
}
if(!bli_ceq0(*alpha))
{
bli_caxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_cgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_cgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
// BLAS-compatible zgemv_: y := beta*y + alpha*op(A)*x, double-precision
// complex. Structurally identical to cgemv_ above, including the
// single-element (m_y == 1) dot-product fast path.
void zgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* x, const f77_int* incx,
const dcomplex* beta,
dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(z),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
// Quick return for degenerate dimensions.
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((dcomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
// When dynamic dispatch is enabled i.e. library is built for amdzen
// configuration, this function is invoked on all architectures including
// generic. Invoke the architecture-specific kernels only if we are sure
// that we are running on zen, zen2, zen3 or zen4; otherwise fall back to
// reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
// Single-element result: reduce gemv to a dot product,
// then y[0] = beta*y[0] + alpha*rho by hand.
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
dcomplex rho;
if (bamdzen)
{
bli_zdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
dcomplex yval = *y0;
if(!bli_zeq0(*beta))
{
bli_zscals( *beta, yval );
}
else
{
// beta == 0: overwrite rather than scale (avoids NaN propagation).
bli_zsetsc( 0.0, 0.0, &yval);
}
if(!bli_zeq0(*alpha))
{
bli_zaxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
if (bamdzen == 0)
{
/* Call BLIS interface. */
PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_zgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_zgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
#else
// Non-EPYC builds: instantiate the generic gemv wrappers for all types.
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
#endif

963
frame/compat/bla_gemv_amd.c Normal file
View File

@@ -0,0 +1,963 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template that emits a BLAS-style gemv wrapper (sgemv_/dgemv_/cgemv_/zgemv_)
// for element type `ftype` with type char `ch`: it validates the BLAS
// arguments, normalizes trans/dimensions/increments (including BLAS's
// bug-for-bug quick-return quirks), and forwards to the BLIS typed expert
// API, leaving kernel selection to the runtime context.
// NOTE: inside the macro body only /* */ comments are safe — a // comment
// would swallow the backslash line continuations.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy); \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
inc_t rs_a, cs_a; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
\
if (*m == 0 || *n == 0) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return; \
} \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
   if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
   in a peculiar way. In these situations, BLAS returns without performing
   any action, even though most sane interpretations of gemv would have the
   the operation reduce to y := beta * y. Here, we catch those cases that
   BLAS would normally mishandle and emulate the BLAS exactly so as to
   provide "bug-for-bug" compatibility. Note that this extreme level of
   compatibility would not be as much of an issue if it weren't for the
   fact that some BLAS test suites actually test for these cases. Also, it
   should be emphasized that BLIS, if called natively, does NOT exhibit
   this quirky behavior; it will scale y by beta, as one would expect. */ \
if ( m_y > 0 && n_x == 0 ) \
{ \
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
\
	return; \
} \
\
/* If the input increments are negative, adjust the pointers so we can
   use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
/* Set the row and column strides of A. */ \
rs_a = 1; \
cs_a = *lda; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
  blis_transa, \
  BLIS_NO_CONJUGATE, \
  m0, \
  n0, \
  (ftype*)alpha, \
  (ftype*)a, rs_a, cs_a, \
  x0, incx0, \
  (ftype*)beta, \
  y0, incy0, \
  NULL, \
  NULL  \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
/* BLAS dgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y, with op(A) selected by
   transa ('N', 'T', or 'C').  On CPUs without AVX support it forwards
   to the context-driven BLIS typed API; on AVX-capable CPUs it calls
   the unblocked BLIS gemv variants directly. */
void dgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const double* alpha,
const double* a, const f77_int* lda,
const double* x, const f77_int* incx,
const double* beta,
double* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(d),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value.
   An unrecognized transa falls back to no-transpose rather than
   aborting (error reporting is left to the parameter check above). */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. (BLAS passes the first element even
for negative strides; BLIS expects the address of the last element.) */
if ( *incx < 0 )
{
x0 = ((double*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. A is column-stored per BLAS:
   unit row stride, column stride equal to the leading dimension. */
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
//variant_2 is chosen for column-storage
// and uses axpyf-based implementation
bli_dgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
else
{
//var_1 is chosen for row-storage
//and uses dotxf-based implementation
bli_dgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS sgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for single precision.
   Structure mirrors dgemv_ above: non-AVX CPUs go through the
   context-driven typed API; AVX CPUs call the unblocked variants. */
void sgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const float* alpha,
const float* a, const f77_int* lda,
const float* x, const f77_int* incx,
const float* beta,
float* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(s),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value.
   Unrecognized transa falls back to no-transpose (see dgemv_). */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) )
{
m_y = m0;
n_x = n0;
}
else
{
m_y = n0;
n_x = m0;
}
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((float*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Call variants based on transpose value: var2 (axpyf-based) for
   no-transpose, var1 (dotxf-based) otherwise. */
if(bli_does_notrans(blis_transa))
{
bli_sgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
else
{
bli_sgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS cgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for single-precision
   complex.  In addition to the AVX/non-AVX dispatch used by
   dgemv_/sgemv_, this routine has a fast path for the case where
   op(A) has exactly one row, reducing gemv to a dot product. */
void cgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const scomplex* alpha,
const scomplex* a, const f77_int* lda,
const scomplex* x, const f77_int* incx,
const scomplex* beta,
scomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(c),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((scomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
/* Fast path: op(A) has exactly one row, so gemv reduces to
   rho = op(A) * x (a dot product over n_x elements, with the
   conjugation implied by transa), followed by the scalar update
   y[0] := beta * y[0] + alpha * rho. */
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
scomplex rho;
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_cdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(c,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(scomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
/* Combine rho into y[0]. If beta is zero, y[0] is overwritten with
   zero (not scaled); if alpha is zero, rho is not accumulated. */
scomplex yval = *y0;
if(!bli_ceq0(*beta))
{
bli_cscals( *beta, yval );
}
else
{
bli_csetsc( 0.0, 0.0, &yval);
}
if(!bli_ceq0(*alpha))
{
bli_caxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_cgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_cgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(scomplex*)alpha,
(scomplex*)a, rs_a, cs_a,
x0, incx0,
(scomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
/* BLAS zgemv compatibility wrapper (AMD-optimized source file):
   computes y := alpha * op(A) * x + beta * y for double-precision
   complex.  Mirrors cgemv_ above, including the single-row
   dot-product fast path and the AVX/non-AVX dispatch. */
void zgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const dcomplex* alpha,
const dcomplex* a, const f77_int* lda,
const dcomplex* x, const f77_int* incx,
const dcomplex* beta,
dcomplex* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_GEMV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *m, *n, (void*)alpha, *lda, *incx, (void*)beta, *incy);
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(z),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Quick return for empty dimensions, mirroring reference BLAS. */
if (*m == 0 || *n == 0)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if( * transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
// bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if( *m < 0 ) m0 = (dim_t)0;
else m0 = (dim_t)(*m);
if( *n < 0 ) n0 = (dim_t)0;
else n0 = (dim_t)(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if( bli_does_notrans( blis_transa ) ) { m_y = m0, n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if( *incx < 0 )
{
x0 = ((dcomplex*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = (inc_t)(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A (column-stored per BLAS). */
rs_a = 1;
cs_a = *lda;
/* Fast path: op(A) has exactly one row, so gemv reduces to
   rho = op(A) * x (a dot product over n_x elements, with the
   conjugation implied by transa), followed by the scalar update
   y[0] := beta * y[0] + alpha * rho. */
if( m_y == 1 )
{
conj_t conja = bli_extract_conj(blis_transa);
dcomplex rho;
if (bli_cpuid_is_avx_supported() == TRUE)
{
bli_zdotv_zen_int5
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL
);
}
else
{
/* Call BLIS interface. */
PASTEMAC2(z,dotv,BLIS_TAPI_EX_SUF)
(
conja,
BLIS_NO_CONJUGATE,
n_x,
(dcomplex*)a, bli_is_notrans(blis_transa)?cs_a:rs_a,
x0, incx0,
&rho,
NULL,
NULL
);
}
/* Combine rho into y[0]. If beta is zero, y[0] is overwritten with
   zero (not scaled); if alpha is zero, rho is not accumulated. */
dcomplex yval = *y0;
if(!bli_zeq0(*beta))
{
bli_zscals( *beta, yval );
}
else
{
bli_zsetsc( 0.0, 0.0, &yval);
}
if(!bli_zeq0(*alpha))
{
bli_zaxpys( *alpha, rho, yval);
}
y0->real = yval.real;
y0->imag = yval.imag;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
/* Call BLIS interface. */
PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL,
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* call variants based on transpose value */
if( bli_does_notrans( blis_transa ) )
{
bli_zgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
else
{
bli_zgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(dcomplex*)alpha,
(dcomplex*)a, rs_a, cs_a,
x0, incx0,
(dcomplex*)beta,
y0, incy0,
NULL
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
}
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -93,179 +93,5 @@ void PASTEF772(chx,cha,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void sscal_
(
const f77_int* n,
const float* alpha,
float* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
dim_t n0;
float* x0;
inc_t incx0;
/* Initialize BLIS. */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float *)alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(float *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dscal_
(
const f77_int* n,
const double* alpha,
double* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
dim_t n0;
double* x0;
inc_t incx0;
/* Initialize BLIS */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen){
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*) alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(double *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
#else
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
#endif
#endif

260
frame/compat/bla_scal_amd.c Normal file
View File

@@ -0,0 +1,260 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
/* GENTFUNCSCAL: template for the generic BLAS ?scal compatibility
   wrapper, x := alpha * x.  ftype_x/chx describe the vector type and
   ftype_a/cha the alpha type, so mixed-type cases (csscal/zdscal) can
   be generated as well.  In this file the template is instantiated
   for the complex types via INSERT_GENTFUNCSCAL_BLAS_CZ below;
   sscal_/dscal_ are hand-written with AVX-aware dispatch instead. */
#undef GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
\
void PASTEF772(chx,cha,blasname) \
( \
const f77_int* n, \
const ftype_a* alpha, \
ftype_x* x, const f77_int* incx \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype_x* x0; \
inc_t incx0; \
ftype_x alpha_cast; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
if (*n == 0 || alpha == NULL) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return ; \
} \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
that is, we just always sub-optimally implement those cases
by casting alpha to ctype_x (potentially the complex domain) and
using the homogeneous datatype instance according to that type. */ \
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
&alpha_cast, \
x0, incx0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
void sscal_
(
const f77_int* n,
const float* alpha,
float* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
dim_t n0;
float* x0;
inc_t incx0;
/* Initialize BLIS. */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE) {
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(float *)alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(float *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dscal_
(
const f77_int* n,
const double* alpha,
double* x, const f77_int* incx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
dim_t n0;
double* x0;
inc_t incx0;
/* Initialize BLIS */
//bli_init_auto();
if (*n == 0 || alpha == NULL) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
/* Convert typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == TRUE){
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n0,
(double*) alpha,
x0, incx0,
NULL
);
}
else{
PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE,\
n0, \
(double *)alpha,\
x0, incx0,\
NULL, \
NULL \
);\
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-21, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -83,198 +83,5 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_EPYC
void sswap_
(
const f77_int* n,
float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = (y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = (y);
incy0 = ( inc_t )(*incy);
}
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
/* Call BLIS kernel */
bli_sswapv_zen_int8
(
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else{
PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
void dswap_
(
const f77_int* n,
double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = (y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = (y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel */
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (bamdzen) {
bli_dswapv_zen_int8
(
n0,
x0, incx0,
y0, incy0,
NULL
);
}
else{
PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
#else
INSERT_GENTFUNC_BLAS( swap, swapv )
#endif
#endif

268
frame/compat/bla_swap_amd.c Normal file
View File

@@ -0,0 +1,268 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC instantiates the generic ?swap_ BLAS wrapper for one datatype
// (ftype/ch): it converts the Fortran-style pointer arguments to BLIS
// types via bli_convert_blas_dim1/bli_convert_blas_incv, then calls the
// BLIS expert typed API (PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF)).
// In this AMD-specific file the template is used only for the datatypes
// that have no hand-written kernel dispatch below (instantiated for the
// complex types via INSERT_GENTFUNC_BLAS_CZ at the end of the file).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
/* BLAS sswap_ wrapper (AMD build): swap the n elements of the
   single-precision vectors x (stride incx) and y (stride incy).

   On CPUs that support AVX the call is routed directly to the
   hand-optimized Zen kernel; on all other architectures (including
   generic) it falls back to the swapv kernel obtained from the
   runtime context. */
void sswap_
(
 const f77_int* n,
 float* x, const f77_int* incx,
 float* y, const f77_int* incy
)
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, *incx, *incy);
    dim_t  n0;
    float* x0;
    float* y0;
    inc_t  incx0;
    inc_t  incy0;

    /* BLIS initialization is intentionally skipped on this fast path;
       it is expected to have happened at library level already. */
    // bli_init_auto();

    /* Convert/typecast negative values of n to zero. */
    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* BLAS negative-stride semantics: the caller always passes the 0th
       (top-most/left-most) element, and a negative stride means the
       vector is traversed in reverse.  BLIS instead expects the address
       of the (n-1)th element together with the (still negative) stride,
       so advance the base pointer to the last element while keeping the
       stride exactly as given. */
    if ( *incx < 0 )
    {
        x0    = (x) + (n0-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = (x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = (y) + (n0-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = (y);
        incy0 = ( inc_t )(*incy);
    }

    /* This function is invoked on all architectures including generic.
       Non-AVX platforms use the kernel derived from the context. */
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AMD Zen-optimized BLIS kernel directly. */
        bli_sswapv_zen_int8
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL
        );
    }
    else
    {
        /* Reference path via the expert typed API. */
        PASTEMAC2(s,swapv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL,
          NULL
        );
    }

    /* Finalization is likewise deferred to library teardown. */
    // bli_finalize_auto();

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
/* BLAS dswap_ wrapper (AMD build): swap the n elements of the
   double-precision vectors x (stride incx) and y (stride incy).

   On CPUs that support AVX the call is routed directly to the
   hand-optimized Zen kernel; on all other architectures (including
   generic) it falls back to the swapv kernel obtained from the
   runtime context. */
void dswap_
(
 const f77_int* n,
 double* x, const f77_int* incx,
 double* y, const f77_int* incy
)
{
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SWAP_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
    dim_t   n0;
    double* x0;
    double* y0;
    inc_t   incx0;
    inc_t   incy0;

    /* BLIS initialization is intentionally skipped on this fast path;
       it is expected to have happened at library level already. */
    // bli_init_auto();

    /* Convert/typecast negative values of n to zero. */
    if ( *n < 0 ) n0 = ( dim_t )0;
    else          n0 = ( dim_t )(*n);

    /* BLAS negative-stride semantics: the caller always passes the 0th
       (top-most/left-most) element, and a negative stride means the
       vector is traversed in reverse.  BLIS instead expects the address
       of the (n-1)th element together with the (still negative) stride,
       so advance the base pointer to the last element while keeping the
       stride exactly as given. */
    if ( *incx < 0 )
    {
        x0    = (x) + (n0-1)*(-*incx);
        incx0 = ( inc_t )(*incx);
    }
    else
    {
        x0    = (x);
        incx0 = ( inc_t )(*incx);
    }

    if ( *incy < 0 )
    {
        y0    = (y) + (n0-1)*(-*incy);
        incy0 = ( inc_t )(*incy);
    }
    else
    {
        y0    = (y);
        incy0 = ( inc_t )(*incy);
    }

    /* This function is invoked on all architectures including generic.
       Non-AVX platforms use the kernel derived from the context. */
    if ( bli_cpuid_is_avx_supported() == TRUE )
    {
        /* Call the AMD Zen-optimized BLIS kernel directly. */
        bli_dswapv_zen_int8
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL
        );
    }
    else
    {
        /* Reference path via the expert typed API. */
        PASTEMAC2(d,swapv,BLIS_TAPI_EX_SUF)
        (
          n0,
          x0, incx0,
          y0, incy0,
          NULL,
          NULL
        );
    }

    /* Finalization is likewise deferred to library teardown. */
    // bli_finalize_auto();

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
}
/* Instantiate the generic GENTFUNC template for the complex datatypes
   (c and z), which have no AMD-specific kernel dispatch in this file. */
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
#endif // BLIS_ENABLE_BLAS

File diff suppressed because it is too large Load Diff

1544
frame/compat/bla_trsm_amd.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -64,16 +64,7 @@ void bli_sscalv_zen_int10
if ( PASTEMAC(s,eq0)( *alpha ) )
{
float* zero = bli_s0;
#ifdef BLIS_CONFIG_EPYC
bli_ssetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
f
(
@@ -83,7 +74,7 @@ void bli_sscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}
@@ -342,16 +333,7 @@ void bli_dscalv_zen_int10
if ( PASTEMAC(d,eq0)( *alpha ) )
{
double* zero = bli_d0;
#ifdef BLIS_CONFIG_EPYC
bli_dsetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
f
@@ -362,7 +344,7 @@ void bli_dscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -95,29 +95,6 @@ void bli_caxpyf_zen_int_4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
scomplex* a1 = a + (0 )*inca + (i )*lda;
scomplex* chi1 = x + (i )*incx;
scomplex* y1 = y + (0 )*incy;
scomplex alpha_chi1;
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
bli_cscals( *alpha, alpha_chi1 );
bli_caxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -141,7 +118,6 @@ void bli_caxpyf_zen_int_4
);
}
#endif
return;
}
@@ -357,28 +333,6 @@ void bli_zaxpyf_zen_int_4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
dcomplex* a1 = a + (0 )*inca + (i )*lda;
dcomplex* chi1 = x + (i )*incx;
dcomplex* y1 = y + (0 )*incy;
dcomplex alpha_chi1;
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
bli_zscals( *alpha, alpha_chi1 );
bli_zaxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -402,7 +356,6 @@ void bli_zaxpyf_zen_int_4
);
}
#endif
return;
}

View File

@@ -108,29 +108,6 @@ void bli_saxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
float* a1 = a + (0 )*inca + (i )*lda;
float* chi1 = x + (i )*incx;
float* y1 = y + (0 )*incy;
float alpha_chi1;
bli_scopycjs( conjx, *chi1, alpha_chi1 );
bli_sscals( *alpha, alpha_chi1 );
bli_saxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -154,7 +131,6 @@ void bli_saxpyf_zen_int_5
);
}
#endif
return;
}
@@ -382,29 +358,6 @@ void bli_daxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -428,7 +381,6 @@ void bli_daxpyf_zen_int_5
);
}
#endif
return;
}
@@ -655,29 +607,6 @@ static void bli_daxpyf_zen_int_16x2
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -701,7 +630,6 @@ static void bli_daxpyf_zen_int_16x2
);
}
#endif
return;
}
@@ -966,43 +894,21 @@ void bli_daxpyf_zen_int_16x4
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
if(b_n & 2)
{
bli_daxpyf_zen_int_16x2( conja,
conjx,
m, 2,
alpha, a, inca, lda,
x, incx,
y, incy,
cntx
);
b_n -= 2;
a += 2*lda;
x += 2 * incx;
}
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
if (b_n & 2)
{
bli_daxpyf_zen_int_16x2( conja,
conjx,
m, 2,
alpha, a, inca, lda,
x, incx,
y, incy,
cntx
);
b_n -= 2;
a += 2*lda;
x += 2 * incx;
}
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1026,7 +932,6 @@ void bli_daxpyf_zen_int_16x4
);
}
#endif
return;
}
@@ -1396,29 +1301,6 @@ void bli_caxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
scomplex* a1 = a + (0 )*inca + (i )*lda;
scomplex* chi1 = x + (i )*incx;
scomplex* y1 = y + (0 )*incy;
scomplex alpha_chi1;
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
bli_cscals( *alpha, alpha_chi1 );
bli_caxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1442,7 +1324,6 @@ void bli_caxpyf_zen_int_5
);
}
#endif
return;
}
@@ -1810,29 +1691,6 @@ void bli_zaxpyf_zen_int_5
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
dcomplex* a1 = a + (0 )*inca + (i )*lda;
dcomplex* chi1 = x + (i )*incx;
dcomplex* y1 = y + (0 )*incy;
dcomplex alpha_chi1;
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
bli_zscals( *alpha, alpha_chi1 );
bli_zaxpyv_zen_int5
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -1855,8 +1713,7 @@ void bli_zaxpyf_zen_int_5
cntx
);
}
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -97,28 +97,6 @@ void bli_saxpyf_zen_int_6
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_EPYC
for ( i = 0; i < b_n; ++i )
{
float* a1 = a + (0 )*inca + (i )*lda;
float* chi1 = x + (i )*incx;
float* y1 = y + (0 )*incy;
float alpha_chi1;
bli_scopycjs( conjx, *chi1, alpha_chi1 );
bli_sscals( *alpha, alpha_chi1 );
bli_saxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
@@ -141,7 +119,7 @@ void bli_saxpyf_zen_int_6
cntx
);
}
#endif
return;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -114,16 +114,9 @@ err_t bli_gemm_small
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
return BLIS_NOT_YET_IMPLEMENTED;
#else
// When dynamic dispatch is enabled i.e. library is built for 'amdzen' configuration.
// Invoke architecture specific kernels only if we are sure that we are running on zen,
// zen2 or zen3 otherwise fall back to reference kernels (via framework and context).
arch_t id = bli_arch_query_id();
bool bamdzen = (id == BLIS_ARCH_ZEN4) ||
(id == BLIS_ARCH_ZEN3) ||
(id == BLIS_ARCH_ZEN2) ||
(id == BLIS_ARCH_ZEN);
if (0 == bamdzen)
// This function is invoked on all architectures including generic.
// Non-AVX platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx_supported() == FALSE)
{
return BLIS_NOT_YET_IMPLEMENTED;
}