Merge branch 'amd' into rt

Details: - Merged contributions made by AMD via 'amd' branch (see summary below). Special thanks to AMD for their contributions to-date, especially with regard to intrinsic- and assembly-based kernels. - Added column storage output cases to microkernels in bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with the extra cost of transposing the microtile in registers, this is much faster than using the general storage case when the underlying matrix is column-stored. - Added s and d assembly-based zen gemmtrsm_u microkernel (including column storage optimization mentioned above). - Updated zen sub-configuration to reflect presence of new native kernels. - Temporarily reverted zen sub-configuration's level-3 cache blocksizes to smaller haswell values. - Temporarily disabled small matrix handling for zen configuration family in config/zen/bli_family_zen.h. - Updated zen CFLAGS according to changes in 1e4365b. - Updated haswell microkernels such that: - only one vzeroupper instruction is called prior to returning - movapd/movupd are used in leiu of movaps/movups for double-real microkernels. (Note that single-real microkernels still use movaps/movups.) - Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is now included via frame/include/bli_arch_config.h. - Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation in testsuite/src/test_amaxv.c). - Added early return for alpha == 0 in bli_dotxv_ref.c. - Integrated changes from f07b176, including a fix for undefined behavior when executing the 1m method under certain conditions. - Updated config_registry; no longer need haswell kernels for zen sub-configuration. - Tweaked marginal and pass thresholds for dotxf. - Reformatted level-1v, -1f, and -3 amd kernels and inserted additional comments. - Updated LICENSE file to explicitly mention that parts are copyright UT-Austin and AMD. - Added AMD copyright to header templates in build/templates. Summary of previous changes from 'amd' branch. - Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and s and d assembly-based zen gemmtrsm_l microkernels (d6x8). - Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv, and scalv, with extra-unrolling variants for axpyv and scalv. - Added a small matrix handler to bli_gemm_front(), with the handler implemented in kernels/zen/3/bli_gemm_small_matrix.c. - Added additional logic to sumsqv that first attempts to compute the sum of the squares via dotv(). If there is a floating-point exception (FE_OVERFLOW), then the previous (numerically conservative) code is used; otherwise, the result of dotv() is square-rooted and stored as the result. This new implementation is only enabled when FE_OVERFLOW is #defined. If the macro is not #defined, then the previous implementation is used. - Added axpyv and dotv standalone test drivers to test directory. - Added zen support to old cpuid_x86.c driver in build/auto-detect/old. - Added thread-local and __attribute__-related macros to bli_macro_defs.h.
2026-06-08 14:58:42 +00:00 · 2018-02-21 17:43:32 -06:00
parent fa74af4e1f 5a7005dd44
commit 16813335bd
46 changed files with 17161 additions and 255 deletions
--- a/21
+++ b/21
@@ -1,3 +1,18 @@
+NOTE: Portions of this project's code are copyrighted by
+
+  The University of Texas at Austin
+
+while other portions are copyrighted by
+
+  Advanced Micro Devices, Inc.
+
+with some overlap. Please see file-level license headers for file-specific
+copyright info. All parties provide their portions of the code under the
+3-clause BSD license, found below.
+
+---
+
+Copyright (C) 2017, Advanced Micro Devices, Inc.
 Copyright (C) 2014, The University of Texas at Austin

 Redistribution and use in source and binary forms, with or without
@@ -8,9 +23,9 @@ met:
 - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas at Austin nor the names
-   of its contributors may be used to endorse or promote products
-   derived from this software without specific prior written permission.
+ - Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/build/auto-detect/old/cpuid_x86.c
+++ b/build/auto-detect/old/cpuid_x86.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2015, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -233,6 +234,14 @@ int cpu_detect()
          else
            return CPUNAME_GENERIC; //OS don't support AVX.
        }
+      case 8:
+	switch (model){
+	case 1:
+          if(support_avx())
+	    return CPUNAME_ZEN;
+          else
+            return CPUNAME_REFERENCE; //OS don't support AVX.
+	}
      }
      break;
    }
--- a/build/templates/license.c
+++ b/build/templates/license.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
+    - Neither the name of copyright holder(s) nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

--- a/build/templates/license.h
+++ b/build/templates/license.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas at Austin nor the names
+    - Neither the name of copyright holder(s) nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

--- a/build/templates/license.sh
+++ b/build/templates/license.sh
@@ -1,10 +1,11 @@
-#!/bin/sh
+#!/usr/bin/env bash
 #
 #  BLIS    
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2017, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -14,7 +15,7 @@
 #   - Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
-#   - Neither the name of The University of Texas at Austin nor the names
+#   - Neither the name of copyright holder(s) nor the names
 #     of its contributors may be used to endorse or promote products
 #     derived from this software without specific prior written permission.
 #
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -46,12 +46,61 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	// Update the context with optimized native gemm micro-kernels and
 	// their storage preferences.
 	bli_cntx_set_l3_nat_ukrs
+	(
+	  6,
+	  // gemm
+	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_zen_asm_6x16,       TRUE,
+	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen_asm_6x8,        TRUE,
+	  // gemmtrsm_l
+	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_zen_asm_6x16, TRUE,
+	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen_asm_6x8,  TRUE,
+	  // gemmtrsm_u
+	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_zen_asm_6x16, TRUE,
+	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen_asm_6x8,  TRUE,
+	  cntx
+	);
+
+	bli_cntx_set_l1f_kers
 	(
 	  4,
-	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16, TRUE,
-	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,  TRUE,
-	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,  TRUE,
-	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,  TRUE,
+	  // axpyf
+	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
+	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+	  // dotxf
+	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
+	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
+	  cntx
+	);
+
+	// Update the context with optimized level-1v kernels.
+	bli_cntx_set_l1v_kers
+	(
+	  10,
+	  // amaxv
+	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
+	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
+	  // axpyv
+#if 0
+	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
+	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
+#else
+	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
+	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
+#endif
+	  // dotv
+	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
+	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+	  // dotxv
+	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
+	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+	  // scalv
+#if 0
+	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
+	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
+#else
+	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
+	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+#endif
 	  cntx
 	);

@@ -59,20 +108,28 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	//                                           s      d      c      z
 	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   510,   144,    72 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,  1024,   256,   256 );
 	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
+	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,     8,     8 );
+	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,     8,     8 );

 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
 	bli_cntx_set_blkszs
 	(
-	  BLIS_NAT, 5,
+	  BLIS_NAT, 7,
+	  // level-3
 	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
 	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
 	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
 	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
 	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+	  // level-1f
+	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
+	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
 	  cntx
 	);
 }
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Advanced Micro Devices, Inc

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -35,6 +36,19 @@
 //#ifndef BLIS_FAMILY_H
 //#define BLIS_FAMILY_H

+// By default, it is effective to parallelize the outer loops.
+// Setting these macros to 1 will force JR and IR inner loops
+// to be not paralleized.
+#define BLIS_DEFAULT_MR_THREAD_MAX 1
+#define BLIS_DEFAULT_NR_THREAD_MAX 1
+
+
+//#define BLIS_ENABLE_SMALL_MATRIX
+
+// This will select the threshold below which small matrix code will be called.
+//#define BLIS_SMALL_MATRIX_THRES        700
+//#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
+//#define BLIS_SMALL_K_RECT_MATRIX_THRES 128



--- a/config/zen/bli_kernel.h
+++ b/config/zen/bli_kernel.h
@@ -0,0 +1,274 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2017, Advanced Micro Devices, Inc
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+//
+
+// threading related
+// By default it is effective to paralleize the 
+// outerloops. Setting these macros to 1 will force
+// JR and NR inner loops to be not paralleized.
+#define BLIS_DEFAULT_MR_THREAD_MAX 1 
+#define BLIS_DEFAULT_NR_THREAD_MAX 1 
+
+// sgemm micro-kernel
+
+#if 0
+
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_24x4
+#define BLIS_DEFAULT_MC_S          264
+#define BLIS_DEFAULT_KC_S          128
+#define BLIS_DEFAULT_NC_S          4080
+#define BLIS_DEFAULT_MR_S          24
+#define BLIS_DEFAULT_NR_S          4
+#endif
+
+#if 0
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_16x6
+#define BLIS_DEFAULT_MC_S          144
+#define BLIS_DEFAULT_KC_S          256
+#define BLIS_DEFAULT_NC_S          4080
+#define BLIS_DEFAULT_MR_S          16
+#define BLIS_DEFAULT_NR_S          6
+#endif
+
+#if 1
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_6x16
+#define BLIS_DEFAULT_MC_S          144
+#define BLIS_DEFAULT_KC_S          256
+#define BLIS_DEFAULT_NC_S          4080
+#define BLIS_DEFAULT_MR_S          6
+#define BLIS_DEFAULT_NR_S          16
+
+#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
+
+#endif
+
+// dgemm micro-kernel
+
+#if 0
+
+#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_12x4
+#define BLIS_DEFAULT_MC_D          96
+#define BLIS_DEFAULT_KC_D          192
+#define BLIS_DEFAULT_NC_D          4080
+#define BLIS_DEFAULT_MR_D          12
+#define BLIS_DEFAULT_NR_D          4
+#endif
+
+#if 0
+#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_8x6
+#define BLIS_DEFAULT_MC_D          72
+#define BLIS_DEFAULT_KC_D          256
+#define BLIS_DEFAULT_NC_D          4080
+#define BLIS_DEFAULT_MR_D          8
+#define BLIS_DEFAULT_NR_D          6
+#endif
+
+#if 1
+#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_6x8
+#define BLIS_DEFAULT_MC_D         510  // 72 /* Improves performance for large Matrices */
+#define BLIS_DEFAULT_KC_D         1024 // 256
+#define BLIS_DEFAULT_NC_D          4080
+#define BLIS_DEFAULT_MR_D          6
+#define BLIS_DEFAULT_NR_D          8
+
+#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
+#endif
+
+// cgemm micro-kernel
+
+#if 1
+#define BLIS_CGEMM_UKERNEL         bli_cgemm_asm_3x8
+#define BLIS_DEFAULT_MC_C          144
+#define BLIS_DEFAULT_KC_C          256
+#define BLIS_DEFAULT_NC_C          4080
+#define BLIS_DEFAULT_MR_C          3
+#define BLIS_DEFAULT_NR_C          8
+
+#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
+#endif
+
+// zgemm micro-kernel
+
+#if 1
+#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_3x4
+#define BLIS_DEFAULT_MC_Z          72
+#define BLIS_DEFAULT_KC_Z          256
+#define BLIS_DEFAULT_NC_Z          4080
+#define BLIS_DEFAULT_MR_Z          3
+#define BLIS_DEFAULT_NR_Z          4
+
+#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
+#endif
+
+
+// zgemm micro-kernel
+
+#if 1
+#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_3x4
+#define BLIS_DEFAULT_MC_Z          72
+#define BLIS_DEFAULT_KC_Z          256
+#define BLIS_DEFAULT_NC_Z          4080
+#define BLIS_DEFAULT_MR_Z          3
+#define BLIS_DEFAULT_NR_Z          4
+
+#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
+#endif
+
+// -- trsm-related --
+
+#define BLIS_STRSM_L_UKERNEL   bli_strsm_l_int_6x16
+#define BLIS_DTRSM_L_UKERNEL   bli_dtrsm_l_int_6x8
+
+// --gemmtrsm-related --
+#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_6x16
+#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_6x8
+
+#define BLIS_SMALL_MATRIX_ENABLE
+//This will select the threshold below which small matrix code will be called.
+#define BLIS_SMALL_MATRIX_THRES 700
+#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
+#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
+
+gint_t bli_gemm_small_matrix
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+// -- unpackm --
+
+#define BLIS_DEFAULT_1F_S    8
+#define BLIS_DEFAULT_1F_D    4
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+// -- dotaxpyv --
+
+// -- axpyf --
+#define BLIS_SAXPYF_KERNEL           bli_saxpyf_int_var1
+#define BLIS_DAXPYF_KERNEL           bli_daxpyf_int_var1
+
+// -- dotxf --
+#define BLIS_SDOTXF_KERNEL           bli_sdotxf_int_var1
+#define BLIS_DDOTXF_KERNEL           bli_ddotxf_int_var1
+
+// -- dotxaxpyf --
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+// -- unpackm --
+
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+// -- amax --
+#define BLIS_SAMAXV_KERNEL         bli_samaxv_opt_var1
+#define BLIS_DAMAXV_KERNEL         bli_damaxv_opt_var1
+// -- addv --
+
+// -- axpyv --
+#define BLIS_DAXPYV_KERNEL         bli_daxpyv_opt_var10
+#define BLIS_SAXPYV_KERNEL         bli_saxpyv_opt_var10
+
+
+// -- copyv --
+
+// -- dotv --
+#define BLIS_DDOTV_KERNEL          bli_ddotv_opt_var1
+#define BLIS_SDOTV_KERNEL          bli_sdotv_opt_var1
+
+// -- dotxv --
+#define BLIS_SDOTXV_KERNEL         bli_sdotxv_unb_var1
+#define BLIS_DDOTXV_KERNEL         bli_ddotxv_unb_var1
+
+// -- invertv --
+
+// -- scal2v --
+
+// -- scalv --
+#define BLIS_SSCALV_KERNEL   bli_sscalv_opt_var2
+#define BLIS_DSCALV_KERNEL   bli_dscalv_opt_var2
+
+// -- setv --
+
+// -- subv --
+
+// -- swapv --
+
+
+
+#endif
+
--- a/config/zen/make_defs.mk
+++ b/config/zen/make_defs.mk
@@ -62,16 +62,20 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -O2 -fomit-frame-pointer
+COPTFLAGS      := -O3 -fomit-frame-pointer
 endif

 CKOPTFLAGS     := $(COPTFLAGS)

 ifeq ($(CC_VENDOR),gcc)
-CVECFLAGS      := -mavx2 -mfpmath=sse -mfma -march=bdver4
+# gcc 6.3 or later:
+#CVECFLAGS      := -mavx2 -mfpmath=sse -mfma -march=znver1
+# gcc 5.4:
+# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
+CVECFLAGS      := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
 else
 ifeq ($(CC_VENDOR),clang)
-CVECFLAGS      := -mavx2 -mfpmath=sse -mfma -march=bdver4
+CVECFLAGS      := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
 else
 $(error gcc or clang are required for this configuration.)
 endif
--- a/2
+++ b/2
@@ -21,7 +21,7 @@ knl:         knl
 skx:         skx

 # AMD architectures.
-zen:         zen/haswell
+zen:         zen
 excavator:   excavator/piledriver
 steamroller: steamroller/piledriver
 piledriver:  piledriver
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -51,6 +52,11 @@ void bli_gemm_front
 	obj_t   b_local;
 	obj_t   c_local;

+#ifdef BLIS_ENABLE_SMALL_MATRIX
+	gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
+	if ( status == BLIS_SUCCESS ) return;
+#endif
+
 	// Check parameters.
 	if ( bli_error_checking_is_enabled() )
 		bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -82,9 +88,9 @@ void bli_gemm_front

 	// Record the threading for each level within the context.
 	bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
-                                   bli_obj_length( c_local ),
-                                   bli_obj_width( c_local ),
-                                   bli_obj_width( a_local ) );
+	                               bli_obj_length( c_local ),
+	                               bli_obj_width( c_local ),
+	                               bli_obj_width( a_local ) );

 	// Invoke the internal back-end via the thread handler.
 	bli_l3_thread_decorator
--- a/frame/3/gemm/bli_gemm_front.h
+++ b/frame/3/gemm/bli_gemm_front.h
@@ -42,3 +42,14 @@ void bli_gemm_front
       cntx_t* cntx,
       cntl_t* cntl
     );
+
+err_t bli_gemm_small
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       cntl_t* cntl
+     );
--- a/frame/base/bli_init.c
+++ b/frame/base/bli_init.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -214,9 +214,9 @@ CNTX_INIT_PROTS( generic )

 // -- AMD64 architectures --

-//#ifdef BLIS_KERNELS_ZEN
-//#include "bli_kernels_zen.h"
-//#endif
+#ifdef BLIS_KERNELS_ZEN
+#include "bli_kernels_zen.h"
+#endif
 //#ifdef BLIS_KERNELS_EXCAVATOR
 //#include "bli_kernels_excavator.h"
 //#endif
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -64,6 +65,49 @@
 #endif


+// -- BLIS Thread Local Storage Keyword --
+
+// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC.
+// There is a small risk here as __GNUC__ can also be defined by some other
+// compiler (other than ICC and CLANG which we know define it) that
+// doesn't support __thread, as __GNUC__ is not quite unique to GCC.
+// But the possibility of someone using such non-main-stream compiler
+// for building BLIS is low.
+#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
+  #define BLIS_THREAD_LOCAL __thread
+#else
+  #define BLIS_THREAD_LOCAL
+#endif
+
+
+// -- BLIS constructor/destructor function attribute --
+
+// __attribute__((constructor/destructor)) is supported by GCC only.
+// There is a small risk here as __GNUC__ can also be defined by some other
+// compiler (other than ICC and CLANG which we know define it) that
+// doesn't support this, as __GNUC__ is not quite unique to GCC.
+// But the possibility of someone using such non-main-stream compiler
+// for building BLIS is low.
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+  // ICC defines __GNUC__ but doesn't support this
+  #define BLIS_ATTRIB_CTOR
+  #define BLIS_ATTRIB_DTOR
+#elif defined(__clang__)
+  // CLANG supports __attribute__, but its documentation doesn't
+  // mention support for constructor/destructor. Compiling with
+  // clang and testing shows that it does support.
+  #define BLIS_ATTRIB_CTOR __attribute__((constructor))
+  #define BLIS_ATTRIB_DTOR __attribute__((destructor))
+#elif defined(__GNUC__)
+  #define BLIS_ATTRIB_CTOR __attribute__((constructor))
+  #define BLIS_ATTRIB_DTOR __attribute__((destructor))
+#else
+  #define BLIS_ATTRIB_CTOR
+  #define BLIS_ATTRIB_DTOR
+#endif
+
+
 // -- Concatenation macros --

 #define BLIS_FUNC_PREFIX_STR       "bli"
--- a/frame/ind/bli_l3_ind.c
+++ b/frame/ind/bli_l3_ind.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -53,7 +54,15 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
             bli_syrknat,  bli_syr2knat, bli_trmm3nat, bli_trmmnat,  bli_trsmnat  },
 };

-static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = 
+//
+// NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
+//
+// BLIS provides APIs to modify this state during runtime. So, one application thread
+// can modify the state, before another starts the corresponding BLIS operation.
+// This is solved by making the induced method status array local to threads.
+
+static BLIS_THREAD_LOCAL
+bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
 {
        /*   gemm   hemm   herk   her2k  symm   syrk,  syr2k  trmm3  trmm   trsm  */
        /*    c     z    */
--- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c
+++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c
@@ -82,9 +82,9 @@ void PASTEMAC(opname,imeth) \
 \
 	/* Query a context for the current induced method. This context is
 	   managed and cached by the gks and should not be freed by the caller.
-	   Note that we pass in the datatype because it will be needed when
-	   bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
-	   family of functions. */ \
+	   Note that the datatype argument is needed because it will be passed
+	   in when bli_gks_query_ind_cntx() eventually calls the induced method's
+	   _cntx_init() function. */ \
 	cntx = bli_gks_query_ind_cntx( ind, dt ); \
 \
 	/* Some induced methods execute in multiple "stages". */ \
@@ -162,9 +162,9 @@ void PASTEMAC(opname,imeth) \
 \
 	/* Query a context for the current induced method. This context is
 	   managed and cached by the gks and should not be freed by the caller.
-	   Note that we pass in the datatype because it will be needed when
-	   bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
-	   family of functions. */ \
+	   Note that the datatype argument is needed because it will be passed
+	   in when bli_gks_query_ind_cntx() eventually calls the induced method's
+	   _cntx_init() function. */ \
 	cntx = bli_gks_query_ind_cntx( ind, dt ); \
 \
 	/* Some induced methods execute in multiple "stages". */ \
@@ -240,9 +240,9 @@ void PASTEMAC(opname,imeth) \
 \
 	/* Query a context for the current induced method. This context is
 	   managed and cached by the gks and should not be freed by the caller.
-	   Note that we pass in the datatype because it will be needed when
-	   bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
-	   family of functions. */ \
+	   Note that the datatype argument is needed because it will be passed
+	   in when bli_gks_query_ind_cntx() eventually calls the induced method's
+	   _cntx_init() function. */ \
 	cntx = bli_gks_query_ind_cntx( ind, dt ); \
 \
 	/* Some induced methods execute in multiple "stages". */ \
@@ -309,9 +309,9 @@ void PASTEMAC(opname,imeth) \
 \
 	/* Query a context for the current induced method. This context is
 	   managed and cached by the gks and should not be freed by the caller.
-	   Note that we pass in the datatype because it will be needed when
-	   bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
-	   family of functions. */ \
+	   Note that the datatype argument is needed because it will be passed
+	   in when bli_gks_query_ind_cntx() eventually calls the induced method's
+	   _cntx_init() function. */ \
 	cntx = bli_gks_query_ind_cntx( ind, dt ); \
 \
 	/* Some induced methods execute in multiple "stages". */ \
@@ -364,9 +364,9 @@ void PASTEMAC(opname,imeth) \
 \
 	/* Query a context for the current induced method. This context is
 	   managed and cached by the gks and should not be freed by the caller.
-	   Note that we pass in the datatype because it will be needed when
-	   bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
-	   family of functions. */ \
+	   Note that the datatype argument is needed because it will be passed
+	   in when bli_gks_query_ind_cntx() eventually calls the induced method's
+	   _cntx_init() function. */ \
 	cntx = bli_gks_query_ind_cntx( ind, dt ); \
 \
 	{ \
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -33,6 +34,7 @@
 */

 #include "blis.h"
+#include <fenv.h>

 //
 // Define BLAS-like interfaces with typed operands.
@@ -293,7 +295,129 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
 }

-INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
+//INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
+GENTFUNCR( scomplex, float,  c, s, normfv_unb_var1, sumsqv_unb_var1 )
+GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )
+
+#undef  GENTFUNCR
+#ifdef FE_OVERFLOW
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       dim_t    n, \
+       ctype*   x, inc_t incx, \
+       ctype_r* norm, \
+       cntx_t*  cntx  \
+     ) \
+{ \
+	ctype_r* zero       = PASTEMAC(chr,0); \
+	ctype_r* one        = PASTEMAC(chr,1); \
+	ctype_r  scale; \
+	ctype_r  sumsq; \
+	ctype_r  sqrt_sumsq; \
+\
+	/* Initialize scale and sumsq to begin the summation. */ \
+	PASTEMAC(chr,copys)( *zero, scale ); \
+	PASTEMAC(chr,copys)( *one,  sumsq ); \
+\
+	/* An optimization: first try to use dotv to compute the sum of
+	   the squares of the vector. If no floating-point exceptions
+	   (specifically, overflow and invalid exceptions) were produced,
+	   then we accept the computed value and returne early. The cost
+	   of this optimization is the "sunk" cost of the initial dotv
+	   when sumsqv must be used instead. However, we expect that the
+	   vast majority of use cases will not produce exceptions, and
+	   therefore only one pass through the data, via dotv, will be
+	   required. */ \
+	if ( TRUE ) \
+	{ \
+		int      f_exp_raised;\
+		ctype    sumsqc; \
+\
+		feclearexcept( FE_ALL_EXCEPT );\
+\
+		PASTEMAC(ch,dotv)\
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  BLIS_NO_CONJUGATE, \
+		  n,\
+		  x, incx, \
+		  x, incx, \
+		  &sumsqc, \
+		  cntx  \
+		); \
+\
+		PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
+\
+		f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
+\
+		if ( !f_exp_raised ) \
+		{ \
+		    PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
+		    return; \
+		} \
+	} \
+\
+	/* Compute the sum of the squares of the vector. */ \
+	PASTEMAC(ch,kername) \
+	( \
+	  n, \
+	  x, incx, \
+	  &scale, \
+	  &sumsq, \
+	  cntx  \
+	); \
+\
+	/* Compute: norm = scale * sqrt( sumsq ) */ \
+	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
+	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+\
+	/* Store the final value to the output variable. */ \
+	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+}
+#else
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       dim_t    n, \
+       ctype*   x, inc_t incx, \
+       ctype_r* norm, \
+       cntx_t*  cntx  \
+     ) \
+{ \
+	ctype_r* zero       = PASTEMAC(chr,0); \
+	ctype_r* one        = PASTEMAC(chr,1); \
+	ctype_r  scale; \
+	ctype_r  sumsq; \
+	ctype_r  sqrt_sumsq; \
+\
+	/* Initialize scale and sumsq to begin the summation. */ \
+	PASTEMAC(chr,copys)( *zero, scale ); \
+	PASTEMAC(chr,copys)( *one,  sumsq ); \
+\
+	/* Compute the sum of the squares of the vector. */ \
+\
+	PASTEMAC(ch,kername) \
+	( \
+	  n, \
+	  x, incx, \
+	  &scale, \
+	  &sumsq, \
+	  cntx  \
+	); \
+\
+	/* Compute: norm = scale * sqrt( sumsq ) */ \
+	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
+	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+\
+	/* Store the final value to the output variable. */ \
+	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+}
+#endif
+GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
+GENTFUNCR( double,  double, d, d, normfv_unb_var1, sumsqv_unb_var1 )


 #undef  GENTFUNCR
@@ -898,7 +1022,7 @@ void PASTEMAC(ch,varname) \
 				  n_elem - 1, \
 				  &beta, \
 				  x0, incx, \
-                  cntx  \
+				  cntx  \
 				); \
 */ \
 			} \
@@ -937,7 +1061,7 @@ void PASTEMAC(ch,varname) \
 				  n_elem - 1, \
 				  &beta, \
 				  x2, incx, \
-                  cntx  \
+				  cntx  \
 				); \
 */ \
 			} \
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d12x4.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d12x4.c
@@ -634,8 +634,6 @@ void bli_sgemm_haswell_asm_24x4
 	"                                            \n\t"
 	"                                            \n\t"
 	".SDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1257,8 +1255,6 @@ void bli_dgemm_haswell_asm_12x4
 	"                                            \n\t"
 	"                                            \n\t"
 	".DDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d4x12.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d4x12.c
@@ -600,8 +600,6 @@ void bli_sgemm_haswell_asm_4x24
 	"                                            \n\t"
 	"                                            \n\t"
 	".SDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1019,46 +1017,46 @@ void bli_dgemm_haswell_asm_4x12
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3,  %%ymm4   \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3,  %%ymm5   \n\t"
-	"vmovups          %%ymm5,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"vfmadd231pd      (%%r12), %%ymm3,  %%ymm6   \n\t"
-	"vmovups          %%ymm6,  (%%r12)           \n\t"
+	"vmovupd          %%ymm6,  (%%r12)           \n\t"
 	"addq      %%rdi, %%r12                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3,  %%ymm7   \n\t"
-	"vmovups          %%ymm7,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm7,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3,  %%ymm8   \n\t"
-	"vmovups          %%ymm8,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm8,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"vfmadd231pd      (%%r12), %%ymm3,  %%ymm9   \n\t"
-	"vmovups          %%ymm9,  (%%r12)           \n\t"
+	"vmovupd          %%ymm9,  (%%r12)           \n\t"
 	"addq      %%rdi, %%r12                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3,  %%ymm10  \n\t"
-	"vmovups          %%ymm10, (%%rcx)           \n\t"
+	"vmovupd          %%ymm10, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3,  %%ymm11  \n\t"
-	"vmovups          %%ymm11, (%%rdx)           \n\t"
+	"vmovupd          %%ymm11, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"vfmadd231pd      (%%r12), %%ymm3,  %%ymm12  \n\t"
-	"vmovups          %%ymm12, (%%r12)           \n\t"
+	"vmovupd          %%ymm12, (%%r12)           \n\t"
 	"addq      %%rdi, %%r12                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3,  %%ymm13  \n\t"
-	"vmovups          %%ymm13, (%%rcx)           \n\t"
+	"vmovupd          %%ymm13, (%%rcx)           \n\t"
 	//"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3,  %%ymm14  \n\t"
-	"vmovups          %%ymm14, (%%rdx)           \n\t"
+	"vmovupd          %%ymm14, (%%rdx)           \n\t"
 	//"addq      %%rdi, %%rdx                      \n\t"
 	"vfmadd231pd      (%%r12), %%ymm3,  %%ymm15  \n\t"
-	"vmovups          %%ymm15, (%%r12)           \n\t"
+	"vmovupd          %%ymm15, (%%r12)           \n\t"
 	//"addq      %%rdi, %%r12                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1190,8 +1188,6 @@ void bli_dgemm_haswell_asm_4x12
 	"                                            \n\t"
 	"                                            \n\t"
 	".DDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -595,8 +595,6 @@ void bli_sgemm_haswell_asm_6x16
 	"                                            \n\t"
 	"                                            \n\t"
 	".SDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -634,10 +632,10 @@ void bli_sgemm_haswell_asm_6x16
 	"vmovlpd    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" \
 	"vmovhpd    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" \
 	"vperm2f128 $0x20,   %%ymm1,  %%ymm0,  %%ymm0  \n\t" /*\
-	"vmovlps    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
-	"vmovhps    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
-	"vmovlps    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
-	"vmovhps    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
+	"vmovlpd    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
+	"vmovhpd    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
+	"vmovlpd    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
+	"vmovhpd    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
 	"vperm2f128 $0x20,   %%ymm1,  %%ymm2,  %%ymm2  \n\t"*/

 #define DGEMM_OUTPUT_GS_BETA_NZ \
@@ -682,8 +680,8 @@ void bli_dgemm_haswell_asm_6x8
 	"                                            \n\t"
 	"addq           $32 * 4, %%rbx               \n\t"
 	"                                            \n\t" // initialize loop by pre-loading
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"movq                %6, %%rcx               \n\t" // load address of c
 	"movq                %7, %%rdi               \n\t" // load rs_c
@@ -734,12 +732,10 @@ void bli_dgemm_haswell_asm_6x8
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps           -2 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -1 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -2 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -1 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 1
-	"prefetcht0   72 * 8(%%rax)                  \n\t"
-	"                                            \n\t"
 	"vbroadcastsd       6 *  8(%%rax), %%ymm2    \n\t"
 	"vbroadcastsd       7 *  8(%%rax), %%ymm3    \n\t"
 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
@@ -761,11 +757,11 @@ void bli_dgemm_haswell_asm_6x8
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            0 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps            1 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd            0 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd            1 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 2
-	"prefetcht0   80 * 8(%%rax)                  \n\t"
+	"prefetcht0   76 * 8(%%rax)                  \n\t"
 	"                                            \n\t"
 	"vbroadcastsd      12 *  8(%%rax), %%ymm2    \n\t"
 	"vbroadcastsd      13 *  8(%%rax), %%ymm3    \n\t"
@@ -788,8 +784,8 @@ void bli_dgemm_haswell_asm_6x8
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            2 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps            3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd            2 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd            3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 3
 	"vbroadcastsd      18 *  8(%%rax), %%ymm2    \n\t"
@@ -816,8 +812,8 @@ void bli_dgemm_haswell_asm_6x8
 	"addq           $4 * 6 * 8, %%rax            \n\t" // a += 4*6 (unroll x mr)
 	"addq           $4 * 8 * 8, %%rbx            \n\t" // b += 4*8 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -864,8 +860,8 @@ void bli_dgemm_haswell_asm_6x8
 	"addq           $1 * 6 * 8, %%rax            \n\t" // a += 1*6 (unroll x mr)
 	"addq           $1 * 8 * 8, %%rbx            \n\t" // b += 1*8 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -1008,50 +1004,50 @@ void bli_dgemm_haswell_asm_6x8
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm4    \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm5    \n\t"
-	"vmovups          %%ymm5,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm6    \n\t"
-	"vmovups          %%ymm6,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm6,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm7    \n\t"
-	"vmovups          %%ymm7,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm7,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm8    \n\t"
-	"vmovups          %%ymm8,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm8,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm9    \n\t"
-	"vmovups          %%ymm9,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm9,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm10   \n\t"
-	"vmovups          %%ymm10, (%%rcx)           \n\t"
+	"vmovupd          %%ymm10, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm11   \n\t"
-	"vmovups          %%ymm11, (%%rdx)           \n\t"
+	"vmovupd          %%ymm11, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm12   \n\t"
-	"vmovups          %%ymm12, (%%rcx)           \n\t"
+	"vmovupd          %%ymm12, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm13   \n\t"
-	"vmovups          %%ymm13, (%%rdx)           \n\t"
+	"vmovupd          %%ymm13, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm14   \n\t"
-	"vmovups          %%ymm14, (%%rcx)           \n\t"
+	"vmovupd          %%ymm14, (%%rcx)           \n\t"
 	//"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm15   \n\t"
-	"vmovups          %%ymm15, (%%rdx)           \n\t"
+	"vmovupd          %%ymm15, (%%rdx)           \n\t"
 	//"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1070,64 +1066,64 @@ void bli_dgemm_haswell_asm_6x8
 	".DGENSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm4,  %%ymm0           \n\t"
+	"vmovapd           %%ymm4,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm6,  %%ymm0           \n\t"
+	"vmovapd           %%ymm6,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm8,  %%ymm0           \n\t"
+	"vmovapd           %%ymm8,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm10, %%ymm0           \n\t"
+	"vmovapd           %%ymm10, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm12, %%ymm0           \n\t"
+	"vmovapd           %%ymm12, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm14, %%ymm0           \n\t"
+	"vmovapd           %%ymm14, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"                                            \n\t"
 	"                                            \n\t"
 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*cs_c
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm5,  %%ymm0           \n\t"
+	"vmovapd           %%ymm5,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm7,  %%ymm0           \n\t"
+	"vmovapd           %%ymm7,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm9,  %%ymm0           \n\t"
+	"vmovapd           %%ymm9,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm11, %%ymm0           \n\t"
+	"vmovapd           %%ymm11, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm13, %%ymm0           \n\t"
+	"vmovapd           %%ymm13, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm15, %%ymm0           \n\t"
+	"vmovapd           %%ymm15, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1139,38 +1135,38 @@ void bli_dgemm_haswell_asm_6x8
 	".DROWSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm5,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm6,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm6,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm7,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm7,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm8,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm8,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm9,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm9,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm10, (%%rcx)           \n\t"
+	"vmovupd          %%ymm10, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm11, (%%rdx)           \n\t"
+	"vmovupd          %%ymm11, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm12, (%%rcx)           \n\t"
+	"vmovupd          %%ymm12, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm13, (%%rdx)           \n\t"
+	"vmovupd          %%ymm13, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm14, (%%rcx)           \n\t"
+	"vmovupd          %%ymm14, (%%rcx)           \n\t"
 	//"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm15, (%%rdx)           \n\t"
+	"vmovupd          %%ymm15, (%%rdx)           \n\t"
 	//"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1179,8 +1175,6 @@ void bli_dgemm_haswell_asm_6x8
 	"                                            \n\t"
 	"                                            \n\t"
 	".DDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1710,8 +1704,6 @@ void bli_cgemm_haswell_asm_3x8
 	"                                            \n\t"
 	"                                            \n\t"
 	".CDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1761,7 +1753,7 @@ void bli_cgemm_haswell_asm_3x8
 	"vmovupd           %%xmm3,  (%%rcx,%%rsi  )  \n\t" \

 #define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
-	"vmovups    (%%rcx),       %%ymm0            \n\t" \
+	"vmovupd    (%%rcx),       %%ymm0            \n\t" \
 	"vpermilpd  $0x5, %%ymm0,  %%ymm3            \n\t" \
 	"vmulpd           %%ymm1,  %%ymm0,  %%ymm0   \n\t" \
 	"vmulpd           %%ymm2,  %%ymm3,  %%ymm3   \n\t" \
@@ -1803,8 +1795,8 @@ void bli_zgemm_haswell_asm_3x4
 	"                                            \n\t"
 	"addq           $32 * 4, %%rbx               \n\t"
 	"                                            \n\t" // initialize loop by pre-loading
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"movq                %6, %%rcx               \n\t" // load address of c
 	"movq                %7, %%rdi               \n\t" // load rs_c
@@ -1854,8 +1846,8 @@ void bli_zgemm_haswell_asm_3x4
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps           -2 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -1 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -2 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -1 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 1
 	"vbroadcastsd       6 *  8(%%rax), %%ymm2    \n\t"
@@ -1879,8 +1871,8 @@ void bli_zgemm_haswell_asm_3x4
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            0 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps            1 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd            0 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd            1 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 2
 	"prefetcht0  38 * 16(%%rax)                  \n\t"
@@ -1906,8 +1898,8 @@ void bli_zgemm_haswell_asm_3x4
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            2 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps            3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd            2 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd            3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 3
 	"vbroadcastsd      18 *  8(%%rax), %%ymm2    \n\t"
@@ -1934,8 +1926,8 @@ void bli_zgemm_haswell_asm_3x4
 	"addq          $4 * 3 * 16, %%rax            \n\t" // a += 4*3 (unroll x mr)
 	"addq          $4 * 4 * 16, %%rbx            \n\t" // b += 4*4 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -1982,8 +1974,8 @@ void bli_zgemm_haswell_asm_3x4
 	"addq          $1 * 3 * 16, %%rax            \n\t" // a += 1*3 (unroll x mr)
 	"addq          $1 * 4 * 16, %%rbx            \n\t" // b += 1*4 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rbx), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rbx), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -2186,34 +2178,34 @@ void bli_zgemm_haswell_asm_3x4
 	".ZGENSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm4,  %%ymm0            \n\t"
+	"vmovapd          %%ymm4,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm5,  %%ymm0            \n\t"
+	"vmovapd          %%ymm5,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*rs_c
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm8,  %%ymm0            \n\t"
+	"vmovapd          %%ymm8,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm9,  %%ymm0            \n\t"
+	"vmovapd          %%ymm9,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*rs_c
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm12, %%ymm0            \n\t"
+	"vmovapd          %%ymm12, %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm13, %%ymm0            \n\t"
+	"vmovapd          %%ymm13, %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"                                            \n\t"
 	"                                            \n\t"
@@ -2225,14 +2217,14 @@ void bli_zgemm_haswell_asm_3x4
 	".ZROWSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
-	"vmovups          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm8,  (%%r11)           \n\t"
-	"vmovups          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm8,  (%%r11)           \n\t"
+	"vmovupd          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm12, (%%r12)           \n\t"
-	"vmovups          %%ymm13, (%%r12,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm12, (%%r12)           \n\t"
+	"vmovupd          %%ymm13, (%%r12,%%rdx,1)   \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -2240,8 +2232,6 @@ void bli_zgemm_haswell_asm_3x4
 	"                                            \n\t"
 	"                                            \n\t"
 	".ZDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -596,8 +596,6 @@ void bli_sgemm_haswell_asm_16x6
 	"                                            \n\t"
 	"                                            \n\t"
 	".SDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -632,10 +630,10 @@ void bli_sgemm_haswell_asm_16x6
 	"vmovlpd    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" \
 	"vmovhpd    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" \
 	"vperm2f128 $0x20,   %%ymm1,  %%ymm0,  %%ymm0  \n\t" /*\
-	"vmovlps    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
-	"vmovhps    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
-	"vmovlps    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
-	"vmovhps    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
+	"vmovlpd    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
+	"vmovhpd    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
+	"vmovlpd    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
+	"vmovhpd    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
 	"vperm2f128 $0x20,   %%ymm1,  %%ymm2,  %%ymm2  \n\t"*/

 #define DGEMM_OUTPUT_GS_BETA_NZ \
@@ -680,8 +678,8 @@ void bli_dgemm_haswell_asm_8x6
 	"                                            \n\t"
 	"addq           $32 * 4, %%rax               \n\t"
 	"                                            \n\t" // initialize loop by pre-loading
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"movq                %6, %%rcx               \n\t" // load address of c
 	"movq                %8, %%rdi               \n\t" // load cs_c
@@ -732,8 +730,8 @@ void bli_dgemm_haswell_asm_8x6
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -2 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -1 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 1
 	"vbroadcastsd       6 *  8(%%rbx), %%ymm2    \n\t"
@@ -757,8 +755,8 @@ void bli_dgemm_haswell_asm_8x6
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd            0 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd            1 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 2
 	"prefetcht0   76 * 8(%%rax)                  \n\t"
@@ -784,8 +782,8 @@ void bli_dgemm_haswell_asm_8x6
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd            2 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd            3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 3
 	"vbroadcastsd      18 *  8(%%rbx), %%ymm2    \n\t"
@@ -812,8 +810,8 @@ void bli_dgemm_haswell_asm_8x6
 	"addq           $4 * 8 * 8, %%rax            \n\t" // a += 4*8 (unroll x mr)
 	"addq           $4 * 6 * 8, %%rbx            \n\t" // b += 4*6 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -860,8 +858,8 @@ void bli_dgemm_haswell_asm_8x6
 	"addq           $1 * 8 * 8, %%rax            \n\t" // a += 1*8 (unroll x mr)
 	"addq           $1 * 6 * 8, %%rbx            \n\t" // b += 1*6 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -1006,50 +1004,50 @@ void bli_dgemm_haswell_asm_8x6
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm4    \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm5    \n\t"
-	"vmovups          %%ymm5,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm6    \n\t"
-	"vmovups          %%ymm6,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm6,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm7    \n\t"
-	"vmovups          %%ymm7,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm7,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm8    \n\t"
-	"vmovups          %%ymm8,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm8,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm9    \n\t"
-	"vmovups          %%ymm9,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm9,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm10   \n\t"
-	"vmovups          %%ymm10, (%%rcx)           \n\t"
+	"vmovupd          %%ymm10, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm11   \n\t"
-	"vmovups          %%ymm11, (%%rdx)           \n\t"
+	"vmovupd          %%ymm11, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm12   \n\t"
-	"vmovups          %%ymm12, (%%rcx)           \n\t"
+	"vmovupd          %%ymm12, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm13   \n\t"
-	"vmovups          %%ymm13, (%%rdx)           \n\t"
+	"vmovupd          %%ymm13, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm14   \n\t"
-	"vmovups          %%ymm14, (%%rcx)           \n\t"
+	"vmovupd          %%ymm14, (%%rcx)           \n\t"
 	//"addq      %%rdi, %%rcx                      \n\t"
 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm15   \n\t"
-	"vmovups          %%ymm15, (%%rdx)           \n\t"
+	"vmovupd          %%ymm15, (%%rdx)           \n\t"
 	//"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1068,32 +1066,32 @@ void bli_dgemm_haswell_asm_8x6
 	".DGENSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm4,  %%ymm0           \n\t"
+	"vmovapd           %%ymm4,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm6,  %%ymm0           \n\t"
+	"vmovapd           %%ymm6,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm8,  %%ymm0           \n\t"
+	"vmovapd           %%ymm8,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm10, %%ymm0           \n\t"
+	"vmovapd           %%ymm10, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm12, %%ymm0           \n\t"
+	"vmovapd           %%ymm12, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm14, %%ymm0           \n\t"
+	"vmovapd           %%ymm14, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
@@ -1101,32 +1099,32 @@ void bli_dgemm_haswell_asm_8x6
 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*rs_c
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm5,  %%ymm0           \n\t"
+	"vmovapd           %%ymm5,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm7,  %%ymm0           \n\t"
+	"vmovapd           %%ymm7,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm9,  %%ymm0           \n\t"
+	"vmovapd           %%ymm9,  %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm11, %%ymm0           \n\t"
+	"vmovapd           %%ymm11, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm13, %%ymm0           \n\t"
+	"vmovapd           %%ymm13, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps           %%ymm15, %%ymm0           \n\t"
+	"vmovapd           %%ymm15, %%ymm0           \n\t"
 	DGEMM_OUTPUT_GS_BETA_NZ
 	//"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
 	"                                            \n\t"
@@ -1139,38 +1137,38 @@ void bli_dgemm_haswell_asm_8x6
 	".DCOLSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm5,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm6,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm6,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm7,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm7,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm8,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm8,  (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm9,  (%%rdx)           \n\t"
+	"vmovupd          %%ymm9,  (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm10, (%%rcx)           \n\t"
+	"vmovupd          %%ymm10, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm11, (%%rdx)           \n\t"
+	"vmovupd          %%ymm11, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm12, (%%rcx)           \n\t"
+	"vmovupd          %%ymm12, (%%rcx)           \n\t"
 	"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm13, (%%rdx)           \n\t"
+	"vmovupd          %%ymm13, (%%rdx)           \n\t"
 	"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm14, (%%rcx)           \n\t"
+	"vmovupd          %%ymm14, (%%rcx)           \n\t"
 	//"addq      %%rdi, %%rcx                      \n\t"
-	"vmovups          %%ymm15, (%%rdx)           \n\t"
+	"vmovupd          %%ymm15, (%%rdx)           \n\t"
 	//"addq      %%rdi, %%rdx                      \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -1180,8 +1178,6 @@ void bli_dgemm_haswell_asm_8x6
 	"                                            \n\t"
 	"                                            \n\t"
 	".DDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1711,8 +1707,6 @@ void bli_cgemm_haswell_asm_8x3
 	"                                            \n\t"
 	"                                            \n\t"
 	".CDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
@@ -1804,8 +1798,8 @@ void bli_zgemm_haswell_asm_4x3
 	"                                            \n\t"
 	"addq           $32 * 4, %%rax               \n\t"
 	"                                            \n\t" // initialize loop by pre-loading
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"movq                %6, %%rcx               \n\t" // load address of c
 	"movq                %8, %%rdi               \n\t" // load cs_c
@@ -1855,8 +1849,8 @@ void bli_zgemm_haswell_asm_4x3
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps           -2 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -1 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -2 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -1 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 1
 	"vbroadcastsd       6 *  8(%%rbx), %%ymm2    \n\t"
@@ -1880,8 +1874,8 @@ void bli_zgemm_haswell_asm_4x3
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            0 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps            1 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd            0 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd            1 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 2
 	"prefetcht0  38 * 16(%%rax)                  \n\t"
@@ -1907,8 +1901,8 @@ void bli_zgemm_haswell_asm_4x3
 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
 	"                                            \n\t"
-	"vmovaps            2 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps            3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd            2 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd            3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t" // iteration 3
 	"vbroadcastsd      18 *  8(%%rbx), %%ymm2    \n\t"
@@ -1935,8 +1929,8 @@ void bli_zgemm_haswell_asm_4x3
 	"addq          $4 * 4 * 16, %%rax            \n\t" // a += 4*4 (unroll x mr)
 	"addq          $4 * 3 * 16, %%rbx            \n\t" // b += 4*3 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -1983,8 +1977,8 @@ void bli_zgemm_haswell_asm_4x3
 	"addq          $1 * 4 * 16, %%rax            \n\t" // a += 1*4 (unroll x mr)
 	"addq          $1 * 3 * 16, %%rbx            \n\t" // b += 1*3 (unroll x nr)
 	"                                            \n\t"
-	"vmovaps           -4 * 32(%%rax), %%ymm0    \n\t"
-	"vmovaps           -3 * 32(%%rax), %%ymm1    \n\t"
+	"vmovapd           -4 * 32(%%rax), %%ymm0    \n\t"
+	"vmovapd           -3 * 32(%%rax), %%ymm1    \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"decq   %%rsi                                \n\t" // i -= 1;
@@ -2187,34 +2181,34 @@ void bli_zgemm_haswell_asm_4x3
 	".ZGENSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm4,  %%ymm0            \n\t"
+	"vmovapd          %%ymm4,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm5,  %%ymm0            \n\t"
+	"vmovapd          %%ymm5,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"movq      %%r11, %%rcx                      \n\t" // rcx = c + 1*cs_c
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm8,  %%ymm0            \n\t"
+	"vmovapd          %%ymm8,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm9,  %%ymm0            \n\t"
+	"vmovapd          %%ymm9,  %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"movq      %%r12, %%rcx                      \n\t" // rcx = c + 2*cs_c
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm12, %%ymm0            \n\t"
+	"vmovapd          %%ymm12, %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"addq      %%rdx, %%rcx                      \n\t" // c += 2*rs_c;
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovaps          %%ymm13, %%ymm0            \n\t"
+	"vmovapd          %%ymm13, %%ymm0            \n\t"
 	ZGEMM_OUTPUT_GS
 	"                                            \n\t"
 	"                                            \n\t"
@@ -2226,14 +2220,14 @@ void bli_zgemm_haswell_asm_4x3
 	".ZCOLSTORBZ:                                \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm4,  (%%rcx)           \n\t"
-	"vmovups          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm4,  (%%rcx)           \n\t"
+	"vmovupd          %%ymm5,  (%%rcx,%%rdx,1)   \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm8,  (%%r11)           \n\t"
-	"vmovups          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm8,  (%%r11)           \n\t"
+	"vmovupd          %%ymm9,  (%%r11,%%rdx,1)   \n\t"
 	"                                            \n\t"
-	"vmovups          %%ymm12, (%%r12)           \n\t"
-	"vmovups          %%ymm13, (%%r12,%%rdx,1)   \n\t"
+	"vmovupd          %%ymm12, (%%r12)           \n\t"
+	"vmovupd          %%ymm13, (%%r12,%%rdx,1)   \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
 	"                                            \n\t"
@@ -2241,8 +2235,6 @@ void bli_zgemm_haswell_asm_4x3
 	"                                            \n\t"
 	"                                            \n\t"
 	".ZDONE:                                     \n\t"
-    "                                            \n\t"
-    "vzeroupper                                  \n\t"
 	"                                            \n\t"
 	"vzeroupper                                  \n\t"
 	"                                            \n\t"
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -0,0 +1,484 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016, Advanced Micro Devices, Inc
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+typedef union
+{
+	__m128  v;
+	float   f[4];
+} v4sf_t;
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+}v4df_t;
+
+typedef union
+{
+	__m128d v;
+	double  d[2];
+}v2dd_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_samaxv_zen_int
+     (
+       dim_t   n,
+       float*  x, inc_t incx,
+       dim_t*  i_max,
+       cntx_t* cntx
+     )
+{
+	float*  minus_one = PASTEMAC(s,m1);
+	dim_t*  zero_i    = PASTEMAC(i,0);
+
+	float   chi1_r;
+	//float   chi1_i;
+	float   abs_chi1;
+	float   abs_chi1_max;
+	dim_t   i_max_l;
+	dim_t   i;
+
+	/* If the vector length is zero, return early. This directly emulates
+	   the behavior of netlib BLAS's i?amax() routines. */
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(i,copys)( *zero_i, *i_max );
+		return;
+	}
+
+	/* Initialize the index of the maximum absolute value to zero. */
+	PASTEMAC(i,copys)( *zero_i, i_max_l );
+
+	/* Initialize the maximum absolute value search candidate with
+	   -1, which is guaranteed to be less than all values we will
+	   compute. */
+	PASTEMAC(s,copys)( *minus_one, abs_chi1_max );
+
+	// For non-unit strides, or very small vector lengths, compute with
+	// scalar code.
+	if ( incx != 1 || n < 8 )
+	{
+		for ( i = 0; i < n; ++i )
+		{
+			float* chi1 = x + (i  )*incx;
+
+			/* Get the real and imaginary components of chi1. */
+			chi1_r = *chi1;
+
+			/* Replace chi1_r and chi1_i with their absolute values. */
+			chi1_r = fabsf( chi1_r );
+
+			/* Add the real and imaginary absolute values together. */
+			abs_chi1 = chi1_r;
+
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */
+			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			{
+				abs_chi1_max = abs_chi1;
+				i_max_l      = i;
+			}
+		}
+	}
+	else
+	{
+		dim_t  n_iter, n_left;
+		dim_t  num_vec_elements = 8;
+		v8sf_t x_vec, max_vec, maxInx_vec, mask_vec;
+		v8sf_t idx_vec, inc_vec;
+		v8sf_t sign_mask;
+
+		v4sf_t max_vec_lo, max_vec_hi, mask_vec_lo;
+		v4sf_t maxInx_vec_lo, maxInx_vec_hi;
+
+		n_iter = n / num_vec_elements;
+		n_left = n % num_vec_elements;
+
+		idx_vec.v    = _mm256_set_ps( 7, 6, 5, 4, 3, 2, 1, 0 );
+		inc_vec.v    = _mm256_set1_ps( 8 );
+		max_vec.v    = _mm256_set1_ps( -1 );
+		maxInx_vec.v = _mm256_setzero_ps();
+		sign_mask.v  = _mm256_set1_ps( -0.f );
+
+		for ( i = 0; i < n_iter; ++i )
+		{
+			x_vec.v      = _mm256_loadu_ps( x );
+
+			// Get the absolute value of the vector element.
+			x_vec.v      = _mm256_andnot_ps( sign_mask.v, x_vec.v );
+
+			mask_vec.v   = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS );
+
+			max_vec.v    = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v );
+			maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v );
+
+			idx_vec.v += inc_vec.v;
+			x         += num_vec_elements;
+		}
+
+		max_vec_lo.v  = _mm256_extractf128_ps( max_vec.v, 0 );
+		max_vec_hi.v  = _mm256_extractf128_ps( max_vec.v, 1 );
+		mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
+
+		max_vec_lo.v  = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+
+		maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
+		maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
+		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+
+		max_vec_hi.v    = _mm_permute_ps( max_vec_lo.v, 14 );
+		maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
+		mask_vec_lo.v   = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
+
+		max_vec_lo.v    = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+		maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+
+		if ( max_vec_lo.f[0] > max_vec_lo.f[1] )
+		{
+			abs_chi1_max = max_vec_lo.f[0];
+			i_max_l      = maxInx_vec_lo.f[0];
+		}
+		else
+		{
+			abs_chi1_max = max_vec_lo.f[1];
+			i_max_l      = maxInx_vec_lo.f[1];
+		}
+
+		for ( i = n - n_left; i < n; i++ )
+		{
+			float* chi1 = x;
+
+			/* Get the real and imaginary components of chi1. */
+			chi1_r = *chi1;
+
+			/* Replace chi1_r and chi1_i with their absolute values. */
+			abs_chi1 = fabsf( chi1_r );
+
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */
+			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			{
+				abs_chi1_max = abs_chi1;
+				i_max_l      = i;
+			}
+
+			x += 1;
+		}
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// later, especially if BLIS is compiled with -mfpmath=sse).
+	_mm256_zeroupper();
+
+	/* Store final index to output variable. */
+	*i_max = i_max_l;
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_damaxv_zen_int
+     (
+       dim_t    n,
+       double*  x, inc_t incx,
+       dim_t*   i_max,
+       cntx_t*  cntx
+     )
+{
+	double* minus_one = PASTEMAC(d,m1);
+	dim_t*  zero_i    = PASTEMAC(i,0);
+
+	double  chi1_r;
+	//double  chi1_i;
+	double  abs_chi1;
+	double  abs_chi1_max;
+	dim_t   i_max_l;
+	dim_t   i;
+
+	/* If the vector length is zero, return early. This directly emulates
+	   the behavior of netlib BLAS's i?amax() routines. */
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(i,copys)( *zero_i, *i_max );
+		return;
+	}
+
+	/* Initialize the index of the maximum absolute value to zero. */ \
+	PASTEMAC(i,copys)( *zero_i, i_max_l );
+
+	/* Initialize the maximum absolute value search candidate with
+	   -1, which is guaranteed to be less than all values we will
+	   compute. */
+	PASTEMAC(d,copys)( *minus_one, abs_chi1_max );
+
+	// For non-unit strides, or very small vector lengths, compute with
+	// scalar code.
+	if ( incx != 1 || n < 4 )
+	{
+		for ( i = 0; i < n; ++i )
+		{
+			double* chi1 = x + (i  )*incx;
+
+			/* Get the real and imaginary components of chi1. */
+			chi1_r = *chi1;
+
+			/* Replace chi1_r and chi1_i with their absolute values. */
+			chi1_r = fabs( chi1_r );
+
+			/* Add the real and imaginary absolute values together. */
+			abs_chi1 = chi1_r;
+
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */
+			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			{
+				abs_chi1_max = abs_chi1;
+				i_max_l      = i;
+			}
+		}
+	}
+	else
+	{
+		dim_t  n_iter, n_left;
+		dim_t  num_vec_elements = 4;
+		v4df_t x_vec, max_vec, maxInx_vec, mask_vec;
+		v4df_t idx_vec, inc_vec;
+		v4df_t sign_mask;
+
+		v2dd_t max_vec_lo, max_vec_hi, mask_vec_lo;
+		v2dd_t maxInx_vec_lo, maxInx_vec_hi;
+
+		n_iter = n / num_vec_elements;
+		n_left = n % num_vec_elements;
+
+		idx_vec.v    = _mm256_set_pd( 3, 2, 1, 0 );
+		inc_vec.v    = _mm256_set1_pd( 4 );
+		max_vec.v    = _mm256_set1_pd( -1 );
+		maxInx_vec.v = _mm256_setzero_pd();
+		sign_mask.v  = _mm256_set1_pd( -0.f );
+
+		for ( i = 0; i < n_iter; ++i )
+		{
+			x_vec.v      = _mm256_loadu_pd( x );
+
+			// Get the absolute value of the vector element.
+			x_vec.v      = _mm256_andnot_pd( sign_mask.v, x_vec.v );
+
+			mask_vec.v   = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS );
+
+			max_vec.v    = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
+			maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
+
+			idx_vec.v += inc_vec.v;
+			x         += num_vec_elements;
+		}
+
+		max_vec_lo.v  = _mm256_extractf128_pd( max_vec.v, 0 );
+		max_vec_hi.v  = _mm256_extractf128_pd( max_vec.v, 1 );
+		mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
+
+		max_vec_lo.v  = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
+
+		maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
+		maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
+		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+
+		if ( max_vec_lo.d[0] > max_vec_lo.d[1] )
+		{
+			abs_chi1_max = max_vec_lo.d[0];
+			i_max_l      = maxInx_vec_lo.d[0];
+		}
+		else
+		{
+			abs_chi1_max = max_vec_lo.d[1];
+			i_max_l      = maxInx_vec_lo.d[1];
+		}
+
+		for ( i = n - n_left; i < n; i++ )
+		{
+			double* chi1 = x;
+
+			/* Get the real and imaginary components of chi1. */
+			chi1_r = *chi1;
+
+			/* Replace chi1_r and chi1_i with their absolute values. */
+			abs_chi1 = fabs( chi1_r );
+
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */
+			if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
+			{
+				abs_chi1_max = abs_chi1;
+				i_max_l      = i;
+			}
+
+			x += 1;
+		}
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// later, especially if BLIS is compiled with -mfpmath=sse).
+	_mm256_zeroupper();
+
+	/* Store final index to output variable. */
+	*i_max = i_max_l;
+}
+
+// -----------------------------------------------------------------------------
+
+#if 0
+#undef  GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname) \
+     ( \
+       dim_t    n, \
+       ctype*   x, inc_t incx, \
+       dim_t*   i_max, \
+       cntx_t*  cntx  \
+     ) \
+{ \
+	ctype_r* minus_one = PASTEMAC(chr,m1); \
+	dim_t*   zero_i    = PASTEMAC(i,0); \
+\
+	ctype_r  chi1_r; \
+	ctype_r  chi1_i; \
+	ctype_r  abs_chi1; \
+	ctype_r  abs_chi1_max; \
+	dim_t    i; \
+\
+	/* Initialize the index of the maximum absolute value to zero. */ \
+	PASTEMAC(i,copys)( zero_i, *i_max ); \
+\
+	/* If the vector length is zero, return early. This directly emulates
+	   the behavior of netlib BLAS's i?amax() routines. */ \
+	if ( bli_zero_dim1( n ) ) return; \
+\
+	/* Initialize the maximum absolute value search candidate with
+	   -1, which is guaranteed to be less than all values we will
+	   compute. */ \
+	PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \
+\
+	if ( incx == 1 ) \
+	{ \
+		for ( i = 0; i < n; ++i ) \
+		{ \
+			/* Get the real and imaginary components of chi1. */ \
+			PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
+\
+			/* Replace chi1_r and chi1_i with their absolute values. */ \
+			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
+			PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
+\
+			/* Add the real and imaginary absolute values together. */ \
+			PASTEMAC(chr,set0s)( abs_chi1 ); \
+			PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
+			PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
+\
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */ \
+			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+			{ \
+				abs_chi1_max = abs_chi1; \
+				*i_max       = i; \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( i = 0; i < n; ++i ) \
+		{ \
+			ctype* chi1 = x + (i  )*incx; \
+\
+			/* Get the real and imaginary components of chi1. */ \
+			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+\
+			/* Replace chi1_r and chi1_i with their absolute values. */ \
+			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
+			PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
+\
+			/* Add the real and imaginary absolute values together. */ \
+			PASTEMAC(chr,set0s)( abs_chi1 ); \
+			PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
+			PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
+\
+			/* If the absolute value of the current element exceeds that of
+			   the previous largest, save it and its index. If NaN is
+			   encountered, then treat it the same as if it were a valid
+			   value that was smaller than any previously seen. This
+			   behavior mimics that of LAPACK's ?lange(). */ \
+			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
+			{ \
+				abs_chi1_max = abs_chi1; \
+				*i_max       = i; \
+			} \
+		} \
+	} \
+}
+GENTFUNCR( scomplex, float,  c, s, amaxv_zen_int )
+GENTFUNCR( dcomplex, double, z, d, amaxv_zen_int )
+#endif
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -0,0 +1,262 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_saxpyv_zen_int
+     (
+       conj_t           conjx,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	float*  restrict x0;
+	float*  restrict y0;
+
+	v8sf_t           alphav;
+	v8sf_t           x0v, x1v, x2v, x3v;
+	v8sf_t           y0v, y1v, y2v, y3v;
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Broadcast the alpha scalar to all elements of a vector register.
+	alphav.v = _mm256_broadcast_ss( alpha );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the input values.
+		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+
+		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+
+		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+		// perform : y += alpha * x;
+		y0v.v = _mm256_fmadd_ps( alphav.v, x0v.v, y0v.v );
+		y1v.v = _mm256_fmadd_ps( alphav.v, x1v.v, y1v.v );
+		y2v.v = _mm256_fmadd_ps( alphav.v, x2v.v, y2v.v );
+		y3v.v = _mm256_fmadd_ps( alphav.v, x3v.v, y3v.v );
+
+		// Store the output.
+		_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+		_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
+		_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), y2v.v );
+		_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), y3v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+		y0 += n_elem_per_reg * n_iter_unroll;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	const float alphac = *alpha;
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const float x0c = *x0;
+
+		*y0 += alphac * x0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_daxpyv_zen_int
+     (
+       conj_t           conjx,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t       n_elem_per_reg = 4;
+	const dim_t       n_iter_unroll  = 4;
+
+	dim_t             i;
+	dim_t             n_viter;
+	dim_t             n_left;
+
+	double*  restrict x0;
+	double*  restrict y0;
+
+	v4df_t            alphav;
+	v4df_t            x0v, x1v, x2v, x3v;
+	v4df_t            y0v, y1v, y2v, y3v;
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Broadcast the alpha scalar to all elements of a vector register.
+	alphav.v = _mm256_broadcast_sd( alpha );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the input values.
+		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+
+		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+		// perform : y += alpha * x;
+		y0v.v = _mm256_fmadd_pd( alphav.v, x0v.v, y0v.v );
+		y1v.v = _mm256_fmadd_pd( alphav.v, x1v.v, y1v.v );
+		y2v.v = _mm256_fmadd_pd( alphav.v, x2v.v, y2v.v );
+		y3v.v = _mm256_fmadd_pd( alphav.v, x3v.v, y3v.v );
+
+		// Store the output.
+		_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+		_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
+		_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), y2v.v );
+		_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), y3v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+		y0 += n_elem_per_reg * n_iter_unroll;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	const double alphac = *alpha;
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const double x0c = *x0;
+
+		*y0 += alphac * x0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+}
+
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -0,0 +1,466 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_saxpyv_zen_int10
+     (
+       conj_t           conjx,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+
+	dim_t            i;
+
+	float*  restrict x0;
+	float*  restrict y0;
+
+	__m256           alphav;
+	__m256           xv[10];
+	__m256           yv[10];
+	__m256           zv[10];
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	if ( incx == 1 && incy == 1 )
+	{
+		// Broadcast the alpha scalar to all elements of a vector register.
+		alphav = _mm256_broadcast_ss( alpha );
+
+		for ( i = 0; (i + 79) < n; i += 80 )
+		{
+			// 80 elements will be processed per loop; 10 FMAs will run per loop.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
+			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
+			zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] );
+			zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] );
+			zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] );
+			zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] );
+			zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] );
+
+			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] );
+
+			x0 += 10*n_elem_per_reg;
+			y0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 39) < n; i += 40 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
+			zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
+
+			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
+
+			x0 += 5*n_elem_per_reg;
+			y0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 31) < n; i += 32 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
+
+			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
+
+			x0 += 4*n_elem_per_reg;
+			y0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+
+			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+
+			x0 += 2*n_elem_per_reg;
+			y0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; (i + 7) < n; i += 8 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+
+			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+
+			x0 += 1*n_elem_per_reg;
+			y0 += 1*n_elem_per_reg;
+		}
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// -mfpmath=sse).
+		_mm256_zeroupper();
+
+		for ( ; (i + 0) < n; i += 1 )
+		{
+			*y0 += (*alpha) * (*x0);
+
+			x0 += 1;
+			y0 += 1;
+		}
+	}
+	else
+	{
+		const float alphac = *alpha;
+
+		for ( i = 0; i < n; ++i )
+		{
+			const float x0c = *x0;
+
+			*y0 += alphac * x0c;
+
+			x0 += incx;
+			y0 += incy;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_daxpyv_zen_int10
+     (
+       conj_t           conjx,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 4;
+
+	dim_t            i;
+
+	double* restrict x0 = x;
+	double* restrict y0 = y;
+
+	__m256d          alphav;
+	__m256d          xv[10];
+	__m256d          yv[10];
+	__m256d          zv[10];
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	if ( incx == 1 && incy == 1 )
+	{
+		// Broadcast the alpha scalar to all elements of a vector register.
+		alphav = _mm256_broadcast_sd( alpha );
+
+		for ( i = 0; (i + 39) < n; i += 40 )
+		{
+			// 40 elements will be processed per loop; 10 FMAs will run per loop.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
+			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
+			zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] );
+			zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] );
+			zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] );
+			zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
+			zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );
+
+			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );
+
+			x0 += 10*n_elem_per_reg;
+			y0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 19) < n; i += 20 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
+			zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
+
+			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
+
+			x0 += 5*n_elem_per_reg;
+			y0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+			zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
+			zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
+
+			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
+
+			x0 += 4*n_elem_per_reg;
+			y0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; i + 7 < n; i += 8 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+			zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+
+			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+
+			x0 += 2*n_elem_per_reg;
+			y0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; i + 3 < n; i += 4 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+			zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+
+			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+
+			x0 += 1*n_elem_per_reg;
+			y0 += 1*n_elem_per_reg;
+		}
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// -mfpmath=sse).
+		_mm256_zeroupper();
+
+		for ( ; i < n; i += 1 )
+		{
+			*y0 += (*alpha) * (*x0);
+
+			y0 += 1;
+			x0 += 1;
+		}
+	}
+	else
+	{
+		const double alphac = *alpha;
+
+		for ( i = 0; i < n; ++i )
+		{
+			const double x0c = *x0;
+
+			*y0 += alphac * x0c;
+
+			x0 += incx;
+			y0 += incy;
+		}
+	}
+}
+
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -0,0 +1,296 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sdotv_zen_int
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       float*  restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	float*  restrict x0;
+	float*  restrict y0;
+	float            rho0;
+
+	v8sf_t           rho0v, rho1v, rho2v, rho3v;
+	v8sf_t           x0v, y0v;
+	v8sf_t           x1v, y1v;
+	v8sf_t           x2v, y2v;
+	v8sf_t           x3v, y3v;
+
+	// If the vector dimension is zero, set rho to zero and return early.
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(s,set0s)( *rho );
+		return;
+	}
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Initialize the local scalar rho1 to zero.
+	PASTEMAC(s,set0s)( rho0 );
+
+	// Initialize the unrolled iterations' rho vectors to zero.
+	rho0v.v = _mm256_setzero_ps();
+	rho1v.v = _mm256_setzero_ps();
+	rho2v.v = _mm256_setzero_ps();
+	rho3v.v = _mm256_setzero_ps();
+
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the x and y input vector elements.
+		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+
+		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+
+		// Compute the element-wise product of the x and y vectors,
+		// storing in the corresponding rho vectors.
+		rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
+
+		x0 += ( n_elem_per_reg * n_iter_unroll );
+		y0 += ( n_elem_per_reg * n_iter_unroll );
+	}
+
+	// Accumulate the unrolled rho vectors into a single vector.
+	rho0v.v += rho1v.v;
+	rho0v.v += rho2v.v;
+	rho0v.v += rho3v.v;
+
+	// Accumulate the final rho vector into a single scalar result.
+	rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
+	        rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const float x0c = *x0;
+		const float y0c = *y0;
+
+		rho0 += x0c * y0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+
+	// Copy the final result into the output variable.
+	PASTEMAC(s,copys)( rho0, *rho );
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_ddotv_zen_int
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 4;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	double* restrict x0;
+	double* restrict y0;
+	double           rho0;
+
+	v4df_t           rho0v, rho1v, rho2v, rho3v;
+	v4df_t           x0v, y0v;
+	v4df_t           x1v, y1v;
+	v4df_t           x2v, y2v;
+	v4df_t           x3v, y3v;
+
+	// If the vector dimension is zero, set rho to zero and return early.
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(d,set0s)( *rho );
+		return;
+	}
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Initialize the local scalar rho1 to zero.
+	PASTEMAC(d,set0s)( rho0 );
+
+	// Initialize the unrolled iterations' rho vectors to zero.
+	rho0v.v = _mm256_setzero_pd();
+	rho1v.v = _mm256_setzero_pd();
+	rho2v.v = _mm256_setzero_pd();
+	rho3v.v = _mm256_setzero_pd();
+
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the x and y input vector elements.
+		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+
+		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+		// Compute the element-wise product of the x and y vectors,
+		// storing in the corresponding rho vectors.
+		rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
+
+		x0 += ( n_elem_per_reg * n_iter_unroll );
+		y0 += ( n_elem_per_reg * n_iter_unroll );
+	}
+
+	// Accumulate the unrolled rho vectors into a single vector.
+	rho0v.v += rho1v.v;
+	rho0v.v += rho2v.v;
+	rho0v.v += rho3v.v;
+
+	// Accumulate the final rho vector into a single scalar result.
+	rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const double x0c = *x0;
+		const double y0c = *y0;
+
+		rho0 += x0c * y0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+
+	// Copy the final result into the output variable.
+	PASTEMAC(d,copys)( rho0, *rho );
+}
+
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -0,0 +1,440 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sdotv_zen_int10
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       float*  restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+
+	dim_t            i;
+
+	float*  restrict x0;
+	float*  restrict y0;
+
+	float            rho0;
+
+	__m256           xv[10];
+	__m256           yv[10];
+	v8sf_t           rhov[2];
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(s,set0s)( *rho );
+		return;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	PASTEMAC(s,set0s)( rho0 );
+
+	if ( incx == 1 && incy == 1 )
+	{
+		rhov[0].v = _mm256_setzero_ps();
+		rhov[1].v = _mm256_setzero_ps();
+
+		for ( i = 0; (i + 79) < n; i += 80 )
+		{
+			// 80 elements will be processed per loop; 10 FMAs will run per loop.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[1].v );
+
+			x0 += 10*n_elem_per_reg;
+			y0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 39) < n; i += 40 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
+
+			x0 += 5*n_elem_per_reg;
+			y0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 31) < n; i += 32 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
+
+			x0 += 4*n_elem_per_reg;
+			y0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
+
+			x0 += 2*n_elem_per_reg;
+			y0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; (i + 7) < n; i += 8 )
+		{
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
+
+			x0 += 1*n_elem_per_reg;
+			y0 += 1*n_elem_per_reg;
+		}
+
+		for ( ; (i + 0) < n; i += 1 )
+		{
+			rhov[0].f[0] += x0[i] * y0[i];
+		}
+
+		v8sf_t onev;
+
+		onev.v = _mm256_set1_ps( 1.0f );
+
+		rhov[0].v = _mm256_dp_ps( rhov[0].v, onev.v, 0xf1 );
+        rhov[1].v = _mm256_dp_ps( rhov[1].v, onev.v, 0xf1 );
+
+		// Manually add the results from above to finish the sum.
+		rho0   += rhov[0].f[0] + rhov[0].f[4];
+		rho0   += rhov[1].f[0] + rhov[1].f[4];
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// later, especially if BLIS is compiled with -mfpmath=sse).
+		_mm256_zeroupper();
+
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i )
+		{
+			const float x0c = *x0;
+			const float y0c = *y0;
+
+			rho0 += x0c * y0c;
+
+			x0 += incx;
+			y0 += incy;
+		}
+	}
+
+	// Copy the final result into the output variable.
+	PASTEMAC(s,copys)( rho0, *rho );
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_ddotv_zen_int10
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 4;
+
+	dim_t            i;
+
+	double* restrict x0;
+	double* restrict y0;
+
+	double           rho0;
+
+	__m256d          xv[10];
+	__m256d          yv[10];
+	v4df_t           rhov[2];
+
+	// If the vector dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim1( n ) )
+	{
+		PASTEMAC(d,set0s)( *rho );
+		return;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	PASTEMAC(d,set0s)( rho0 );
+
+	if ( incx == 1 && incy == 1 )
+	{
+		rhov[0].v = _mm256_setzero_pd();
+		rhov[1].v = _mm256_setzero_pd();
+
+		for ( i = 0; (i + 39) < n; i += 40 )
+		{
+			// 80 elements will be processed per loop; 10 FMAs will run per loop.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+			yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
+			yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
+			yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
+			yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
+			yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[1].v );
+
+			x0 += 10*n_elem_per_reg;
+			y0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 19) < n; i += 20 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+			yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
+
+			x0 += 5*n_elem_per_reg;
+			y0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+			yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+			yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
+			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
+
+			x0 += 4*n_elem_per_reg;
+			y0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; (i + 7) < n; i += 8 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+			yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
+			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
+
+			x0 += 2*n_elem_per_reg;
+			y0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; (i + 3) < n; i += 4 )
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+			yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
+
+			x0 += 1*n_elem_per_reg;
+			y0 += 1*n_elem_per_reg;
+		}
+
+		for ( ; (i + 0) < n; i += 1 )
+		{
+			rhov[0].d[0] += x0[i] * y0[i];
+		}
+
+		// Manually add the results from above to finish the sum.
+		rho0   += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
+		rho0   += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3];
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// later, especially if BLIS is compiled with -mfpmath=sse).
+		_mm256_zeroupper();
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i )
+		{
+			const double x0c = *x0;
+			const double y0c = *y0;
+
+			rho0 += x0c * y0c;
+
+			x0 += incx;
+			y0 += incy;
+		}
+	}
+
+	// Copy the final result into the output variable.
+	PASTEMAC(d,copys)( rho0, *rho );
+}
+
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -0,0 +1,308 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sdotxv_zen_int
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       float*  restrict beta,
+       float*  restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	float*  restrict x0;
+	float*  restrict y0;
+	float            rho0;
+
+	v8sf_t           rho0v, rho1v, rho2v, rho3v;
+	v8sf_t           x0v, y0v;
+	v8sf_t           x1v, y1v;
+	v8sf_t           x2v, y2v;
+	v8sf_t           x3v, y3v;
+
+	// If beta is zero, initialize rho1 to zero instead of scaling
+	// rho by beta (in case rho contains NaN or Inf).
+	if ( PASTEMAC(s,eq0)( *beta ) )
+	{
+		PASTEMAC(s,set0s)( *rho );
+	}
+	else
+	{
+		PASTEMAC(s,scals)( *beta, *rho );
+	}
+
+	// If the vector dimension is zero, output rho and return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Initialize the unrolled iterations' rho vectors to zero.
+	rho0v.v = _mm256_setzero_ps();
+	rho1v.v = _mm256_setzero_ps();
+	rho2v.v = _mm256_setzero_ps();
+	rho3v.v = _mm256_setzero_ps();
+
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the x and y input vector elements.
+		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+
+		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+
+		// Compute the element-wise product of the x and y vectors,
+		// storing in the corresponding rho vectors.
+		rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
+
+		x0 += ( n_elem_per_reg * n_iter_unroll );
+		y0 += ( n_elem_per_reg * n_iter_unroll );
+	}
+
+	// Accumulate the unrolled rho vectors into a single vector.
+	rho0v.v += rho1v.v;
+	rho0v.v += rho2v.v;
+	rho0v.v += rho3v.v;
+
+	// Accumulate the final rho vector into a single scalar result.
+	rho0 = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
+	       rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const float x0c = *x0;
+		const float y0c = *y0;
+
+		rho0 += x0c * y0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+
+	// Accumulate the final result into the output variable.
+	PASTEMAC(s,axpys)( *alpha, rho0, *rho );
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_ddotxv_zen_int
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict beta,
+       double* restrict rho,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 4;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	double* restrict x0;
+	double* restrict y0;
+	double           rho0;
+
+	v4df_t           rho0v, rho1v, rho2v, rho3v;
+	v4df_t           x0v, y0v;
+	v4df_t           x1v, y1v;
+	v4df_t           x2v, y2v;
+	v4df_t           x3v, y3v;
+
+	// If beta is zero, initialize rho1 to zero instead of scaling
+	// rho by beta (in case rho contains NaN or Inf).
+	if ( PASTEMAC(d,eq0)( *beta ) )
+	{
+		PASTEMAC(d,set0s)( *rho );
+	}
+	else
+	{
+		PASTEMAC(d,scals)( *beta, *rho );
+	}
+
+	// If the vector dimension is zero, output rho and return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 || incy != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+	y0 = y;
+
+	// Initialize the unrolled iterations' rho vectors to zero.
+	rho0v.v = _mm256_setzero_pd();
+	rho1v.v = _mm256_setzero_pd();
+	rho2v.v = _mm256_setzero_pd();
+	rho3v.v = _mm256_setzero_pd();
+
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the x and y input vector elements.
+		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+		y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+		y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+
+		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+		y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+		
+		// Compute the element-wise product of the x and y vectors,
+		// storing in the corresponding rho vectors.
+		rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
+
+		x0 += ( n_elem_per_reg * n_iter_unroll );
+		y0 += ( n_elem_per_reg * n_iter_unroll );
+	}
+
+	// Accumulate the unrolled rho vectors into a single vector.
+	rho0v.v += rho1v.v;
+	rho0v.v += rho2v.v;
+	rho0v.v += rho3v.v;
+
+	// Accumulate the final rho vector into a single scalar result.
+	rho0 = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		const double x0c = *x0;
+		const double y0c = *y0;
+
+		rho0 += x0c * y0c;
+
+		x0 += incx;
+		y0 += incy;
+	}
+
+	// Accumulate the final result into the output variable.
+	PASTEMAC(d,axpys)( *alpha, rho0, *rho );
+}
+
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -0,0 +1,266 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sscalv_zen_int
+     (
+       conj_t           conjalpha,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 4;
+
+	dim_t            i;
+	dim_t            n_viter;
+	dim_t            n_left;
+
+	float*  restrict x0;
+
+	v8sf_t           alphav;
+	v8sf_t           x0v, x1v, x2v, x3v;
+
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
+
+	// If alpha is zero, use setv (in case y contains NaN or Inf).
+	if ( PASTEMAC(s,eq0)( *alpha ) )
+	{
+		float*   zero = bli_s0;
+		ssetv_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+		return;
+	}
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+
+	// Broadcast the alpha scalar to all elements of a vector register.
+	alphav.v = _mm256_broadcast_ss( alpha );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the input values.
+		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+		x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+		x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+		x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+		// perform : x := alpha * x;
+		x0v.v = _mm256_mul_ps( alphav.v, x0v.v );
+		x1v.v = _mm256_mul_ps( alphav.v, x1v.v );
+		x2v.v = _mm256_mul_ps( alphav.v, x2v.v );
+		x3v.v = _mm256_mul_ps( alphav.v, x3v.v );
+
+		// Store the output.
+		_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), x0v.v );
+		_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), x1v.v );
+		_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), x2v.v );
+		_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), x3v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	const float alphac = *alpha;
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		*x0 *= alphac;
+
+		x0 += incx;
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dscalv_zen_int
+     (
+       conj_t           conjalpha,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       cntx_t*          cntx
+     )
+{
+	const dim_t       n_elem_per_reg = 4;
+	const dim_t       n_iter_unroll  = 4;
+
+	dim_t             i;
+	dim_t             n_viter;
+	dim_t             n_left;
+
+	double*  restrict x0;
+
+	v4df_t            alphav;
+	v4df_t            x0v, x1v, x2v, x3v;
+
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
+
+	// If alpha is zero, use setv (in case y contains NaN or Inf).
+	if ( PASTEMAC(d,eq0)( *alpha ) )
+	{
+		double*  zero = bli_d0;
+		dsetv_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+		return;
+	}
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override n_viter and n_left to use scalar code
+	// for all iterations.
+	if ( incx != 1 )
+	{
+		n_viter = 0;
+		n_left  = n;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+
+	// Broadcast the alpha scalar to all elements of a vector register.
+	alphav.v = _mm256_broadcast_sd( alpha );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < n_viter; ++i )
+	{
+		// Load the input values.
+		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+		x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+		x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+		x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+		// perform : y += alpha * x;
+		x0v.v = _mm256_mul_pd( alphav.v, x0v.v );
+		x1v.v = _mm256_mul_pd( alphav.v, x1v.v );
+		x2v.v = _mm256_mul_pd( alphav.v, x2v.v );
+		x3v.v = _mm256_mul_pd( alphav.v, x3v.v );
+
+		// Store the output.
+		_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), x0v.v );
+		_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), x1v.v );
+		_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), x2v.v );
+		_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), x3v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	const double alphac = *alpha;
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < n_left; ++i )
+	{
+		*x0 *= alphac;
+
+		x0 += incx;
+	}
+}
+
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -0,0 +1,448 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sscalv_zen_int10
+     (
+       conj_t           conjalpha,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 8;
+
+	dim_t            i;
+
+	float*  restrict x0;
+
+	__m256           alphav;
+	__m256           xv[10];
+	__m256           zv[10];
+
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
+
+	// If alpha is zero, use setv.
+	if ( PASTEMAC(s,eq0)( *alpha ) )
+	{
+		float*   zero = bli_s0;
+		ssetv_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
+
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+		return;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+
+	if ( incx == 1 )
+	{
+		// Broadcast the alpha scalar to all elements of a vector register.
+		alphav = _mm256_broadcast_ss( alpha );
+
+		for ( i = 0; (i + 79) < n; i += 80 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_ps( alphav, xv[0] );
+			zv[1] = _mm256_mul_ps( alphav, xv[1] );
+			zv[2] = _mm256_mul_ps( alphav, xv[2] );
+			zv[3] = _mm256_mul_ps( alphav, xv[3] );
+			zv[4] = _mm256_mul_ps( alphav, xv[4] );
+			zv[5] = _mm256_mul_ps( alphav, xv[5] );
+			zv[6] = _mm256_mul_ps( alphav, xv[6] );
+			zv[7] = _mm256_mul_ps( alphav, xv[7] );
+			zv[8] = _mm256_mul_ps( alphav, xv[8] );
+			zv[9] = _mm256_mul_ps( alphav, xv[9] );
+
+			// Store the output.
+			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_ps( (x0 + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_ps( (x0 + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_ps( (x0 + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_ps( (x0 + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_ps( (x0 + 9*n_elem_per_reg), zv[9] );
+
+			x0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 39) < n; i += 40 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_ps( alphav, xv[0] );
+			zv[1] = _mm256_mul_ps( alphav, xv[1] );
+			zv[2] = _mm256_mul_ps( alphav, xv[2] );
+			zv[3] = _mm256_mul_ps( alphav, xv[3] );
+			zv[4] = _mm256_mul_ps( alphav, xv[4] );
+
+			// Store the output.
+			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
+
+			x0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 31) < n; i += 32 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_ps( alphav, xv[0] );
+			zv[1] = _mm256_mul_ps( alphav, xv[1] );
+			zv[2] = _mm256_mul_ps( alphav, xv[2] );
+			zv[3] = _mm256_mul_ps( alphav, xv[3] );
+
+			// Store the output.
+			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
+
+			x0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_ps( alphav, xv[0] );
+			zv[1] = _mm256_mul_ps( alphav, xv[1] );
+
+			// Store the output.
+			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
+
+			x0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; (i + 7) < n; i += 8 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_ps( alphav, xv[0] );
+
+			// Store the output.
+			_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
+
+			x0 += 1*n_elem_per_reg;
+		}
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// as soon as the cleanup loop below if BLIS is compiled with
+		// -mfpmath=sse).
+		_mm256_zeroupper();
+
+		for ( ; (i + 0) < n; i += 1 )
+		{
+			*x0 *= *alpha;
+
+			x0 += 1;
+		}
+	}
+	else
+	{
+		const float alphac = *alpha;
+
+		for ( i = 0; i < n; ++i )
+		{
+			*x0 *= alphac;
+
+			x0 += incx;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dscalv_zen_int10
+     (
+       conj_t           conjalpha,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      n_elem_per_reg = 4;
+
+	dim_t            i;
+
+	double* restrict x0;
+
+	__m256d          alphav;
+	__m256d          xv[10];
+	__m256d          zv[10];
+
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
+
+	// If alpha is zero, use setv.
+	if ( PASTEMAC(d,eq0)( *alpha ) )
+	{
+		double*  zero = bli_d0;
+		dsetv_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+		return;
+	}
+
+	// Initialize local pointers.
+	x0 = x;
+
+	if ( incx == 1 )
+	{
+		// Broadcast the alpha scalar to all elements of a vector register.
+		alphav = _mm256_broadcast_sd( alpha );
+
+		for ( i = 0; (i + 39) < n; i += 40 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_pd( alphav, xv[0] );
+			zv[1] = _mm256_mul_pd( alphav, xv[1] );
+			zv[2] = _mm256_mul_pd( alphav, xv[2] );
+			zv[3] = _mm256_mul_pd( alphav, xv[3] );
+			zv[4] = _mm256_mul_pd( alphav, xv[4] );
+			zv[5] = _mm256_mul_pd( alphav, xv[5] );
+			zv[6] = _mm256_mul_pd( alphav, xv[6] );
+			zv[7] = _mm256_mul_pd( alphav, xv[7] );
+			zv[8] = _mm256_mul_pd( alphav, xv[8] );
+			zv[9] = _mm256_mul_pd( alphav, xv[9] );
+
+			// Store the output.
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
+			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), zv[5] );
+			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), zv[6] );
+			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), zv[7] );
+			_mm256_storeu_pd( (x0 + 8*n_elem_per_reg), zv[8] );
+			_mm256_storeu_pd( (x0 + 9*n_elem_per_reg), zv[9] );
+
+			x0 += 10*n_elem_per_reg;
+		}
+
+		for ( ; (i + 19) < n; i += 20 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_pd( alphav, xv[0] );
+			zv[1] = _mm256_mul_pd( alphav, xv[1] );
+			zv[2] = _mm256_mul_pd( alphav, xv[2] );
+			zv[3] = _mm256_mul_pd( alphav, xv[3] );
+			zv[4] = _mm256_mul_pd( alphav, xv[4] );
+
+			// Store the output.
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
+			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
+
+			x0 += 5*n_elem_per_reg;
+		}
+
+		for ( ; (i + 15) < n; i += 16 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_pd( alphav, xv[0] );
+			zv[1] = _mm256_mul_pd( alphav, xv[1] );
+			zv[2] = _mm256_mul_pd( alphav, xv[2] );
+			zv[3] = _mm256_mul_pd( alphav, xv[3] );
+
+			// Store the output.
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
+
+			x0 += 4*n_elem_per_reg;
+		}
+
+		for ( ; (i + 7) < n; i += 8 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_pd( alphav, xv[0] );
+			zv[1] = _mm256_mul_pd( alphav, xv[1] );
+
+			// Store the output.
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
+
+			x0 += 2*n_elem_per_reg;
+		}
+
+		for ( ; (i + 3) < n; i += 4 )
+		{
+			// Load the input values.
+			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+			// perform : x := alpha * x;
+			zv[0] = _mm256_mul_pd( alphav, xv[0] );
+
+			// Store the output.
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
+
+			x0 += 1*n_elem_per_reg;
+		}
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from from AVX to SSE instructions (which may occur
+		// as soon as the cleanup loop below if BLIS is compiled with
+		// -mfpmath=sse).
+		_mm256_zeroupper();
+
+		for ( ; (i + 0) < n; i += 1 )
+		{
+			*x0 *= *alpha;
+
+			x0 += 1;
+		}
+	}
+	else
+	{
+		const double alphac = *alpha;
+
+		for ( i = 0; i < n; ++i )
+		{
+			*x0 *= alphac;
+
+			x0 += incx;
+		}
+	}
+}
+
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -0,0 +1,485 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_saxpyf_zen_int_8
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       float*  restrict alpha,
+       float*  restrict a, inc_t inca, inc_t lda,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      fuse_fac       = 8;
+
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 1;
+
+	dim_t            i;
+	dim_t            m_viter;
+	dim_t            m_left;
+
+	float*  restrict a0;
+	float*  restrict a1;
+	float*  restrict a2;
+	float*  restrict a3;
+	float*  restrict a4;
+	float*  restrict a5;
+	float*  restrict a6;
+	float*  restrict a7;
+
+	float*  restrict y0;
+
+	v8sf_t           chi0v, chi1v, chi2v, chi3v;
+	v8sf_t           chi4v, chi5v, chi6v, chi7v;
+
+	v8sf_t           a0v, a1v, a2v, a3v;
+	v8sf_t           a4v, a5v, a6v, a7v;
+	v8sf_t           y0v;
+
+	float            chi0, chi1, chi2, chi3;
+	float            chi4, chi5, chi6, chi7;
+
+	// If either dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim2( m, b_n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
+
+	// If b_n is not equal to the fusing factor, then perform the entire
+	// operation as a loop over axpyv.
+	if ( b_n != fuse_fac )
+	{
+		saxpyv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+
+		for ( i = 0; i < b_n; ++i )
+		{
+			float* a1   = a + (0  )*inca + (i  )*lda;
+			float* chi1 = x + (i  )*incx;
+			float* y1   = y + (0  )*incy;
+			float  alpha_chi1;
+
+			PASTEMAC(s,copycjs)( conjx, *chi1, alpha_chi1 );
+			PASTEMAC(s,scals)( *alpha, alpha_chi1 );
+
+			f
+			(
+			  conja,
+			  m,
+			  &alpha_chi1,
+			  a1, inca,
+			  y1, incy,
+			  cntx
+			);
+		}
+
+		return;
+	}
+
+	// At this point, we know that b_n is exactly equal to the fusing factor.
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
+	m_left  = ( m ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override m_viter and m_left to use scalar code
+	// for all iterations.
+	if ( inca != 1 || incy != 1 )
+	{
+		m_viter = 0;
+		m_left  = m;
+	}
+
+	a0   = a + 0*lda;
+	a1   = a + 1*lda;
+	a2   = a + 2*lda;
+	a3   = a + 3*lda;
+	a4   = a + 4*lda;
+	a5   = a + 5*lda;
+	a6   = a + 6*lda;
+	a7   = a + 7*lda;
+	y0   = y;
+
+	chi0 = *( x + 0*incx );
+	chi1 = *( x + 1*incx );
+	chi2 = *( x + 2*incx );
+	chi3 = *( x + 3*incx );
+	chi4 = *( x + 4*incx );
+	chi5 = *( x + 5*incx );
+	chi6 = *( x + 6*incx );
+	chi7 = *( x + 7*incx );
+
+	// Scale each chi scalar by alpha.
+	PASTEMAC(s,scals)( *alpha, chi0 );
+	PASTEMAC(s,scals)( *alpha, chi1 );
+	PASTEMAC(s,scals)( *alpha, chi2 );
+	PASTEMAC(s,scals)( *alpha, chi3 );
+	PASTEMAC(s,scals)( *alpha, chi4 );
+	PASTEMAC(s,scals)( *alpha, chi5 );
+	PASTEMAC(s,scals)( *alpha, chi6 );
+	PASTEMAC(s,scals)( *alpha, chi7 );
+
+	// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+	chi0v.v = _mm256_broadcast_ss( &chi0 );
+	chi1v.v = _mm256_broadcast_ss( &chi1 );
+	chi2v.v = _mm256_broadcast_ss( &chi2 );
+	chi3v.v = _mm256_broadcast_ss( &chi3 );
+	chi4v.v = _mm256_broadcast_ss( &chi4 );
+	chi5v.v = _mm256_broadcast_ss( &chi5 );
+	chi6v.v = _mm256_broadcast_ss( &chi6 );
+	chi7v.v = _mm256_broadcast_ss( &chi7 );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < m_viter; ++i )
+	{
+		// Load the input values.
+		y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+		a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
+
+		// perform : y += alpha * x;
+		y0v.v = _mm256_fmadd_ps( a0v.v, chi0v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a1v.v, chi1v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a2v.v, chi2v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a3v.v, chi3v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a4v.v, chi4v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a5v.v, chi5v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a6v.v, chi6v.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( a7v.v, chi7v.v, y0v.v );
+
+		// Store the output.
+		_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+
+		y0 += n_elem_per_reg;
+		a0 += n_elem_per_reg;
+		a1 += n_elem_per_reg;
+		a2 += n_elem_per_reg;
+		a3 += n_elem_per_reg;
+		a4 += n_elem_per_reg;
+		a5 += n_elem_per_reg;
+		a6 += n_elem_per_reg;
+		a7 += n_elem_per_reg;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the m_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < m_left ; ++i )
+	{
+		float       y0c = *y0;
+
+		const float a0c = *a0;
+		const float a1c = *a1;
+		const float a2c = *a2;
+		const float a3c = *a3;
+		const float a4c = *a4;
+		const float a5c = *a5;
+		const float a6c = *a6;
+		const float a7c = *a7;
+
+		y0c += chi0 * a0c;
+		y0c += chi1 * a1c;
+		y0c += chi2 * a2c;
+		y0c += chi3 * a3c;
+		y0c += chi4 * a4c;
+		y0c += chi5 * a5c;
+		y0c += chi6 * a6c;
+		y0c += chi7 * a7c;
+
+		*y0 = y0c;
+
+		a0 += inca;
+		a1 += inca;
+		a2 += inca;
+		a3 += inca;
+		a4 += inca;
+		a5 += inca;
+		a6 += inca;
+		a7 += inca;
+		y0 += incy;
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_daxpyf_zen_int_8
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      fuse_fac       = 8;
+
+	const dim_t      n_elem_per_reg = 4;
+	const dim_t      n_iter_unroll  = 1;
+
+	dim_t            i;
+	dim_t            m_viter;
+	dim_t            m_left;
+
+	double* restrict a0;
+	double* restrict a1;
+	double* restrict a2;
+	double* restrict a3;
+	double* restrict a4;
+	double* restrict a5;
+	double* restrict a6;
+	double* restrict a7;
+
+	double* restrict y0;
+
+	v4df_t           chi0v, chi1v, chi2v, chi3v;
+	v4df_t           chi4v, chi5v, chi6v, chi7v;
+
+	v4df_t           a0v, a1v, a2v, a3v;
+	v4df_t           a4v, a5v, a6v, a7v;
+	v4df_t           y0v;
+
+	double           chi0, chi1, chi2, chi3;
+	double           chi4, chi5, chi6, chi7;
+
+	// If either dimension is zero, or if alpha is zero, return early.
+	if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
+
+	// If b_n is not equal to the fusing factor, then perform the entire
+	// operation as a loop over axpyv.
+	if ( b_n != fuse_fac )
+	{
+		daxpyv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+		for ( i = 0; i < b_n; ++i )
+		{
+			double* a1   = a + (0  )*inca + (i  )*lda;
+			double* chi1 = x + (i  )*incx;
+			double* y1   = y + (0  )*incy;
+			double  alpha_chi1;
+
+			PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
+			PASTEMAC(d,scals)( *alpha, alpha_chi1 );
+
+			f
+			(
+			  conja,
+			  m,
+			  &alpha_chi1,
+			  a1, inca,
+			  y1, incy,
+			  cntx
+			);
+		}
+
+		return;
+	}
+
+	// At this point, we know that b_n is exactly equal to the fusing factor.
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
+	m_left  = ( m ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override m_viter and m_left to use scalar code
+	// for all iterations.
+	if ( inca != 1 || incy != 1 )
+	{
+		m_viter = 0;
+		m_left  = m;
+	}
+
+	a0   = a + 0*lda;
+	a1   = a + 1*lda;
+	a2   = a + 2*lda;
+	a3   = a + 3*lda;
+	a4   = a + 4*lda;
+	a5   = a + 5*lda;
+	a6   = a + 6*lda;
+	a7   = a + 7*lda;
+	y0   = y;
+
+	chi0 = *( x + 0*incx );
+	chi1 = *( x + 1*incx );
+	chi2 = *( x + 2*incx );
+	chi3 = *( x + 3*incx );
+	chi4 = *( x + 4*incx );
+	chi5 = *( x + 5*incx );
+	chi6 = *( x + 6*incx );
+	chi7 = *( x + 7*incx );
+
+	// Scale each chi scalar by alpha.
+	PASTEMAC(d,scals)( *alpha, chi0 );
+	PASTEMAC(d,scals)( *alpha, chi1 );
+	PASTEMAC(d,scals)( *alpha, chi2 );
+	PASTEMAC(d,scals)( *alpha, chi3 );
+	PASTEMAC(d,scals)( *alpha, chi4 );
+	PASTEMAC(d,scals)( *alpha, chi5 );
+	PASTEMAC(d,scals)( *alpha, chi6 );
+	PASTEMAC(d,scals)( *alpha, chi7 );
+
+	// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+	chi0v.v = _mm256_broadcast_sd( &chi0 );
+	chi1v.v = _mm256_broadcast_sd( &chi1 );
+	chi2v.v = _mm256_broadcast_sd( &chi2 );
+	chi3v.v = _mm256_broadcast_sd( &chi3 );
+	chi4v.v = _mm256_broadcast_sd( &chi4 );
+	chi5v.v = _mm256_broadcast_sd( &chi5 );
+	chi6v.v = _mm256_broadcast_sd( &chi6 );
+	chi7v.v = _mm256_broadcast_sd( &chi7 );
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < m_viter; ++i )
+	{
+		// Load the input values.
+		y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+		a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
+
+		// perform : y += alpha * x;
+		y0v.v = _mm256_fmadd_pd( a0v.v, chi0v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a1v.v, chi1v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a2v.v, chi2v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a3v.v, chi3v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a4v.v, chi4v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a5v.v, chi5v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a6v.v, chi6v.v, y0v.v );
+		y0v.v = _mm256_fmadd_pd( a7v.v, chi7v.v, y0v.v );
+
+		// Store the output.
+		_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+
+		y0 += n_elem_per_reg;
+		a0 += n_elem_per_reg;
+		a1 += n_elem_per_reg;
+		a2 += n_elem_per_reg;
+		a3 += n_elem_per_reg;
+		a4 += n_elem_per_reg;
+		a5 += n_elem_per_reg;
+		a6 += n_elem_per_reg;
+		a7 += n_elem_per_reg;
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the m_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < m_left ; ++i )
+	{
+		double       y0c = *y0;
+
+		const double a0c = *a0;
+		const double a1c = *a1;
+		const double a2c = *a2;
+		const double a3c = *a3;
+		const double a4c = *a4;
+		const double a5c = *a5;
+		const double a6c = *a6;
+		const double a7c = *a7;
+
+		y0c += chi0 * a0c;
+		y0c += chi1 * a1c;
+		y0c += chi2 * a2c;
+		y0c += chi3 * a3c;
+		y0c += chi4 * a4c;
+		y0c += chi5 * a5c;
+		y0c += chi6 * a6c;
+		y0c += chi7 * a7c;
+
+		*y0 = y0c;
+
+		a0 += inca;
+		a1 += inca;
+		a2 += inca;
+		a3 += inca;
+		a4 += inca;
+		a5 += inca;
+		a6 += inca;
+		a7 += inca;
+		y0 += incy;
+	}
+}
+
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -0,0 +1,702 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "immintrin.h"
+
+/* Union data structure to access AVX registers
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers
+*  One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sdotxf_zen_int_8
+     (
+       conj_t           conjat,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       float*  restrict alpha,
+       float*  restrict a, inc_t inca, inc_t lda,
+       float*  restrict x, inc_t incx,
+       float*  restrict beta,
+       float*  restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      fuse_fac       = 8;
+
+	const dim_t      n_elem_per_reg = 8;
+	const dim_t      n_iter_unroll  = 1;
+
+	dim_t            i;
+	dim_t            m_viter;
+	dim_t            m_left;
+
+	float*  restrict a0;
+	float*  restrict a1;
+	float*  restrict a2;
+	float*  restrict a3;
+	float*  restrict a4;
+	float*  restrict a5;
+	float*  restrict a6;
+	float*  restrict a7;
+
+	float*  restrict x0;
+
+	float            rho0, rho1, rho2, rho3;
+	float            rho4, rho5, rho6, rho7;
+
+	v8sf_t           a0v, a1v, a2v, a3v;
+	v8sf_t           a4v, a5v, a6v, a7v;
+
+	v8sf_t           x0v;
+
+	v8sf_t           rho0v, rho1v, rho2v, rho3v;
+	v8sf_t           rho4v, rho5v, rho6v, rho7v;
+
+	v8sf_t           alphav;
+	v8sf_t           betav;
+	v8sf_t           y0v;
+
+	v8sf_t           onev;
+
+	// If the b_n dimension is zero, y is empty and there is no computation.
+	if ( bli_zero_dim1( b_n ) ) return;
+
+	// If the m dimension is zero, or if alpha is zero, the computation
+	// simplifies to updating y.
+	if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
+	{
+		sscalv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
+		
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  b_n,
+		  beta,
+		  y, incy,
+		  cntx
+		);
+		return;
+	}
+
+	// If b_n is not equal to the fusing factor, then perform the entire
+	// operation as a loop over dotxv.
+	if ( b_n != fuse_fac )
+	{
+		sdotxv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
+
+		for ( i = 0; i < b_n; ++i )
+		{
+			float* a1   = a + (0  )*inca + (i  )*lda;
+			float* x1   = x + (0  )*incx;
+			float* psi1 = y + (i  )*incy;
+
+			f
+			(
+			  conjat,
+			  conjx,
+			  m,
+			  alpha,
+			  a1, inca,
+			  x1, incx,
+			  beta,
+			  psi1,
+			  cntx
+			);
+		}
+		return;
+	}
+
+	// At this point, we know that b_n is exactly equal to the fusing factor.
+	// However, m may not be a multiple of the number of elements per vector.
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
+	m_left  = ( m ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override m_viter and m_left to use scalar code
+	// for all iterations. (NOTE: vector instructions can be used even if
+	// the increment of the output vector, incy, is nonunit.)
+	if ( inca != 1 || incx != 1 )
+	{
+		m_viter = 0;
+		m_left  = m;
+	}
+
+	// Set up pointers for x and the b_n columns of A (rows of A^T).
+	x0 = x;
+	a0 = a + 0*lda;
+	a1 = a + 1*lda;
+	a2 = a + 2*lda;
+	a3 = a + 3*lda;
+	a4 = a + 4*lda;
+	a5 = a + 5*lda;
+	a6 = a + 6*lda;
+	a7 = a + 7*lda;
+
+	// Initialize b_n rho vector accumulators to zero.
+	rho0v.v = _mm256_setzero_ps();
+	rho1v.v = _mm256_setzero_ps();
+	rho2v.v = _mm256_setzero_ps();
+	rho3v.v = _mm256_setzero_ps();
+	rho4v.v = _mm256_setzero_ps();
+	rho5v.v = _mm256_setzero_ps();
+	rho6v.v = _mm256_setzero_ps();
+	rho7v.v = _mm256_setzero_ps();
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < m_viter; ++i )
+	{
+		// Load the input values.
+		x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+		a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
+
+		// perform : rho?v += a?v * x0v;
+		rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_ps( a1v.v, x0v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_ps( a2v.v, x0v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_ps( a3v.v, x0v.v, rho3v.v );
+		rho4v.v = _mm256_fmadd_ps( a4v.v, x0v.v, rho4v.v );
+		rho5v.v = _mm256_fmadd_ps( a5v.v, x0v.v, rho5v.v );
+		rho6v.v = _mm256_fmadd_ps( a6v.v, x0v.v, rho6v.v );
+		rho7v.v = _mm256_fmadd_ps( a7v.v, x0v.v, rho7v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+		a0 += n_elem_per_reg * n_iter_unroll;
+		a1 += n_elem_per_reg * n_iter_unroll;
+		a2 += n_elem_per_reg * n_iter_unroll;
+		a3 += n_elem_per_reg * n_iter_unroll;
+		a4 += n_elem_per_reg * n_iter_unroll;
+		a5 += n_elem_per_reg * n_iter_unroll;
+		a6 += n_elem_per_reg * n_iter_unroll;
+		a7 += n_elem_per_reg * n_iter_unroll;
+	}
+
+#if 0
+	rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
+	        rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
+	rho1 += rho1v.f[0] + rho1v.f[1] + rho1v.f[2] + rho1v.f[3] +
+	        rho1v.f[4] + rho1v.f[5] + rho1v.f[6] + rho1v.f[7];
+	rho2 += rho2v.f[0] + rho2v.f[1] + rho2v.f[2] + rho2v.f[3] +
+	        rho2v.f[4] + rho2v.f[5] + rho2v.f[6] + rho2v.f[7];
+	rho3 += rho3v.f[0] + rho3v.f[1] + rho3v.f[2] + rho3v.f[3] +
+	        rho3v.f[4] + rho3v.f[5] + rho3v.f[6] + rho3v.f[7];
+	rho4 += rho4v.f[0] + rho4v.f[1] + rho4v.f[2] + rho4v.f[3] +
+	        rho4v.f[4] + rho4v.f[5] + rho4v.f[6] + rho4v.f[7];
+	rho5 += rho5v.f[0] + rho5v.f[1] + rho5v.f[2] + rho5v.f[3] +
+	        rho5v.f[4] + rho5v.f[5] + rho5v.f[6] + rho5v.f[7];
+	rho6 += rho6v.f[0] + rho6v.f[1] + rho6v.f[2] + rho6v.f[3] +
+	        rho6v.f[4] + rho6v.f[5] + rho6v.f[6] + rho6v.f[7];
+	rho7 += rho7v.f[0] + rho7v.f[1] + rho7v.f[2] + rho7v.f[3] +
+	        rho7v.f[4] + rho7v.f[5] + rho7v.f[6] + rho7v.f[7];
+#else
+	// Now we need to sum the elements within each vector.
+
+	onev.v = _mm256_set1_ps( 1.0f );
+
+	// Sum the elements of a given rho?v by dotting it with 1. The '1' in
+	// '0xf1' stores the sum of the upper four and lower four values to
+	// the low elements of each lane: elements 4 and 0, respectively. (The
+	// 'f' in '0xf1' means include all four elements of each lane in the
+	// summation.)
+	rho0v.v = _mm256_dp_ps( rho0v.v, onev.v, 0xf1 );
+	rho1v.v = _mm256_dp_ps( rho1v.v, onev.v, 0xf1 );
+	rho2v.v = _mm256_dp_ps( rho2v.v, onev.v, 0xf1 );
+	rho3v.v = _mm256_dp_ps( rho3v.v, onev.v, 0xf1 );
+	rho4v.v = _mm256_dp_ps( rho4v.v, onev.v, 0xf1 );
+	rho5v.v = _mm256_dp_ps( rho5v.v, onev.v, 0xf1 );
+	rho6v.v = _mm256_dp_ps( rho6v.v, onev.v, 0xf1 );
+	rho7v.v = _mm256_dp_ps( rho7v.v, onev.v, 0xf1 );
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the m_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// Manually add the results from above to finish the sum.
+	rho0    = rho0v.f[0] + rho0v.f[4];
+	rho1    = rho1v.f[0] + rho1v.f[4];
+	rho2    = rho2v.f[0] + rho2v.f[4];
+	rho3    = rho3v.f[0] + rho3v.f[4];
+	rho4    = rho4v.f[0] + rho4v.f[4];
+	rho5    = rho5v.f[0] + rho5v.f[4];
+	rho6    = rho6v.f[0] + rho6v.f[4];
+	rho7    = rho7v.f[0] + rho7v.f[4];
+#endif
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < m_left ; ++i )
+	{
+		const float x0c = *x0;
+
+		const float a0c = *a0;
+		const float a1c = *a1;
+		const float a2c = *a2;
+		const float a3c = *a3;
+		const float a4c = *a4;
+		const float a5c = *a5;
+		const float a6c = *a6;
+		const float a7c = *a7;
+
+		rho0 += a0c * x0c;
+		rho1 += a1c * x0c;
+		rho2 += a2c * x0c;
+		rho3 += a3c * x0c;
+		rho4 += a4c * x0c;
+		rho5 += a5c * x0c;
+		rho6 += a6c * x0c;
+		rho7 += a7c * x0c;
+
+		x0 += incx;
+		a0 += inca;
+		a1 += inca;
+		a2 += inca;
+		a3 += inca;
+		a4 += inca;
+		a5 += inca;
+		a6 += inca;
+		a7 += inca;
+	}
+
+	// Insert the scalar rho values into a single vector.
+	rho0v.f[0] = rho0;
+	rho0v.f[1] = rho1;
+	rho0v.f[2] = rho2;
+	rho0v.f[3] = rho3;
+	rho0v.f[4] = rho4;
+	rho0v.f[5] = rho5;
+	rho0v.f[6] = rho6;
+	rho0v.f[7] = rho7;
+
+	// Broadcast the alpha scalar.
+	alphav.v = _mm256_broadcast_ss( alpha );
+
+	// We know at this point that alpha is nonzero; however, beta may still
+	// be zero. If beta is indeed zero, we must overwrite y rather than scale
+	// by beta (in case y contains NaN or Inf).
+	if ( PASTEMAC(s,eq0)( *beta ) )
+	{
+		// Apply alpha to the accumulated dot product in rho:
+		//   y := alpha * rho
+		y0v.v = _mm256_mul_ps( alphav.v, rho0v.v );
+	}
+	else
+	{
+		// Broadcast the beta scalar.
+		betav.v = _mm256_broadcast_ss( beta );
+
+		// Load y.
+		if ( incy == 1 )
+		{
+			y0v.v = _mm256_loadu_ps( y + 0*n_elem_per_reg );
+		}
+		else
+		{
+			y0v.f[0] = *(y + 0*incy); y0v.f[1] = *(y + 1*incy);
+			y0v.f[2] = *(y + 2*incy); y0v.f[3] = *(y + 3*incy);
+			y0v.f[4] = *(y + 4*incy); y0v.f[5] = *(y + 5*incy);
+			y0v.f[6] = *(y + 6*incy); y0v.f[7] = *(y + 7*incy);
+		}
+
+		// Apply beta to y and alpha to the accumulated dot product in rho:
+		//   y := beta * y + alpha * rho
+		y0v.v = _mm256_mul_ps( betav.v, y0v.v );
+		y0v.v = _mm256_fmadd_ps( alphav.v, rho0v.v, y0v.v );
+	}
+
+	// Store the output.
+	if ( incy == 1 )
+	{
+		_mm256_storeu_ps( (y + 0*n_elem_per_reg), y0v.v );
+	}
+	else
+	{
+		*(y + 0*incy) = y0v.f[0]; *(y + 1*incy) = y0v.f[1];
+		*(y + 2*incy) = y0v.f[2]; *(y + 3*incy) = y0v.f[3];
+		*(y + 4*incy) = y0v.f[4]; *(y + 5*incy) = y0v.f[5];
+		*(y + 6*incy) = y0v.f[6]; *(y + 7*incy) = y0v.f[7];
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions.
+	_mm256_zeroupper();
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_ddotxf_zen_int_8
+     (
+       conj_t           conjat,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict beta,
+       double* restrict y, inc_t incy,
+       cntx_t*          cntx
+     )
+{
+	const dim_t      fuse_fac       = 8;
+
+	const dim_t      n_elem_per_reg = 4;
+	const dim_t      n_iter_unroll  = 1;
+
+	dim_t            i;
+	dim_t            m_viter;
+	dim_t            m_left;
+
+	double* restrict a0;
+	double* restrict a1;
+	double* restrict a2;
+	double* restrict a3;
+	double* restrict a4;
+	double* restrict a5;
+	double* restrict a6;
+	double* restrict a7;
+
+	double* restrict x0;
+
+	double           rho0, rho1, rho2, rho3;
+	double           rho4, rho5, rho6, rho7;
+
+	v4df_t           a0v, a1v, a2v, a3v;
+	v4df_t           a4v, a5v, a6v, a7v;
+
+	v4df_t           x0v;
+
+	v4df_t           rho0v, rho1v, rho2v, rho3v;
+	v4df_t           rho4v, rho5v, rho6v, rho7v;
+
+	v4df_t           alphav;
+	v4df_t           betav;
+	v4df_t           y0v, y1v;
+
+	// If the b_n dimension is zero, y is empty and there is no computation.
+	if ( bli_zero_dim1( b_n ) ) return;
+
+	// If the m dimension is zero, or if alpha is zero, the computation
+	// simplifies to updating y.
+	if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) )
+	{
+		dscalv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
+		
+		f
+		(
+		  BLIS_NO_CONJUGATE,
+		  b_n,
+		  beta,
+		  y, incy,
+		  cntx
+		);
+		return;
+	}
+
+	// If b_n is not equal to the fusing factor, then perform the entire
+	// operation as a loop over dotxv.
+	if ( b_n != fuse_fac )
+	{
+		ddotxv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
+
+		for ( i = 0; i < b_n; ++i )
+		{
+			double* a1   = a + (0  )*inca + (i  )*lda;
+			double* x1   = x + (0  )*incx;
+			double* psi1 = y + (i  )*incy;
+
+			f
+			(
+			  conjat,
+			  conjx,
+			  m,
+			  alpha,
+			  a1, inca,
+			  x1, incx,
+			  beta,
+			  psi1,
+			  cntx
+			);
+		}
+		return;
+	}
+
+	// At this point, we know that b_n is exactly equal to the fusing factor.
+	// However, m may not be a multiple of the number of elements per vector.
+
+	// Use the unrolling factor and the number of elements per register
+	// to compute the number of vectorized and leftover iterations.
+	m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
+	m_left  = ( m ) % ( n_elem_per_reg * n_iter_unroll );
+
+	// If there is anything that would interfere with our use of contiguous
+	// vector loads/stores, override m_viter and m_left to use scalar code
+	// for all iterations. (NOTE: vector instructions can be used even if
+	// the increment of the output vector, incy, is nonunit.)
+	if ( inca != 1 || incx != 1 )
+	{
+		m_viter = 0;
+		m_left  = m;
+	}
+
+	// Set up pointers for x and the b_n columns of A (rows of A^T).
+	x0 = x;
+	a0 = a + 0*lda;
+	a1 = a + 1*lda;
+	a2 = a + 2*lda;
+	a3 = a + 3*lda;
+	a4 = a + 4*lda;
+	a5 = a + 5*lda;
+	a6 = a + 6*lda;
+	a7 = a + 7*lda;
+
+	// Initialize b_n rho vector accumulators to zero.
+	rho0v.v = _mm256_setzero_pd();
+	rho1v.v = _mm256_setzero_pd();
+	rho2v.v = _mm256_setzero_pd();
+	rho3v.v = _mm256_setzero_pd();
+	rho4v.v = _mm256_setzero_pd();
+	rho5v.v = _mm256_setzero_pd();
+	rho6v.v = _mm256_setzero_pd();
+	rho7v.v = _mm256_setzero_pd();
+
+	// If there are vectorized iterations, perform them with vector
+	// instructions.
+	for ( i = 0; i < m_viter; ++i )
+	{
+		// Load the input values.
+		x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+		a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+		a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+		a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+		a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+		a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+		a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
+		a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
+		a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
+
+		// perform : rho?v += a?v * x0v;
+		rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v );
+		rho1v.v = _mm256_fmadd_pd( a1v.v, x0v.v, rho1v.v );
+		rho2v.v = _mm256_fmadd_pd( a2v.v, x0v.v, rho2v.v );
+		rho3v.v = _mm256_fmadd_pd( a3v.v, x0v.v, rho3v.v );
+		rho4v.v = _mm256_fmadd_pd( a4v.v, x0v.v, rho4v.v );
+		rho5v.v = _mm256_fmadd_pd( a5v.v, x0v.v, rho5v.v );
+		rho6v.v = _mm256_fmadd_pd( a6v.v, x0v.v, rho6v.v );
+		rho7v.v = _mm256_fmadd_pd( a7v.v, x0v.v, rho7v.v );
+
+		x0 += n_elem_per_reg * n_iter_unroll;
+		a0 += n_elem_per_reg * n_iter_unroll;
+		a1 += n_elem_per_reg * n_iter_unroll;
+		a2 += n_elem_per_reg * n_iter_unroll;
+		a3 += n_elem_per_reg * n_iter_unroll;
+		a4 += n_elem_per_reg * n_iter_unroll;
+		a5 += n_elem_per_reg * n_iter_unroll;
+		a6 += n_elem_per_reg * n_iter_unroll;
+		a7 += n_elem_per_reg * n_iter_unroll;
+	}
+
+#if 0
+	rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
+	rho1 += rho1v.d[0] + rho1v.d[1] + rho1v.d[2] + rho1v.d[3];
+	rho2 += rho2v.d[0] + rho2v.d[1] + rho2v.d[2] + rho2v.d[3];
+	rho3 += rho3v.d[0] + rho3v.d[1] + rho3v.d[2] + rho3v.d[3];
+	rho4 += rho4v.d[0] + rho4v.d[1] + rho4v.d[2] + rho4v.d[3];
+	rho5 += rho5v.d[0] + rho5v.d[1] + rho5v.d[2] + rho5v.d[3];
+	rho6 += rho6v.d[0] + rho6v.d[1] + rho6v.d[2] + rho6v.d[3];
+	rho7 += rho7v.d[0] + rho7v.d[1] + rho7v.d[2] + rho7v.d[3];
+#else
+	// Sum the elements of a given rho?v. This computes the sum of
+	// elements within lanes and stores the sum to both elements.
+	rho0v.v = _mm256_hadd_pd( rho0v.v, rho0v.v );
+	rho1v.v = _mm256_hadd_pd( rho1v.v, rho1v.v );
+	rho2v.v = _mm256_hadd_pd( rho2v.v, rho2v.v );
+	rho3v.v = _mm256_hadd_pd( rho3v.v, rho3v.v );
+	rho4v.v = _mm256_hadd_pd( rho4v.v, rho4v.v );
+	rho5v.v = _mm256_hadd_pd( rho5v.v, rho5v.v );
+	rho6v.v = _mm256_hadd_pd( rho6v.v, rho6v.v );
+	rho7v.v = _mm256_hadd_pd( rho7v.v, rho7v.v );
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions (which may occur
+	// as soon as the m_left cleanup loop below if BLIS is compiled with
+	// -mfpmath=sse).
+	_mm256_zeroupper();
+
+	// Manually add the results from above to finish the sum.
+	rho0 = rho0v.d[0] + rho0v.d[2];
+	rho1 = rho1v.d[0] + rho1v.d[2];
+	rho2 = rho2v.d[0] + rho2v.d[2];
+	rho3 = rho3v.d[0] + rho3v.d[2];
+	rho4 = rho4v.d[0] + rho4v.d[2];
+	rho5 = rho5v.d[0] + rho5v.d[2];
+	rho6 = rho6v.d[0] + rho6v.d[2];
+	rho7 = rho7v.d[0] + rho7v.d[2];
+#endif
+
+	// If there are leftover iterations, perform them with scalar code.
+	for ( i = 0; i < m_left ; ++i )
+	{
+		const double x0c = *x0;
+
+		const double a0c = *a0;
+		const double a1c = *a1;
+		const double a2c = *a2;
+		const double a3c = *a3;
+		const double a4c = *a4;
+		const double a5c = *a5;
+		const double a6c = *a6;
+		const double a7c = *a7;
+
+		rho0 += a0c * x0c;
+		rho1 += a1c * x0c;
+		rho2 += a2c * x0c;
+		rho3 += a3c * x0c;
+		rho4 += a4c * x0c;
+		rho5 += a5c * x0c;
+		rho6 += a6c * x0c;
+		rho7 += a7c * x0c;
+
+		x0 += incx;
+		a0 += inca;
+		a1 += inca;
+		a2 += inca;
+		a3 += inca;
+		a4 += inca;
+		a5 += inca;
+		a6 += inca;
+		a7 += inca;
+	}
+
+	// Insert the scalar rho values into a single vector.
+	rho0v.d[0] = rho0;
+	rho0v.d[1] = rho1;
+	rho0v.d[2] = rho2;
+	rho0v.d[3] = rho3;
+	rho1v.d[0] = rho4;
+	rho1v.d[1] = rho5;
+	rho1v.d[2] = rho6;
+	rho1v.d[3] = rho7;
+
+	// Broadcast the alpha scalar.
+	alphav.v = _mm256_broadcast_sd( alpha );
+
+	// We know at this point that alpha is nonzero; however, beta may still
+	// be zero. If beta is indeed zero, we must overwrite y rather than scale
+	// by beta (in case y contains NaN or Inf).
+	if ( PASTEMAC(d,eq0)( *beta ) )
+	{
+		// Apply alpha to the accumulated dot product in rho:
+		//   y := alpha * rho
+		y0v.v = _mm256_mul_pd( alphav.v, rho0v.v );
+		y1v.v = _mm256_mul_pd( alphav.v, rho1v.v );
+	}
+	else
+	{
+		// Broadcast the beta scalar.
+		betav.v = _mm256_broadcast_sd( beta );
+
+		// Load y.
+		if ( incy == 1 )
+		{
+			y0v.v = _mm256_loadu_pd( y + 0*n_elem_per_reg );
+			y1v.v = _mm256_loadu_pd( y + 1*n_elem_per_reg );
+		}
+		else
+		{
+			y0v.d[0] = *(y + 0*incy); y0v.d[1] = *(y + 1*incy);
+			y0v.d[2] = *(y + 2*incy); y0v.d[3] = *(y + 3*incy);
+			y1v.d[0] = *(y + 4*incy); y1v.d[1] = *(y + 5*incy);
+			y1v.d[2] = *(y + 6*incy); y1v.d[3] = *(y + 7*incy);
+		}
+
+		// Apply beta to y and alpha to the accumulated dot product in rho:
+		//   y := beta * y + alpha * rho
+		y0v.v = _mm256_mul_pd( betav.v, y0v.v );
+		y1v.v = _mm256_mul_pd( betav.v, y1v.v );
+		y0v.v = _mm256_fmadd_pd( alphav.v, rho0v.v, y0v.v );
+		y1v.v = _mm256_fmadd_pd( alphav.v, rho1v.v, y1v.v );
+	}
+
+	if ( incy == 1 )
+	{
+		// Store the output.
+		_mm256_storeu_pd( (y + 0*n_elem_per_reg), y0v.v );
+		_mm256_storeu_pd( (y + 1*n_elem_per_reg), y1v.v );
+	}
+	else
+	{
+		*(y + 0*incy) = y0v.d[0]; *(y + 1*incy) = y0v.d[1];
+		*(y + 2*incy) = y0v.d[2]; *(y + 3*incy) = y0v.d[3];
+		*(y + 4*incy) = y1v.d[0]; *(y + 5*incy) = y1v.d[1];
+		*(y + 6*incy) = y1v.d[2]; *(y + 7*incy) = y1v.d[3];
+	}
+
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from from AVX to SSE instructions.
+	_mm256_zeroupper();
+}
+
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
--- a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c
+++ b/kernels/zen/3/bli_gemm_zen_asm_d6x8.c
--- a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c
+++ b/kernels/zen/3/bli_gemm_zen_asm_d8x6.c
--- a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c
+++ b/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c
--- a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c
+++ b/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -0,0 +1,101 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// -- level-1v --
+
+// amaxv (intrinsics)
+AMAXV_KER_PROT( float,    s, amaxv_zen_int )
+AMAXV_KER_PROT( double,   d, amaxv_zen_int )
+
+// axpyv (intrinsics)
+AXPYV_KER_PROT( float,    s, axpyv_zen_int )
+AXPYV_KER_PROT( double,   d, axpyv_zen_int )
+
+	// axpyv (intrinsics unrolled x10)
+	AXPYV_KER_PROT( float,    s, axpyv_zen_int10 )
+	AXPYV_KER_PROT( double,   d, axpyv_zen_int10 )
+
+// dotv (intrinsics)
+DOTV_KER_PROT( float,    s, dotv_zen_int )
+DOTV_KER_PROT( double,   d, dotv_zen_int )
+
+	// dotv (intrinsics, unrolled x10)
+	DOTV_KER_PROT( float,    s, dotv_zen_int10 )
+	DOTV_KER_PROT( double,   d, dotv_zen_int10 )
+
+// dotxv (intrinsics)
+DOTXV_KER_PROT( float,    s, dotxv_zen_int )
+DOTXV_KER_PROT( double,   d, dotxv_zen_int )
+
+// scalv (intrinsics)
+SCALV_KER_PROT( float,    s, scalv_zen_int )
+SCALV_KER_PROT( double,   d, scalv_zen_int )
+
+	// scalv (intrinsics unrolled x10)
+	SCALV_KER_PROT( float,    s, scalv_zen_int10 )
+	SCALV_KER_PROT( double,   d, scalv_zen_int10 )
+
+// -- level-1f --
+
+// axpyf (intrinsics)
+AXPYF_KER_PROT( float,    s, axpyf_zen_int_8 )
+AXPYF_KER_PROT( double,   d, axpyf_zen_int_8 )
+
+// dotxf (intrinsics)
+DOTXF_KER_PROT( float,    s, dotxf_zen_int_8 )
+DOTXF_KER_PROT( double,   d, dotxf_zen_int_8 )
+
+// -- level-3 --
+
+// gemm (asm d6x8)
+GEMM_UKR_PROT( float,    s, gemm_zen_asm_6x16 )
+GEMM_UKR_PROT( double,   d, gemm_zen_asm_6x8 )
+GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_3x8 )
+GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_3x4 )
+
+// gemmtrsm_l (asm d6x8)
+GEMMTRSM_UKR_PROT( float,    s, gemmtrsm_l_zen_asm_6x16 )
+GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_l_zen_asm_6x8 )
+
+// gemmtrsm_u (asm d6x8)
+GEMMTRSM_UKR_PROT( float,    s, gemmtrsm_u_zen_asm_6x16 )
+GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_zen_asm_6x8 )
+
+
+// gemm (asm d8x6)
+//GEMM_UKR_PROT( float,    s, gemm_zen_asm_16x6 )
+//GEMM_UKR_PROT( double,   d, gemm_zen_asm_8x6 )
+//GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 )
+//GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 )
+
--- a/ref_kernels/1/bli_amaxv_ref.c
+++ b/ref_kernels/1/bli_amaxv_ref.c
@@ -56,14 +56,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	ctype_r  chi1_i; \
 	ctype_r  abs_chi1; \
 	ctype_r  abs_chi1_max; \
+	dim_t    i_max_l; \
 	dim_t    i; \
-\
-	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( zero_i, *i_max ); \
 \
 	/* If the vector length is zero, return early. This directly emulates
 	   the behavior of netlib BLAS's i?amax() routines. */ \
-	if ( bli_zero_dim1( n ) ) return; \
+	if ( bli_zero_dim1( n ) ) \
+	{ \
+		PASTEMAC(i,copys)( *zero_i, *i_max ); \
+		return; \
+	} \
+\
+	/* Initialize the index of the maximum absolute value to zero. */ \
+	PASTEMAC(i,copys)( *zero_i, i_max_l ); \
 \
 	/* Initialize the maximum absolute value search candidate with
 	   -1, which is guaranteed to be less than all values we will
@@ -72,10 +77,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	if ( incx == 1 ) \
 	{ \
+		ctype* chi1 = x; \
+\
 		for ( i = 0; i < n; ++i ) \
 		{ \
 			/* Get the real and imaginary components of chi1. */ \
-			PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
+			PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
 \
 			/* Replace chi1_r and chi1_i with their absolute values. */ \
 			PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
@@ -94,8 +101,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
-				*i_max       = i; \
+				i_max_l      = i; \
 			} \
+\
+			chi1 += 1; \
 		} \
 	} \
 	else \
@@ -124,10 +133,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
 			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
-				*i_max       = i; \
+				i_max_l      = i; \
 			} \
 		} \
 	} \
+\
+	/* Store the final index to the output variable. */ \
+	PASTEMAC(i,copys)( i_max_l, *i_max ); \
 }

 INSERT_GENTFUNCR_BASIC2( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
--- a/ref_kernels/1/bli_dotxv_ref.c
+++ b/ref_kernels/1/bli_dotxv_ref.c
@@ -66,7 +66,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		PASTEMAC(ch,scals)( *beta, *rho ); \
 	} \
 \
-	if ( bli_zero_dim1( n ) ) return; \
+	/* If the vectors are empty or if alpha is zero, return early. */ \
+	if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
 \
 	PASTEMAC(ch,set0s)( dotxy ); \
 \
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -712,6 +712,18 @@ void GENBAINAME(cntx_init)
 	{
 		const bool_t is_pb = FALSE;

+		// We MUST set the induced method in the context prior to calling
+		// bli_cntx_l3_ukr_prefers_cols_dt() because that function queries
+		// the induced method. It needs the induced method value in order
+		// to determine whether to evaluate the "prefers column storage"
+		// predicate using the storage preference of the kernel for dt, or
+		// the storage preference of the kernel for the real projection of
+		// dt. Failing to set the induced method here can lead to strange
+		// undefined behavior at runtime if the native complex kernel's
+		// storage preference happens to not equal that of the native real
+		// kernel.
+		bli_cntx_set_method( method, cntx );
+
 		// Initialize the blocksizes according to the micro-kernel preference as
 		// well as the algorithm.
 		if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,6 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2017, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -164,7 +165,9 @@ LDFLAGS        += -lgfortran -lm -lpthread -fopenmp
 #all: blis openblas atlas mkl
 all: blis openblas mkl

-blis: test_gemv_blis.x \
+blis: test_dotv_blis.x \
+      test_axpyv_blis.x \
+      test_gemv_blis.x \
      test_ger_blis.x \
      test_hemv_blis.x \
      test_her_blis.x \
@@ -179,7 +182,10 @@ blis: test_gemv_blis.x \
      test_trmm_blis.x \
      test_trsm_blis.x

-openblas: test_gemv_openblas.x \
+openblas: \
+      test_dotv_openblas.x \
+      test_axpyv_openblas.x \
+      test_gemv_openblas.x \
      test_ger_openblas.x \
      test_hemv_openblas.x \
      test_her_openblas.x \
@@ -194,7 +200,10 @@ openblas: test_gemv_openblas.x \
      test_trmm_openblas.x \
      test_trsm_openblas.x

-atlas: test_gemv_atlas.x \
+atlas: \
+      test_dotv_atlas.x \
+      test_axpyv_atlas.x \
+      test_gemv_atlas.x \
      test_ger_atlas.x \
      test_hemv_atlas.x \
      test_her_atlas.x \
@@ -209,7 +218,9 @@ atlas: test_gemv_atlas.x \
      test_trmm_atlas.x \
      test_trsm_atlas.x

-mkl:  test_gemv_mkl.x \
+mkl:  test_dotv_mkl.x \
+      test_axpyv_mkl.x \
+      test_gemv_mkl.x \
      test_ger_mkl.x \
      test_hemv_mkl.x \
      test_her_mkl.x \
@@ -224,7 +235,9 @@ mkl:  test_gemv_mkl.x \
      test_trmm_mkl.x \
      test_trsm_mkl.x

-essl: test_gemv_essl.x \
+essl: test_dotv_essl.x \
+      test_axpyv_essl.x \
+      test_gemv_essl.x \
      test_ger_essl.x \
      test_hemv_essl.x \
      test_her_essl.x \
@@ -239,7 +252,9 @@ essl: test_gemv_essl.x \
      test_trmm_essl.x \
      test_trsm_essl.x

-mac:  test_gemv_mac.x \
+mac:  test_dotv_mac.x \
+      test_axpyv_mac.x \
+      test_gemv_mac.x \
      test_ger_mac.x \
      test_hemv_mac.x \
      test_her_mac.x \
--- a/test/test_axpyv.c
+++ b/test/test_axpyv.c
@@ -0,0 +1,192 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+//                n    alpha     x      incx      y        incy
+//void daxpyv_( int*, double*, double*, int*, double*,   int* );
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t x, y;
+	obj_t y_save;
+	obj_t alpha;
+	dim_t n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input;
+	num_t dt_x, dt_y;
+	num_t dt_alpha;
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+#ifndef PRINT
+	p_begin = 40;
+	p_end   = 4000;
+	p_inc   = 40;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	n_input = 15;
+#endif
+
+#if 1
+	dt = BLIS_FLOAT;
+	//dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+
+	dt_x = dt_y = dt_alpha = dt;
+
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+
+		bli_obj_create( dt_x, n, 1, 0, 0, &x );
+		bli_obj_create( dt_y, n, 1, 0, 0, &y );
+		bli_obj_create( dt_y, n, 1, 0, 0, &y_save );
+
+		bli_randm( &x );
+		bli_randm( &y );
+
+		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+
+		bli_copym( &y, &y_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &y_save, &y );
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "alpha", &alpha, "%4.1f", "" );
+			bli_printm( "x", &x, "%4.1f", "" );
+			bli_printm( "y", &y, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_axpyv( &alpha,
+			           &x,
+			           &y );
+#else
+			if ( bli_is_float( dt ) )
+			{
+				f77_int  nn     = bli_obj_length( x );
+				f77_int  incx   = bli_obj_vector_inc( x );
+				f77_int  incy   = bli_obj_vector_inc( y );
+				float*  alphap = bli_obj_buffer( alpha );
+				float*  xp     = bli_obj_buffer( x );
+				float*  yp     = bli_obj_buffer( y );
+
+				saxpy_( &nn,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy );
+
+
+			}
+			else if ( bli_is_double( dt ) )
+			{
+
+				f77_int  nn     = bli_obj_length( x );
+				f77_int  incx   = bli_obj_vector_inc( x );
+				f77_int  incy   = bli_obj_vector_inc( y );
+				double*  alphap = bli_obj_buffer( alpha );
+				double*  xp     = bli_obj_buffer( x );
+				double*  yp     = bli_obj_buffer( y );
+
+				daxpy_( &nn,
+				        alphap,
+				        xp, &incx,
+				        yp, &incy );
+			}
+#endif
+
+#ifdef PRINT
+			bli_printm( "y after", &y, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0  * n ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_axpyv_blis" );
+#else
+		printf( "data_axpyv_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu  %10.3e  %6.3f ];\n",
+				( unsigned long )(p - p_begin + 1)/p_inc + 1,
+				( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &alpha );
+
+		bli_obj_free( &x );
+		bli_obj_free( &y );
+		bli_obj_free( &y_save );
+	}
+
+	bli_finalize();
+
+	return 0;
+}
--- a/test/test_dotv.c
+++ b/test/test_dotv.c
@@ -0,0 +1,174 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+//        res             n     x      incx      y        incy
+//double  res =  ddotv_( int*, double*, int*, double*,   int* );
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t x, y;
+	obj_t res;
+	dim_t n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input;
+	num_t dt_x, dt_y, dt_res;
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 10;
+
+#ifndef PRINT
+	p_begin = 40;
+	p_end   = 4000;
+	p_inc   = 40;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	n_input = 16;
+#endif
+
+#if 1
+	dt = BLIS_FLOAT;
+	//dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	dt_x = dt_y  = dt_res = dt;
+
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+
+		bli_obj_create( dt_x, n, 1, 0, 0, &x );
+		bli_obj_create( dt_y, n, 1, 0, 0, &y );
+		bli_obj_create( dt_res, 1, 1, 0, 0, &res );
+
+		bli_randm( &x );
+		bli_randm( &y );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "x", &x, "%4.1f", "" );
+			bli_printm( "y", &y, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+			bli_dotv( &x,
+			          &y,
+			          &res);
+#else
+			if ( bli_is_float( dt ) )
+			{
+				f77_int nn     = bli_obj_length( x );
+				f77_int incx   = bli_obj_vector_inc( x );
+				f77_int incy   = bli_obj_vector_inc( y );
+				float*  xp     = bli_obj_buffer( x );
+				float*  yp     = bli_obj_buffer( y );
+				float*  resp   = bli_obj_buffer( res );
+
+				*resp = sdot_( &nn,
+				               xp, &incx,
+				               yp, &incy );
+
+			}
+			else if ( bli_is_double( dt ) )
+			{
+
+				f77_int  nn     = bli_obj_length( x );
+				f77_int  incx   = bli_obj_vector_inc( x );
+				f77_int  incy   = bli_obj_vector_inc( y );
+				double*  xp     = bli_obj_buffer( x );
+				double*  yp     = bli_obj_buffer( y );
+				double*  resp   = bli_obj_buffer( res );
+
+				*resp = ddot_( &nn,
+				               xp, &incx,
+				               yp, &incy );
+			}
+#endif
+
+#ifdef PRINT
+			bli_printm( "res after", &res, "%4.1f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_dotv_blis" );
+#else
+		printf( "data_dotv_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu  %10.3e  %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &x );
+		bli_obj_free( &y );
+		bli_obj_free( &res );
+	}
+
+	bli_finalize();
+
+	return 0;
+}
--- a/testsuite/src/test_amaxv.c
+++ b/testsuite/src/test_amaxv.c
@@ -351,14 +351,19 @@ void PASTEMAC(ch,varname) \
 	ctype_r  chi1_i; \
 	ctype_r  abs_chi1; \
 	ctype_r  abs_chi1_max; \
+	dim_t    i_max_l; \
 	dim_t    i; \
-\
-	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( zero_i, *i_max ); \
 \
 	/* If the vector length is zero, return early. This directly emulates
 	   the behavior of netlib BLAS's i?amax() routines. */ \
-	if ( bli_zero_dim1( n ) ) return; \
+	if ( bli_zero_dim1( n ) ) \
+	{ \
+		PASTEMAC(i,copys)( *zero_i, *i_max ); \
+		return; \
+	} \
+\
+	/* Initialize the index of the maximum absolute value to zero. */ \
+	PASTEMAC(i,copys)( *zero_i, i_max_l ); \
 \
 	/* Initialize the maximum absolute value search candidate with
 	   -1, which is guaranteed to be less than all values we will
@@ -390,10 +395,13 @@ void PASTEMAC(ch,varname) \
 			if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
 			{ \
 				abs_chi1_max = abs_chi1; \
-				*i_max       = i; \
+				i_max_l       = i; \
 			} \
 		} \
 	} \
+\
+	/* Store the final index to the output variable. */ \
+	PASTEMAC(i,copys)( i_max_l, *i_max ); \
 }

 INSERT_GENTFUNCR_BASIC0( amaxv_test )
--- a/testsuite/src/test_dotxf.c
+++ b/testsuite/src/test_dotxf.c
@@ -40,10 +40,10 @@
 static char*     op_str                    = "dotxf";
 static char*     o_types                   = "mvv";  // A x y
 static char*     p_types                   = "cc";   // conjat conjx
-static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 },   // warn, pass for s
+static thresh_t  thresh[BLIS_NUM_FP_TYPES] = { { 5e-04, 5e-05 },   // warn, pass for s
                                               { 1e-04, 1e-05 },   // warn, pass for c
-                                               { 1e-13, 1e-14 },   // warn, pass for d
-                                               { 1e-13, 1e-14 } }; // warn, pass for z
+                                               { 1e-12, 1e-13 },   // warn, pass for d
+                                               { 1e-12, 1e-13 } }; // warn, pass for z

 // Local prototypes.
 void libblis_test_dotxf_deps