Squash-merge 'pr' into 'squash'. (#457)

Merged contributions from AMD's AOCL BLIS (#448). Details: - Added support for level-3 operation gemmt, which performs a gemm on only the lower or upper triangle of a square matrix C. For now, only the conventional/large code path will be supported (in vanilla BLIS). This was accomplished by leveraging the existing variant logic for herk. However, some of the infrastructure to support a gemmtsup is included in this commit, including: - A bli_gemmtsup() front-end, similar to bli_gemmsup(). - A bli_gemmtsup_ref() reference handler function. - A bli_gemmtsup_int() variant chooser function (with variant calls commented out). - Added support for inducing complex domain gemmt via the 1m method. - Added gemmt APIs to the BLAS and CBLAS compatibility layers. - Added gemmt test module to testsuite. - Added standalone gemmt test driver to 'test' directory. - Documented gemmt APIs in BLISObjectAPI.md and BLISTypedAPI.md. - Added a C++ template header (blis.hh) containing a BLAS-inspired wrapper to a set of polymorphic CBLAS-like function wrappers defined in another header (cblas.hh). These two headers are installed if running the 'install' target with INSTALL_HH set to 'yes'. (Also added a set of unit tests that exercise blis.hh, although they are disabled for now because they aren't compatible with out-of-tree builds.) These files now live in the 'vendor' top-level directory. - Various updates to 'zen' and 'zen2' subconfigurations, particularly within the context initialization functions. - Added s and d copyv, setv, and swapv kernels to kernels/zen/1, and various minor updates to dotv and scalv kernels. Also added various sup kernels contributed by AMD to kernels/zen/3. However, these kernels are (for now) not yet used, in part because they caused AppVeyor clang failures, and also because I have not found time to review and vet them. - Output the python found during configure into the definition of PYTHON in build/config.mk (via build/config.mk.in).
- Added early-return checks (A, B, or C with zero dimension; alpha = 0) to bli_gemm_front.c. - Implemented explicit beta = 0 handling for the sgemm ukernel in bli_gemm_armv7a_int_d4x4.c, which was previously missing. This latent bug surfaced because the gemmt module verifies its computation using gemm with its beta parameter set to zero, which, on a cortexa15 system, caused the gemm kernel code to unconditionally multiply the uninitialized C data by beta. The C matrix likely contained non-numeric values such as NaN, which then would have resulted in a false failure. - Fixed a bug whereby the implementation for bli_herk_determine_kc(), in bli_l3_blocksize.c, was inadvertently being defined in terms of helper functions meant for trmm. This bug was probably harmless since the trmm code should have also done the right thing for herk. - Used cpp macros to neutralize the various AOCL_DTL_TRACE_ macros in kernels/zen/3/bli_gemm_small.c since those macros are not used in vanilla BLIS. - Added cpp guard to definition of bli_mem_clear() in bli_mem.h to accommodate C++'s stricter type checking. - Added a cpp guard to test/*.c drivers that facilitates compilation on Windows systems. - Various whitespace changes.
2026-04-20 07:38:53 +00:00 · 2020-11-14 09:39:48 -06:00
parent 234b8b0cf4
commit 88ad841434
163 changed files with 106563 additions and 9683 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -43,7 +43,12 @@ include/*/*.h
 # -- misc. --

 # BLIS testsuite output file
-output.testsuite
+output.testsuite.*

 # BLAS test output files
 out.*
+
+# GTAGS database
+GPATH
+GRTAGS
+GTAGS
--- a/21
+++ b/21
@@ -249,6 +249,12 @@ ifeq ($(MK_ENABLE_CBLAS),yes)
 HEADERS_TO_INSTALL += $(CBLAS_H_FLAT)
 endif

+# If requested, include AMD's C++ template header files in the list of headers
+# to install.
+ifeq ($(INSTALL_HH),yes)
+HEADERS_TO_INSTALL += $(wildcard $(VEND_CPP_PATH)/*.hh)
+endif
+


 #
@@ -892,6 +898,19 @@ else
 	@- $(TESTSUITE_CHECK_PATH) $(TESTSUITE_OUT_FILE)
 endif

+
+# --- AMD's C++ template header test rules ---
+
+# NOTE: The targets below won't work as intended for an out-of-tree build,
+# and so they are disabled for now.
+
+#testcpp: testvendcpp
+
+# Recursively run the test for AMD's C++ template header.
+#testvendcpp:
+#	$(MAKE) -C $(VEND_TESTCPP_PATH)
+
+
 # --- Install header rules ---

 install-headers: check-env $(MK_INCL_DIR_INST)
@@ -1167,11 +1186,13 @@ ifeq ($(IS_CONFIGURED),yes)
 ifeq ($(ENABLE_VERBOSE),yes)
 	- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
 	- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
+#	- $(MAKE) -C $(VEND_TESTCPP_DIR) clean
 else
 	@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
 	@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
 	@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
 	@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
+#	@$(MAKE) -C $(VEND_TESTCPP_DIR) clean
 endif # ENABLE_VERBOSE
 endif # IS_CONFIGURED

--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -103,6 +103,9 @@ RANLIB            := @RANLIB@
 # Archiver.
 AR                := @AR@

+# Python Interpreter
+PYTHON            := @PYTHON@
+
 # Preset (required) CFLAGS and LDFLAGS. These variables capture the value
 # of the CFLAGS and LDFLAGS environment variables at configure-time (and/or
 # the value of CFLAGS/LDFLAGS if either was specified on the command line).
--- a/build/templates/license.c
+++ b/build/templates/license.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2019, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
--- a/build/templates/license.h
+++ b/build/templates/license.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2019, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
--- a/build/templates/license.sh
+++ b/build/templates/license.sh
@@ -5,6 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2019, The University of Texas at Austin
+#  Copyright (C) 2018, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
--- a/common.mk
+++ b/common.mk
@@ -299,6 +299,10 @@ INCLUDE_DIR        := include
 BLASTEST_DIR       := blastest
 TESTSUITE_DIR      := testsuite

+VEND_DIR           := vendor
+VEND_CPP_DIR       := $(VEND_DIR)/cpp
+VEND_TESTCPP_DIR   := $(VEND_DIR)/testcpp
+
 # The filename suffix for reference kernels.
 REFNM              := ref

@@ -358,6 +362,10 @@ REFKERN_PATH       := $(DIST_PATH)/$(REFKERN_DIR)
 KERNELS_PATH       := $(DIST_PATH)/$(KERNELS_DIR)
 SANDBOX_PATH       := $(DIST_PATH)/$(SANDBOX_DIR)

+# Construct paths to some optional C++ template headers contributed by AMD.
+VEND_CPP_PATH      := $(DIST_PATH)/$(VEND_CPP_DIR)
+VEND_TESTCPP_PATH  := $(DIST_PATH)/$(VEND_TESTCPP_DIR)
+
 # Construct paths to the makefile fragments for the four primary directories
 # of source code: the config directory, general framework code, reference
 # kernel code, and optimized kernel code.
--- a/config/cortexa15/bli_cntx_init_cortexa15.c
+++ b/config/cortexa15/bli_cntx_init_cortexa15.c
@@ -55,11 +55,19 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )

 	// Initialize level-3 blocksize objects with architecture-specific values.
 	//                                           s      d      c      z
-	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   336,   176,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   528,   368,     0,     0 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,     0,     0 );
+#if 1
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   336,   176,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   528,   368,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,    -1,    -1 );
+#else
+	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    -1,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     4,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   176,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   368,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4096,    -1,    -1 );
+#endif

 	// Update the context with the current architecture's register and cache
 	// blocksizes (and multiples) for native execution.
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -67,6 +67,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	  // gemmtrsm_l
 	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+
 	  // gemmtrsm_u
 	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
@@ -90,11 +91,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
-#if 1
+
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
-#endif
+
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
@@ -106,9 +107,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+
 	  // scalv
 #if 0
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
--- a/config/zen/amd_config.mk
+++ b/config/zen/amd_config.mk
@@ -60,10 +60,8 @@ ifeq ($(CC_VENDOR),gcc)
 CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma
 else
 ifeq ($(CC_VENDOR),clang)
-#CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
 CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma
-# When compiling with AOCC, add these flags to the default flags set above.
-ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM.2.0.0')),1)
+ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM')),1)
 CKVECFLAGS     += -mllvm -disable-licm-vrp
 endif
 else
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -52,27 +52,43 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_cntx_set_l3_nat_ukrs
 	(
 	  8,
+
 	  // gemm
 	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
 	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
 	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
 	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+
 	  // gemmtrsm_l
 	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
+
 	  // gemmtrsm_u
 	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
 	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
 	  cntx
 	);

+#if 0
+	// Update the context with optimized level-1m (packm) kernels.
+	bli_cntx_set_packm_kers
+	(
+	  2,
+	  BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
+	  BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
+	  cntx
+	);
+#endif
+
 	// Update the context with optimized level-1f kernels.
 	bli_cntx_set_l1f_kers
 	(
 	  4,
+
 	  // axpyf
 	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
 	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -83,11 +99,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_cntx_set_l1v_kers
 	(
 	  10,
-#if 1
+
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
-#endif
+
 	  // axpyv
 #if 0
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
@@ -96,12 +112,21 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
 #endif
+
+#if 0
+	  // copyv
+	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
+	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+#endif
+
 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+
 	  // scalv
 #if 0
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
@@ -110,6 +135,16 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
+
+#if 0
+	  // setv
+	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
+	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+	  // swapv
+	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
+#endif
 	  cntx
 	);

@@ -125,29 +160,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	a)  If BLIS is run in a multi-instance mode with
 	    CPU freq 2.6/2.2 Ghz
 	    DDR4 clock frequency 2400Mhz
-          mc = 240, kc = 512, and nc = 2040
+	    mc = 240, kc = 512, and nc = 2040
 	    has better performance on EPYC server, over the default block sizes.

 	b)  If BLIS is run in Single Instance mode
-	      mc = 510, kc = 1024 and nc = 4080
+	    mc = 510, kc = 1024 and nc = 4080
 */

-#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
-	// Zen optmized level 3 cache block sizes
 	#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1020,   510,   510,   255 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1024,  1024,  1024,  1024 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   510,   144,    72 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,  1024,   256,   256 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
 	#else
 	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   240,   144,    72 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   512,   256,   256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  2040,  2040,  1528 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  2040,  4080,  4080 );
 	#endif
-#else
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
-	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
-#endif
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

@@ -171,10 +199,10 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	// -------------------------------------------------------------------------

 	// Initialize sup thresholds with architecture-appropriate values.
-	//                                          s     d     c     z
-	bli_blksz_init_easy( &thresh[ BLIS_MT ],  256,  256,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_NT ],  256,  256,   -1,   -1 );
-	bli_blksz_init_easy( &thresh[ BLIS_KT ],  220,  220,   -1,   -1 );
+	//                                           s      d      c      z
+	bli_blksz_init_easy( &thresh[ BLIS_MT ],   512,   256,    -1,    -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_NT ],   512,   256,    -1,    -1 );
+	bli_blksz_init_easy( &thresh[ BLIS_KT ],   440,   220,    -1,    -1 );

 	// Initialize the context with the sup thresholds.
 	bli_cntx_set_l3_sup_thresh
@@ -186,15 +214,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  cntx
 	);

-#if 0
 	// Initialize the context with the sup handlers.
 	bli_cntx_set_l3_sup_handlers
 	(
 	  1,
 	  BLIS_GEMM, bli_gemmsup_ref,
+	  //BLIS_GEMMT, bli_gemmtsup_ref,
 	  cntx
 	);
-#endif

 	// Update the context with optimized small/unpacked gemm kernels.
 	bli_cntx_set_l3_sup_kers
@@ -218,6 +245,33 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
 	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
 	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
+#if 0
+	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
+	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
+	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+#endif
+
+#if 0
+	  // NOTE: This set of kernels is likely broken and therefore disabled.
+	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+
+	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+#endif
 	  cntx
 	);

@@ -227,9 +281,17 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,    -1,    -1,
 	                                             9,     9,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   168,    72,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,    -1,    -1 );
-	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,    -1,    -1 );
+#if 0
+	bli_blksz_init     ( &blkszs[ BLIS_MR ],     6,     6,     3,     3,
+	                                             9,     9,     3,     3 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   512,   256,   128,    64 );
+	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  2040,  1020 );
+#endif

 	// Update the context with the current architecture's register and cache
 	// blocksizes for small/unpacked level-3 problems.
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -65,6 +65,17 @@

 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22

+#if 0
+// Allow the sup implementation to combine some small edge case iterations in
+// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
+// block-panel algorithm (NR) with the last full iteration that precedes it.
+// NOTE: These cpp macros need to be explicitly set to an integer since they
+// are used at compile-time to create unconditional branches or dead code
+// regions.
+#define BLIS_ENABLE_SUP_MR_EXT 1
+#define BLIS_ENABLE_SUP_NR_EXT 0
+#endif
+

 //#endif

--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -64,13 +64,24 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  cntx
 	);

+#if 0
+	// Update the context with optimized level-1m (packm) kernels.
+	bli_cntx_set_packm_kers
+	(
+	  2,
+	  BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
+	  BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
+	  cntx
+	);
+#endif
+
 	// Update the context with optimized level-1f kernels.
 	bli_cntx_set_l1f_kers
 	(
 	  4,
 	  // axpyf
-	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
-	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
+	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_5,
+	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_5,
 	  // dotxf
 	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
 	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -80,28 +91,39 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  10,
-#if 1
+	  16,
+
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
-#endif
-	  // axpyv

+	  // axpyv
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,

 	  // dotv
-	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
-	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
+	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
+	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
+
 	  // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
-	  // scalv

+	  // scalv
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,

+	  // swapv
+	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,
+
+	  // copyv
+	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
+	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+
+	  // setv
+	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
+	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
 	  cntx
 	);

@@ -119,7 +141,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
 #endif

-	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
+	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

 	// Update the context with the current architecture's register and cache
@@ -195,6 +217,33 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
 	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
 	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
+#if 0
+	  BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
+	  BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+	  BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
+	  BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+#endif
+
+#if 0
+	  // NOTE: This set of kernels is likely broken and therefore disabled.
+	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
+
+	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+#endif
 	  cntx
 	);

--- a/config/zen2/bli_family_zen2.h
+++ b/config/zen2/bli_family_zen2.h
@@ -60,11 +60,27 @@
 #define BLIS_ENABLE_SMALL_MATRIX_ROME
 #define BLIS_SMALL_MATRIX_THRES_ROME       400

-#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ROME 120
-#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 60
-#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10

-#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30
+
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
+#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50

 // When running HPL with pure MPI without DGEMM threading (Single-threaded
 // BLIS), defining this macro as 1 yields better performance.
--- a/8
+++ b/8
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+#  Copyright (C) 2020, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -1363,6 +1363,9 @@ get_compiler_version()
 	if [ "${cc_vendor}" = "icc" -o \
 	     "${cc_vendor}" = "gcc" ]; then
 		cc_version=$(${cc} -dumpversion)
+	# If the compiler is clang (e.g. AOCC), first grep for the 'clang version' string and then extract the version number from it.
+	elif [ "${cc_vendor}" = "clang" ]; then
+		cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')
 	elif [ "${cc_vendor}" = "oneAPI" ]; then
 		# Treat Intel oneAPI's clang as clang, not icc.
 		cc_vendor="clang"
@@ -3107,6 +3110,7 @@ main()
 	dist_path_esc=$(echo   "${dist_path}"   | sed 's/\//\\\//g')
 	cc_esc=$(echo          "${found_cc}"    | sed 's/\//\\\//g')
 	cxx_esc=$(echo         "${found_cxx}"   | sed 's/\//\\\//g')
+	python_esc=$(echo      "${found_python}"    | sed 's/\//\\\//g')
 	#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')

 	# For RANLIB, if the variable is not set, we use a default value of
@@ -3211,6 +3215,7 @@ main()
 		| sed -e "s/@CXX@/${cxx_esc}/g" \
 		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
 		| sed -e "s/@AR@/${ar_esc}/g" \
+		| sed -e "s/@PYTHON@/${python_esc}/g" \
 		| sed -e "s/@libpthread@/${libpthread_esc}/g" \
 		| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
 		| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
@@ -3311,7 +3316,6 @@ main()
 	echo "${script_name}: creating ${obj_frame_dirpath}"
 	mkdir -p ${obj_frame_dirpath}

-
 	if [ -n "${sandbox_flag}" ]; then

 		obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
--- a/docs/BLISObjectAPI.md
+++ b/docs/BLISObjectAPI.md
@@ -1681,6 +1681,27 @@ Observed object properties: `trans?(A)`, `trans?(B)`.

 ---

+#### gemmt
+```c
+void bli_gemmt
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c
+     );
+```
+Perform
+```
+  C := beta * C + alpha * trans?(A) * trans?(B)
+```
+where `C` is an _m x m_ matrix, `trans?(A)` is an _m x k_ matrix, and `trans?(B)` is a _k x m_ matrix. This operation is similar to `bli_gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uplo(C)`.
+
+Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
+
+---
+
 #### hemm
 ```c
 void bli_hemm
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -1213,6 +1213,30 @@ where C is an _m x n_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`

 ---

+#### gemmt
+```c
+void bli_?gemmt
+     (
+       uplo_t  uploc,
+       trans_t transa,
+       trans_t transb,
+       dim_t   m,
+       dim_t   k,
+       ctype*  alpha,
+       ctype*  a, inc_t rsa, inc_t csa,
+       ctype*  b, inc_t rsb, inc_t csb,
+       ctype*  beta,
+       ctype*  c, inc_t rsc, inc_t csc
+     );
+```
+Perform
+```
+  C := beta * C + alpha * transa(A) * transb(B)
+```
+where C is an _m x m_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)` is a _k x m_ matrix. This operation is similar to `bli_?gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uploc`.
+
+---
+
 #### hemm
 ```c
 void bli_?hemm
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -97,4 +97,4 @@
 #include "bli_trmm.h"
 #include "bli_trmm3.h"
 #include "bli_trsm.h"
-
+#include "bli_gemmt.h"
--- a/frame/3/bli_l3_blocksize.c
+++ b/frame/3/bli_l3_blocksize.c
@@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \
 }

 GENFRONT( gemm_determine_kc, gemm )
-GENFRONT( herk_determine_kc, trmm )
+GENFRONT( herk_determine_kc, herk )
 GENFRONT( trmm_determine_kc, trmm )
 GENFRONT( trsm_determine_kc, trsm )

--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -63,6 +63,28 @@ void bli_gemm_check
 	//bli_check_error_code( e_val );
 }

+void bli_gemmt_check
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx
+     )
+{
+	err_t e_val;
+
+	// Check basic properties of the operation.
+
+	bli_gemmt_basic_check( alpha, a, b, beta, c, cntx );
+
+	// Check matrix squareness.
+
+	e_val = bli_check_square_object( c );
+	bli_check_error_code( e_val );
+}
+
 void bli_hemm_check
     (
       side_t  side,
@@ -324,6 +346,28 @@ void bli_gemm_basic_check
 #endif
 }

+void bli_gemmt_basic_check
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx
+     )
+{
+	err_t e_val;
+
+	// Perform standard checks.
+
+	bli_l3_basic_check( alpha, a, b, beta, c, cntx );
+
+	// Check object dimensions.
+
+	e_val = bli_check_level3_dims( a, b, c );
+	bli_check_error_code( e_val );
+}
+
 void bli_hemm_basic_check
     (
       side_t  side,
--- a/frame/3/bli_l3_check.h
+++ b/frame/3/bli_l3_check.h
@@ -51,6 +51,7 @@ void PASTEMAC(opname,_check) \
    );

 GENPROT( gemm )
+GENPROT( gemmt )
 GENPROT( her2k )
 GENPROT( syr2k )

@@ -103,6 +104,16 @@ void bli_gemm_basic_check
       cntx_t* cntx
     );

+void bli_gemmt_basic_check
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx
+     );
+
 void bli_hemm_basic_check
     (
       side_t  side,
--- a/frame/3/bli_l3_oapi.c
+++ b/frame/3/bli_l3_oapi.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -71,7 +71,10 @@ void PASTEMAC(opname,EX_SUF) \
 		   the function returns with BLIS_FAILURE, which causes execution to
 		   proceed towards the conventional implementation. */ \
 		err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
-		if ( result == BLIS_SUCCESS ) return; \
+		if ( result == BLIS_SUCCESS ) \
+		{ \
+			return; \
+		} \
 	} \
 \
 	/* Only proceed with an induced method if each of the operands have a
@@ -101,6 +104,75 @@ void PASTEMAC(opname,EX_SUF) \
 GENFRONT( gemm )


+#undef  GENFRONT
+#define GENFRONT( opname ) \
+\
+void PASTEMAC(opname,EX_SUF) \
+     ( \
+       obj_t*  alpha, \
+       obj_t*  a, \
+       obj_t*  b, \
+       obj_t*  beta, \
+       obj_t*  c  \
+       BLIS_OAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_OAPI_EX_DECLS \
+\
+	/* If the rntm is non-NULL, it may indicate that we should forgo sup
+	   handling altogether. */ \
+	/*
+	bool enable_sup = TRUE; \
+	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
+	*/ \
+\
+	/* NOTE: The sup handling for gemmt is disabled here because gemmtsup
+	   is not yet fully implemented. */ \
+	/*
+	if ( enable_sup ) \
+	{ \
+	*/ \
+		/* Execute the small/unpacked oapi handler. If it finds that the problem
+		   does not fall within the thresholds that define "small", or for some
+		   other reason decides not to use the small/unpacked implementation,
+		   the function returns with BLIS_FAILURE, which causes execution to
+		   proceed towards the conventional implementation. */ \
+	/*
+		err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
+		if ( result == BLIS_SUCCESS ) \
+		{ \
+			return; \
+		} \
+	} \
+	*/ \
+\
+	/* Only proceed with an induced method if each of the operands have a
+	   complex storage datatype. NOTE: Allowing precisions to vary while
+	   using 1m, which is what we do here, is unique to gemm; other level-3
+	   operations use 1m only if all storage datatypes are equal (and they
+	   ignore the computation precision). If any operands are real, skip the
+	   induced method chooser function and proceed directly with native
+	   execution. */ \
+	if ( bli_obj_is_complex( c ) && \
+	     bli_obj_is_complex( a ) && \
+	     bli_obj_is_complex( b ) ) \
+	{ \
+		/* FIXME: BLIS does not yet support induced methods for gemmt. Thus,
+		   we call the native implementation code path for now. */ \
+		/*PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm );*/ \
+		PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
+	} \
+	else \
+	{ \
+		PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
+	} \
+}
+
+GENFRONT( gemmt )
+
+
 #undef  GENFRONT
 #define GENFRONT( opname ) \
 \
--- a/frame/3/bli_l3_oapi.h
+++ b/frame/3/bli_l3_oapi.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -51,6 +52,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     );

 GENPROT( gemm )
+GENPROT( gemmt )
 GENPROT( her2k )
 GENPROT( syr2k )

--- a/frame/3/bli_l3_oft.h
+++ b/frame/3/bli_l3_oft.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -57,6 +58,7 @@ typedef void (*PASTECH(opname,_oft)) \
 );

 GENTDEF( gemm )
+GENTDEF( gemmt )
 GENTDEF( her2k )
 GENTDEF( syr2k )

--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -132,3 +132,72 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
 }


+err_t bli_gemmtsup
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// Return early if small matrix handling is disabled at configure-time.
+	#ifdef BLIS_DISABLE_SUP_HANDLING
+	return BLIS_FAILURE;
+	#endif
+
+	// Return early if this is a mixed-datatype computation.
+	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
+	     bli_obj_dt( c ) != bli_obj_dt( b ) ||
+	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE;
+
+	// Obtain a valid (native) context from the gks if necessary.
+	// NOTE: This must be done before calling the _check() function, since
+	// that function assumes the context pointer is valid.
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
+
+	// Return early if the problem dimensions exceed their sup thresholds.
+	// Notice that we do not bother to check whether the microkernel
+	// prefers or dislikes the storage of C, since C is square (m x m) and
+	// the same threshold check would be performed either way.
+	{
+		const num_t dt = bli_obj_dt( c );
+		const dim_t m  = bli_obj_length( c );
+		const dim_t k  = bli_obj_width_after_trans( a );
+
+		if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) )
+			return BLIS_FAILURE;
+	}
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// We've now ruled out the possibility that the sup thresholds are
+	// unsatisfied.
+	// This implies that the sup thresholds (at least one of them) are met,
+	// and the small/unpacked handler should be called.
+	// NOTE: The sup handler is free to enforce a stricter threshold regime
+	// if it so chooses, in which case it can/should return BLIS_FAILURE.
+
+	// Query the small/unpacked handler from the context and invoke it.
+	gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );
+
+	return
+	gemmtsup_fp
+	(
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm
+	);
+}
+
+
--- a/frame/3/bli_l3_sup.h
+++ b/frame/3/bli_l3_sup.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -43,3 +43,14 @@ err_t bli_gemmsup
       rntm_t* rntm
     );

+err_t bli_gemmtsup
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     );
+
--- a/frame/3/bli_l3_sup_int.c
+++ b/frame/3/bli_l3_sup_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -80,7 +80,10 @@ err_t bli_gemmsup_int

 	// Don't use the small/unpacked implementation if one of the matrices
 	// uses general stride.
-	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
+	if ( stor_id == BLIS_XXX )
+	{
+		return BLIS_FAILURE;
+	}

 	const bool    is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
 	                                     stor_id == BLIS_RRC ||
@@ -240,3 +243,192 @@ err_t bli_gemmsup_int
 	return BLIS_SUCCESS;
 }

+// -----------------------------------------------------------------------------
+
+err_t bli_gemmtsup_int
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     )
+{
+	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+	// Don't use the small/unpacked implementation if one of the matrices
+	// uses general stride.
+	if ( stor_id == BLIS_XXX )
+	{
+		return BLIS_FAILURE;
+	}
+
+	const bool    is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
+	                                     stor_id == BLIS_RRC ||
+	                                     stor_id == BLIS_RCR ||
+	                                     stor_id == BLIS_CRR );
+	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
+
+	const num_t   dt         = bli_obj_dt( c );
+	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+
+	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
+	                                      : is_rcc_crc_ccr_ccc );
+
+	const dim_t  m           = bli_obj_length( c );
+	const dim_t  n           = m;
+	const dim_t  MR          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t  NR          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+	const bool   auto_factor = bli_rntm_auto_factor( rntm );
+	const dim_t  n_threads   = bli_rntm_num_threads( rntm );
+	bool         use_bp      = TRUE;
+	dim_t        jc_new;
+	dim_t        ic_new;
+
+
+	if ( is_primary )
+	{
+		// This branch handles:
+		//  - rrr rrc rcr crr for row-preferential kernels
+		//  - rcc crc ccr ccc for column-preferential kernels
+
+		const dim_t mu = m / MR;
+		const dim_t nu = n / NR;
+
+		// Decide which algorithm to use (block-panel var2m or panel-block
+		// var1n) based on the number of micropanels in the m and n dimensions.
+		// Also, recalculate the automatic thread factorization.
+		if         ( mu >= nu )    use_bp = TRUE;
+		else /* if ( mu <  nu ) */ use_bp = FALSE;
+
+		// If the parallel thread factorization was automatic, we update it
+		// with a new factorization based on the matrix dimensions in units
+		// of micropanels.
+		if ( auto_factor )
+		{
+			if ( use_bp )
+			{
+				// In the block-panel algorithm, the m dimension is parallelized
+				// with ic_nt and the n dimension is parallelized with jc_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+			}
+			else // if ( !use_bp )
+			{
+				// In the panel-block algorithm, the m dimension is parallelized
+				// with jc_nt and the n dimension is parallelized with ic_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
+			}
+
+			// Update the ways of parallelism for the jc and ic loops, and then
+			// update the current thread's root thrinfo_t node according to the
+			// new ways of parallelism value for the jc loop.
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+			bli_l3_sup_thrinfo_update_root( rntm, thread );
+		}
+
+
+		if ( use_bp )
+		{
+			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
+			printf( "bli_l3_sup_int(): var2m primary\n" );
+			#endif
+			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
+#if 0
+			bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
+			                        alpha, a, b, beta, c,
+			                        stor_id, cntx, rntm, thread );
+#endif
+		}
+		else // use_pb
+		{
+			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
+			printf( "bli_l3_sup_int(): var1n primary\n" );
+			#endif
+			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
+#if 0
+			bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
+			                        alpha, a, b, beta, c,
+			                        stor_id, cntx, rntm, thread );
+#endif
+			// *requires nudging of nc up to be a multiple of mr.
+		}
+	}
+	else
+	{
+		// This branch handles:
+		//  - rrr rrc rcr crr for column-preferential kernels
+		//  - rcc crc ccr ccc for row-preferential kernels
+
+		const dim_t mu = n / MR; // the n becomes m after a transposition
+		const dim_t nu = m / NR; // the m becomes n after a transposition
+
+		// Decide which algorithm to use (block-panel var2m or panel-block
+		// var1n) based on the number of micropanels in the m and n dimensions.
+		// Also, recalculate the automatic thread factorization.
+		if         ( mu >= nu )    use_bp = TRUE;
+		else /* if ( mu <  nu ) */ use_bp = FALSE;
+
+		// If the parallel thread factorization was automatic, we update it
+		// with a new factorization based on the matrix dimensions in units
+		// of micropanels.
+		if ( auto_factor )
+		{
+			if ( use_bp )
+			{
+				// In the block-panel algorithm, the m dimension is parallelized
+				// with ic_nt and the n dimension is parallelized with jc_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
+			}
+			else // if ( !use_bp )
+			{
+				// In the panel-block algorithm, the m dimension is parallelized
+				// with jc_nt and the n dimension is parallelized with ic_nt.
+				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
+			}
+
+			// Update the ways of parallelism for the jc and ic loops, and then
+			// update the current thread's root thrinfo_t node according to the
+			// new ways of parallelism value for the jc loop.
+			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
+			bli_l3_sup_thrinfo_update_root( rntm, thread );
+		}
+
+
+		if ( use_bp )
+		{
+			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
+			printf( "bli_l3_sup_int(): var2m non-primary\n" );
+			#endif
+			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
+#if 0
+			bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
+			                        alpha, a, b, beta, c,
+			                        stor_id, cntx, rntm, thread );
+#endif
+		}
+		else // use_pb
+		{
+			#ifdef TRACEVAR
+			if ( bli_thread_am_ochief( thread ) )
+			printf( "bli_l3_sup_int(): var1n non-primary\n" );
+			#endif
+			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
+#if 0
+			bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
+			                        alpha, a, b, beta, c,
+			                        stor_id, cntx, rntm, thread );
+#endif
+			// *requires nudging of mc up to be a multiple of nr.
+		}
+	}
+
+	// Return success to the caller. NOTE: the variant calls above are disabled.
+	return BLIS_SUCCESS;
+}
+
--- a/frame/3/bli_l3_sup_int.h
+++ b/frame/3/bli_l3_sup_int.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -43,3 +43,15 @@ err_t bli_gemmsup_int
       rntm_t* rntm,
       thrinfo_t* thread
     );
+
+err_t bli_gemmtsup_int
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
--- a/frame/3/bli_l3_sup_oft.h
+++ b/frame/3/bli_l3_sup_oft.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -57,6 +57,6 @@ typedef err_t (*PASTECH(opname,_oft)) \
 );

 GENTDEF( gemmsup )
-
+GENTDEF( gemmtsup )
 #endif

--- a/frame/3/bli_l3_sup_ref.c
+++ b/frame/3/bli_l3_sup_ref.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -106,3 +106,69 @@ err_t bli_gemmsup_ref
 	);
 }

+// -----------------------------------------------------------------------------
+
+err_t bli_gemmtsup_ref
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	// This function implements the default gemmtsup handler. If you are a
+	// BLIS developer and wish to use a different gemmtsup handler, please
+	// register a different function pointer in the context in your
+	// sub-configuration's bli_cntx_init_*() function.
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemmt_check( alpha, a, b, beta, c, cntx );
+
+#if 0
+	// NOTE: This special case handling is done within the variants.
+
+	// If alpha is zero, scale by beta and return.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
+	// If A or B has a zero dimension, scale C by beta and return early.
+	if ( bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return BLIS_SUCCESS;
+	}
+#endif
+
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop.
+	bli_rntm_set_ways_from_rntm_sup
+	(
+	  bli_obj_length( c ),
+	  bli_obj_width( c ),
+	  bli_obj_width( a ),
+	  rntm
+	);
+
+	return
+	bli_l3_sup_thread_decorator
+	(
+	  bli_gemmtsup_int,
+	  BLIS_GEMMT, // operation family id
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c,
+	  cntx,
+	  rntm
+	);
+}
+
--- a/frame/3/bli_l3_sup_ref.h
+++ b/frame/3/bli_l3_sup_ref.h
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -43,3 +43,14 @@ err_t bli_gemmsup_ref
       rntm_t* rntm
     );

+err_t bli_gemmtsup_ref
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm
+     );
+
--- a/frame/3/bli_l3_tapi.c
+++ b/frame/3/bli_l3_tapi.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -100,7 +100,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 }

 INSERT_GENTFUNC_BASIC0( gemm )
-
+INSERT_GENTFUNC_BASIC0( gemmt )

 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname, struca ) \
--- a/frame/3/bli_l3_tapi.h
+++ b/frame/3/bli_l3_tapi.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -56,7 +57,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     );

 INSERT_GENTPROT_BASIC0( gemm )
-
+INSERT_GENTPROT_BASIC0( gemmt )

 #undef  GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -53,6 +53,26 @@ void bli_gemm_front
 	obj_t   b_local;
 	obj_t   c_local;

+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) )
+	{
+		return;
+	}
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
 #if 0
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 	// Only handle small problems separately for homogeneous datatypes.
@@ -60,23 +80,12 @@ void bli_gemm_front
 	     bli_obj_dt( a ) == bli_obj_dt( c ) &&
 	     bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
 	{
-		gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
+		err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
 		if ( status == BLIS_SUCCESS ) return;
 	}
 #endif
 #endif

-	// Check parameters.
-	if ( bli_error_checking_is_enabled() )
-		bli_gemm_check( alpha, a, b, beta, c, cntx );
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
 	// Alias A, B, and C in case we need to apply transformations.
 	bli_obj_alias_to( a, &a_local );
 	bli_obj_alias_to( b, &b_local );
--- a/frame/3/gemm/bli_gemm_int.c
+++ b/frame/3/gemm/bli_gemm_int.c
@@ -58,15 +58,18 @@ void bli_gemm_int
 		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

 	// If C has a zero dimension, return early.
-	if ( bli_obj_has_zero_dim( c ) ) return;
+	if ( bli_obj_has_zero_dim( c ) )
+	{
+		return;
+	}

 	// If A or B has a zero dimension, scale C by beta and return early.
 	if ( bli_obj_has_zero_dim( a ) ||
 	     bli_obj_has_zero_dim( b ) )
 	{
-        if ( bli_thread_am_ochief( thread ) )
-		    bli_scalm( beta, c );
-        bli_thread_barrier( thread );
+		if ( bli_thread_am_ochief( thread ) )
+			bli_scalm( beta, c );
+		bli_thread_barrier( thread );
 		return;
 	}

@@ -78,9 +81,9 @@ void bli_gemm_int
 		// This should never execute.
 		bli_abort();

-        if ( bli_thread_am_ochief( thread ) )
-		    bli_scalm( beta, c );
-        bli_thread_barrier( thread );
+		if ( bli_thread_am_ochief( thread ) )
+			bli_scalm( beta, c );
+		bli_thread_barrier( thread );
 		return;
 	}

@@ -93,14 +96,14 @@ void bli_gemm_int
 	// to B.
 	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
 	{
-        bli_obj_scalar_apply_scalar( alpha, &b_local );
+		bli_obj_scalar_apply_scalar( alpha, &b_local );
 	}

 	// If beta is non-unit, typecast and apply it to the scalar attached
 	// to C.
 	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
 	{
-        bli_obj_scalar_apply_scalar( beta, &c_local );
+		bli_obj_scalar_apply_scalar( beta, &c_local );
 	}

 	// Create the next node in the thrinfo_t structure.
@@ -129,7 +132,7 @@ void bli_gemm_int
 	  cntx,
 	  rntm,
 	  cntl,
-      thread
+	  thread
 	);
 }

--- a/frame/3/gemmt/bli_gemmt.h
+++ b/frame/3/gemmt/bli_gemmt.h
@@ -0,0 +1,36 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_gemmt_front.h"
+
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -0,0 +1,142 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_gemmt_front
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     )
+{
+	bli_init_once();
+
+	obj_t   a_local;
+	obj_t   b_local;
+	obj_t   c_local;
+
+	// Check parameters.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemmt_check( alpha, a, b, beta, c, cntx );
+
+	// If C has a zero dimension, return early.
+	if ( bli_obj_has_zero_dim( c ) )
+	{
+		return;
+	}
+
+	// If alpha is zero, or if A or B has a zero dimension, scale C by beta
+	// and return early.
+	if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
+	     bli_obj_has_zero_dim( a ) ||
+	     bli_obj_has_zero_dim( b ) )
+	{
+		bli_scalm( beta, c );
+		return;
+	}
+
+	// Alias A, B, and C in case we need to apply transformations.
+	bli_obj_alias_to( a, &a_local );
+	bli_obj_alias_to( b, &b_local );
+	bli_obj_alias_to( c, &c_local );
+	bli_obj_set_as_root( &c_local );
+
+	// An optimization: If C is stored by rows and the micro-kernel prefers
+	// contiguous columns, or if C is stored by columns and the micro-kernel
+	// prefers contiguous rows, transpose the entire operation to allow the
+	// micro-kernel to access elements of C in its preferred manner.
+	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
+	{
+		bli_obj_swap( &a_local, &b_local );
+
+		bli_obj_induce_trans( &a_local );
+		bli_obj_induce_trans( &b_local );
+		bli_obj_induce_trans( &c_local );
+	}
+
+	// Parse and interpret the contents of the rntm_t object to properly
+	// set the ways of parallelism for each loop, and then make any
+	// additional modifications necessary for the current operation.
+	bli_rntm_set_ways_for_op
+	(
+	  BLIS_GEMM,
+	  BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt
+	  bli_obj_length( &c_local ),
+	  bli_obj_width( &c_local ),
+	  bli_obj_width( &a_local ),
+	  rntm
+	);
+
+	// A sort of hack for communicating the desired pack schemas for A and B
+	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
+	// bli_l3_cntl_create_if()). This allows us to access the schemas from
+	// the control tree, which hopefully reduces some confusion, particularly
+	// in bli_packm_init().
+	if ( bli_cntx_method( cntx ) == BLIS_NAT )
+	{
+		bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
+		bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
+	}
+	else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
+	{
+		pack_t schema_a = bli_cntx_schema_a_block( cntx );
+		pack_t schema_b = bli_cntx_schema_b_panel( cntx );
+
+		bli_obj_set_pack_schema( schema_a, &a_local );
+		bli_obj_set_pack_schema( schema_b, &b_local );
+	}
+
+	// Invoke the internal back-end via the thread handler.
+	bli_l3_thread_decorator
+	(
+	  bli_gemm_int,
+	  BLIS_HERK, // operation family id (gemmt uses 'herk' family)
+	  alpha,
+	  &a_local,
+	  &b_local,
+	  beta,
+	  &c_local,
+	  cntx,
+	  rntm,
+	  cntl
+	);
+}
+
--- a/frame/3/gemmt/bli_gemmt_front.h
+++ b/frame/3/gemmt/bli_gemmt_front.h
@@ -0,0 +1,46 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_gemmt_front
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl
+     );
--- a/frame/base/bli_error.h
+++ b/frame/base/bli_error.h
@@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void     bli_error_checking_level_set( errlev_t new_level );

 BLIS_EXPORT_BLIS bool     bli_error_checking_is_enabled( void );

-void      bli_print_msg( char* str, char* file, guint_t line );
-BLIS_EXPORT_BLIS void      bli_abort( void );
+void                      bli_print_msg( char* str, char* file, guint_t line );
+BLIS_EXPORT_BLIS void     bli_abort( void );

-char*     bli_error_string_for_code( gint_t code );
+char*                     bli_error_string_for_code( gint_t code );

--- a/frame/base/bli_mem.h
+++ b/frame/base/bli_mem.h
@@ -147,7 +147,14 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
 BLIS_INLINE void bli_mem_clear( mem_t* mem )
 {
 	bli_mem_set_buffer( NULL, mem );
+#ifdef __cplusplus
+	const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE;
+	// C++ builds record BLIS_BUFFER_FOR_GEN_USE here instead of -1: C++ is
+	// strongly typed, and using -1 as a packbuf_t is a compile-time error.
+	bli_mem_set_buf_type( pb, mem );
+#else // C builds keep storing ( packbuf_t )-1 as the cleared buf_type.
 	bli_mem_set_buf_type( ( packbuf_t )-1, mem );
+#endif // __cplusplus
 	bli_mem_set_pool( NULL, mem );
 	bli_mem_set_size( 0, mem );
 }
--- a/frame/compat/bla_gemmt.c
+++ b/frame/compat/bla_gemmt.c
@@ -0,0 +1,234 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+//
+// Define BLAS-to-BLIS interfaces.
+//
+
+#ifdef BLIS_BLAS3_CALLS_TAPI
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+/* BLAS ?gemmt wrapper for builds where BLAS level-3 calls use the typed API. */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	uplo_t  blis_uploc; \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, k0; \
+	inc_t   rs_a, cs_a; \
+	inc_t   rs_b, cs_b; \
+	inc_t   rs_c, cs_c; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking. */ \
+	PASTEBLACHK(blasname) \
+	( \
+	  MKSTR(ch), \
+	  MKSTR(blasname), \
+	  uploc, \
+	  transa, \
+	  transb, \
+	  m, \
+	  k, \
+	  lda, \
+	  ldb, \
+	  ldc  \
+	); \
+\
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+	bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
+	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
+\
+	/* Typecast BLAS integers to BLIS integers. */ \
+	bli_convert_blas_dim1( *m, m0 ); \
+	bli_convert_blas_dim1( *k, k0 ); \
+\
+	/* Set the strides of the matrix operands: unit row stride, with the leading dimension as column stride. */ \
+	rs_a = 1; \
+	cs_a = *lda; \
+	rs_b = 1; \
+	cs_b = *ldb; \
+	rs_c = 1; \
+	cs_c = *ldc; \
+\
+	/* Call BLIS interface. */ \
+	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	( \
+	  blis_uploc, \
+	  blis_transa, \
+	  blis_transb, \
+	  m0, \
+	  k0, \
+	  (ftype*)alpha, \
+	  (ftype*)a, rs_a, cs_a, \
+	  (ftype*)b, rs_b, cs_b, \
+	  (ftype*)beta, \
+	  (ftype*)c, rs_c, cs_c, \
+	  NULL, \
+	  NULL  \
+	); \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#else
+
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+/* BLAS ?gemmt wrapper that builds obj_t operands and calls the object API. */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	uplo_t  blis_uploc; \
+	trans_t blis_transa; \
+	trans_t blis_transb; \
+	dim_t   m0, k0; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Perform BLAS parameter checking. */ \
+	PASTEBLACHK(blasname) \
+	( \
+	  MKSTR(ch), \
+	  MKSTR(blasname), \
+	  uploc, \
+	  transa, \
+	  transb, \
+	  m, \
+	  k, \
+	  lda, \
+	  ldb, \
+	  ldc  \
+	); \
+\
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
+	bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
+	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
+	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
+\
+	/* Typecast BLAS integers to BLIS integers. */ \
+	bli_convert_blas_dim1( *m, m0 ); \
+	bli_convert_blas_dim1( *k, k0 ); \
+\
+	/* Set the strides of the matrix operands: unit row stride, with the leading dimension as column stride. */ \
+	const inc_t rs_a = 1; \
+	const inc_t cs_a = *lda; \
+	const inc_t rs_b = 1; \
+	const inc_t cs_b = *ldb; \
+	const inc_t rs_c = 1; \
+	const inc_t cs_c = *ldc; \
+\
+	const num_t dt     = PASTEMAC(ch,type); \
+	/* NOTE(review): c is tagged BLIS_SYMMETRIC, presumably so its uplo is honored -- confirm. */ \
+	const struc_t strucc = BLIS_SYMMETRIC; \
+\
+	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
+	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
+	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
+	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
+\
+	dim_t       m0_a, n0_a; \
+	dim_t       m0_b, n0_b; \
+	/* Choose the dimensions of ao and bo from ( m0, k0 ) and the trans params. */ \
+	bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
+	bli_set_dims_with_trans( blis_transb, k0, m0, &m0_b, &n0_b ); \
+\
+	bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
+	bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
+\
+	bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
+	bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
+	bli_obj_init_finish( dt, m0,   m0,   (ftype*)c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_uplo( blis_uploc, &co ); \
+	bli_obj_set_conjtrans( blis_transa, &ao ); \
+	bli_obj_set_conjtrans( blis_transb, &bo ); \
+\
+	bli_obj_set_struc( strucc, &co ); \
+	/* Call BLIS interface. */ \
+	PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
+	( \
+	  &alphao, \
+	  &ao, \
+	  &bo, \
+	  &betao, \
+	  &co, \
+	  NULL, \
+	  NULL  \
+	); \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+
+#endif
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( gemmt, gemmt )
+#endif
+
--- a/frame/compat/bla_gemmt.h
+++ b/frame/compat/bla_gemmt.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype BLAS-to-BLIS interfaces.
+//
+#undef  GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
+/* Prototype of the BLAS-compatible ?gemmt routine defined in bla_gemmt.c. */ \
+BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     );
+
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTPROT_BLAS( gemmt )
+#endif
+
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -185,6 +186,7 @@
 #include "bla_syr2k.h"
 #include "bla_trmm.h"
 #include "bla_trsm.h"
+#include "bla_gemmt.h"

 #include "bla_gemm_check.h"
 #include "bla_hemm_check.h"
@@ -195,6 +197,7 @@
 #include "bla_syr2k_check.h"
 #include "bla_trmm_check.h"
 #include "bla_trsm_check.h"
+#include "bla_gemmt_check.h"

 // -- Fortran-compatible APIs to BLIS functions --

--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -448,6 +448,11 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                 enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                 float alpha, const float *A, f77_int lda,
                 float *B, f77_int ldb);
+void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, float alpha, const float *A, /* N: order of C; named M in cblas_sgemmt.c */
+                 f77_int lda, const float *B, f77_int ldb,
+                 float beta, float *C, f77_int ldc);

 void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -478,6 +483,11 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                 enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                 double alpha, const double *A, f77_int lda,
                 double *B, f77_int ldb);
+void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, double alpha, const double *A, /* N: order of C; named M in cblas_dgemmt.c */
+                 f77_int lda, const double *B, f77_int ldb,
+                 double beta, double *C, f77_int ldc);

 void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -508,6 +518,11 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                 enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                 const void *alpha, const void *A, f77_int lda,
                 void *B, f77_int ldb);
+void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A, /* N: order of C; named M in cblas_cgemmt.c */
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);

 void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                 enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -538,6 +553,11 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                 enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                 const void *alpha, const void *A, f77_int lda,
                 void *B, f77_int ldb);
+void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                 enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
+                 f77_int N, f77_int K, const void *alpha, const void *A, /* N: order of C; named M in cblas_zgemmt.c */
+                 f77_int lda, const void *B, f77_int ldb,
+                 const void *beta, void *C, f77_int ldc);


 /* 
--- a/frame/compat/cblas/src/cblas_cgemmt.c
+++ b/frame/compat/cblas/src/cblas_cgemmt.c
@@ -0,0 +1,166 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+   cblas_cgemmt.c
+   Based off of cblas_cgemm.c.
+*/
+
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  const void *alpha, const void  *A,
+                  f77_int lda, const void  *B, f77_int ldb,
+                  const void *beta, void  *C, f77_int ldc)
+{  /* Maps the CBLAS enum arguments to BLAS chars, then calls F77_cgemmt. */
+   char UL, TA, TB;   
+#ifdef F77_CHAR
+   F77_CHAR F77_UL, F77_TA, F77_TB;
+#else
+   #define F77_UL &UL  
+   #define F77_TA &TA  
+   #define F77_TB &TB  
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1; /* both flags are reset to 0 on every exit path */
+
+   if( Order == CblasColMajor )
+   {
+      /* Column-major: Uplo, TransA and TransB map directly to UL, TA and TB. */
+      if( Uplo == CblasUpper) UL='U';
+      else if ( Uplo == CblasLower ) UL='L';
+      else 
+      {
+         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)A,
+       &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {  /* Row-major: Uplo is flipped and A and B trade places (see the call below). */
+      RowMajorStrg = 1;
+      if( Uplo == CblasUpper) UL='L';
+      else if ( Uplo == CblasLower ) UL='U';
+      else 
+      {
+         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TB='T'; /* TransA sets TB: A is passed as the second operand */
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T'; /* TransB sets TA: B is passed as the first operand */
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B,
+                  &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc);
+   } 
+   else  cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_dgemmt.c
+++ b/frame/compat/cblas/src/cblas_dgemmt.c
@@ -0,0 +1,166 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+   cblas_dgemmt.c
+   Based off of cblas_dgemm.c.
+*/
+
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  double alpha, const double  *A,
+                  f77_int lda, const double  *B, f77_int ldb,
+                  double beta, double  *C, f77_int ldc)
+{  /* Maps the CBLAS enum arguments to BLAS chars, then calls F77_dgemmt. */
+   char UL, TA, TB;   
+#ifdef F77_CHAR
+   F77_CHAR F77_UL, F77_TA, F77_TB;
+#else
+   #define F77_UL &UL  
+   #define F77_TA &TA  
+   #define F77_TB &TB  
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1; /* both flags are reset to 0 on every exit path */
+
+   if( Order == CblasColMajor )
+   {
+      /* Column-major: Uplo, TransA and TransB map directly to UL, TA and TB. */
+      if( Uplo == CblasUpper) UL='U';
+      else if ( Uplo == CblasLower ) UL='L';
+      else 
+      {
+         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A,
+       &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {  /* Row-major: Uplo is flipped and A and B trade places (see the call below). */
+      RowMajorStrg = 1;
+      if( Uplo == CblasUpper) UL='L';
+      else if ( Uplo == CblasLower ) UL='U';
+      else 
+      {
+         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TB='T'; /* TransA sets TB: A is passed as the second operand */
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T'; /* TransB sets TA: B is passed as the first operand */
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
+                  &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
+   } 
+   else  cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_f77.h
+++ b/frame/compat/cblas/src/cblas_f77.h
@@ -1,12 +1,46 @@
 /*
- * cblas_f77.h
- * Written by Keita Teranishi
- *
- * Updated by Jeff Horner
- * Merged cblas_f77.h and cblas_fortran_header.h
- *
- * (Heavily hacked down from the original)
- */
+   cblas_f77.h
+   Written by Keita Teranishi
+
+   Updated by Jeff Horner
+   Merged cblas_f77.h and cblas_fortran_header.h
+
+   (Heavily hacked down from the original)
+*/
+
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/

 #ifndef CBLAS_F77_H
 #define CBLAS_F77_H
@@ -163,5 +197,12 @@
 #define F77_zsyr2k zsyr2k_
 #define F77_ztrmm  ztrmm_
 #define F77_ztrsm  ztrsm_
+/*
+* BLAS extensions: gemmt entry points called by cblas_?gemmt()
+*/
+#define F77_sgemmt sgemmt_
+#define F77_dgemmt dgemmt_
+#define F77_cgemmt cgemmt_
+#define F77_zgemmt zgemmt_

 #endif /*  CBLAS_F77_H */
--- a/frame/compat/cblas/src/cblas_sgemmt.c
+++ b/frame/compat/cblas/src/cblas_sgemmt.c
@@ -0,0 +1,166 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+   cblas_sgemmt.c
+   Based off of cblas_sgemm.c.
+*/
+
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
+                  enum CBLAS_TRANSPOSE TransA,
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
+                  float alpha, const float  *A,
+                  f77_int lda, const float  *B, f77_int ldb,
+                  float beta, float  *C, f77_int ldc)
+{  /* Maps the CBLAS enum arguments to BLAS chars, then calls F77_sgemmt. */
+   char UL, TA, TB;   
+#ifdef F77_CHAR
+   F77_CHAR F77_UL, F77_TA, F77_TB;
+#else
+   #define F77_UL &UL  
+   #define F77_TA &TA  
+   #define F77_TB &TB  
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1; /* both flags are reset to 0 on every exit path */
+
+   if( Order == CblasColMajor )
+   {
+      /* Column-major: Uplo, TransA and TransB map directly to UL, TA and TB. */
+      if( Uplo == CblasUpper) UL='U';
+      else if ( Uplo == CblasLower ) UL='L';
+      else 
+      {
+         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A,
+       &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {  /* Row-major: Uplo is flipped and A and B trade places (see the call below). */
+      RowMajorStrg = 1;
+      if( Uplo == CblasUpper) UL='L';
+      else if ( Uplo == CblasLower ) UL='U';
+      else 
+      {
+         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TB='T'; /* TransA sets TB: A is passed as the second operand */
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T'; /* TransB sets TA: B is passed as the first operand */
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
+                  &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
+   } 
+   else  cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
--- a/frame/compat/cblas/src/cblas_zgemmt.c
+++ b/frame/compat/cblas/src/cblas_zgemmt.c
@@ -0,0 +1,166 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+   cblas_zgemmt.c
+   Based off of cblas_zgemm.c.
+*/
+
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,       /* CBLAS front end for the dcomplex gemmt.    */
+                  enum CBLAS_TRANSPOSE TransA,                        /* Checks Order, Uplo, TransA and TransB,     */
+                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,  /* reports a bad value via cblas_xerbla, else */
+                  const void *alpha, const void  *A,                  /* maps them to BLAS option characters and    */
+                  f77_int lda, const void  *B, f77_int ldb,           /* calls F77_zgemmt. alpha, beta, A, B and C  */
+                  const void *beta, void  *C, f77_int ldc)            /* are cast to dcomplex* for that call.       */
+{
+   char UL, TA, TB;   /* option characters derived from Uplo, TransA and TransB */
+#ifdef F77_CHAR
+   F77_CHAR F77_UL, F77_TA, F77_TB;
+#else
+   #define F77_UL &UL  
+   #define F77_TA &TA  
+   #define F77_TB &TB  
+#endif
+   /* Integer arguments: copied into F77_INT variables, or aliased by macros otherwise. */
+#ifdef F77_INT
+   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
+   F77_INT F77_ldc=ldc;
+#else
+   #define F77_M M
+   #define F77_K K
+   #define F77_lda lda
+   #define F77_ldb ldb
+   #define F77_ldc ldc
+#endif
+   /* Both flags are declared elsewhere and are reset to 0 before every return below. */
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+   /* Column-major: options map one-to-one and A, B are passed in their own slots. */
+   if( Order == CblasColMajor )
+   {
+      /* The number given to cblas_xerbla matches the argument's 1-based position above. */
+      if( Uplo == CblasUpper) UL='U';
+      else if ( Uplo == CblasLower ) UL='L';
+      else 
+      {
+         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransA == CblasTrans) TA='T';
+      else if ( TransA == CblasConjTrans ) TA='C';
+      else if ( TransA == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      if(TransB == CblasTrans) TB='T';
+      else if ( TransB == CblasConjTrans ) TB='C';
+      else if ( TransB == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+
+      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)A,
+       &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
+   } else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;
+      if( Uplo == CblasUpper) UL='L';   /* flipped relative to the column-major branch */
+      else if ( Uplo == CblasLower ) UL='U';
+      else 
+      {
+         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      /* TransA fills TB and TransB fills TA, matching the B/A operand swap in the call. */
+      if(TransA == CblasTrans) TB='T';
+      else if ( TransA == CblasConjTrans ) TB='C';
+      else if ( TransA == CblasNoTrans )   TB='N';
+      else 
+      {
+         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      if(TransB == CblasTrans) TA='T';
+      else if ( TransB == CblasConjTrans ) TA='C';
+      else if ( TransB == CblasNoTrans )   TA='N';
+      else 
+      {
+         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
+         CBLAS_CallFromC = 0;
+         RowMajorStrg = 0;
+         return;
+      }
+      #ifdef F77_CHAR
+         F77_UL = C2F_CHAR(&UL);
+         F77_TA = C2F_CHAR(&TA);
+         F77_TB = C2F_CHAR(&TB);
+      #endif
+      /* Operands swapped: B, ldb first, then A, lda (presumably forming C^T -- confirm). */
+      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
+                  &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
+   } 
+   else  cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+   return;
+}
+#endif
--- a/frame/compat/check/bla_gemmt_check.h
+++ b/frame/compat/check/bla_gemmt_check.h
@@ -0,0 +1,92 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef BLIS_ENABLE_BLAS
+
+#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
+{ /* Checks the gemmt arguments; on the first bad one, calls xerbla and returns. */ \
+	f77_int info = 0; \
+	f77_int nota,  notb; \
+	f77_int conja, conjb; \
+	f77_int ta,    tb; \
+	f77_int lower, upper; \
+	f77_int nrowa, nrowb; \
+	/* Compare each option character against "N", "C", "T", "L" and "U" using lsame. */ \
+	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+\
+	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	/* Row counts used for the lda/ldb bounds: *m or *k, chosen by the "N" flags. */ \
+	if ( nota ) { nrowa = *m; } \
+	else        { nrowa = *k; } \
+	if ( notb ) { nrowb = *k; } \
+	else        { nrowb = *m; } \
+	/* Only the first failing check sets info (presumably the BLAS argument position; confirm). */ \
+	if	( !lower && !upper ) \
+		info = 1; \
+	else if ( !nota && !conja && !ta ) \
+		info = 2; \
+	else if ( !notb && !conjb && !tb ) \
+		info = 3; \
+	else if ( *m < 0 ) \
+		info = 4; \
+	else if ( *k < 0 ) \
+		info = 5; \
+	else if ( *lda < bli_max( 1, nrowa ) ) \
+		info = 8; \
+	else if ( *ldb < bli_max( 1, nrowb ) ) \
+		info = 10; \
+	else if ( *ldc < bli_max( 1, *m    ) ) \
+		info = 13; \
+\
+	if ( info != 0 ) \
+	{ \
+		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
+		/* Name is dt_str followed by op_str left-justified in a 5-character field. */ \
+		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
+\
+		bli_string_mkupper( func_str ); \
+		/* xerbla receives func_str, a pointer to info, and a fixed name length of 6. */ \
+		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		/* Returns from the function in which this macro is expanded. */ \
+		return; \
+	} \
+}
+
+#endif
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -6,7 +6,7 @@

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -250,9 +250,9 @@ CNTX_INIT_PROTS( generic )

 // -- AMD64 architectures --

-//#ifdef BLIS_KERNELS_ZEN2
-//#include "bli_kernels_zen2.h"
-//#endif
+#ifdef BLIS_KERNELS_ZEN2
+#include "bli_kernels_zen2.h"
+#endif
 #ifdef BLIS_KERNELS_ZEN
 #include "bli_kernels_zen.h"
 #endif
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -6,7 +6,7 @@

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -918,6 +918,7 @@ typedef enum
 // bli_l3_ind.c to index into arrays.
 //
 	BLIS_GEMM = 0,
+	BLIS_GEMMT,
 	BLIS_HEMM,
 	BLIS_HERK,
 	BLIS_HER2K,
@@ -931,7 +932,7 @@ typedef enum
 	BLIS_NOID
 } opid_t;

-#define BLIS_NUM_LEVEL3_OPS 10
+#define BLIS_NUM_LEVEL3_OPS 11


 // -- Blocksize ID type --
--- a/frame/ind/bli_l3_ind.c
+++ b/frame/ind/bli_l3_ind.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -37,48 +37,49 @@

 static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
 {
-        /*   gemm   hemm   herk   her2k  symm   syrk,  syr2k  trmm3  trmm   trsm  */
-/* 3mh  */ { bli_gemm3mh,  bli_hemm3mh,  bli_herk3mh,  bli_her2k3mh, bli_symm3mh,
+        /*   gemm  gemmt  hemm  herk  her2k  symm  syrk  syr2k  trmm3  trmm  trsm  */
+/* 3mh  */ { bli_gemm3mh,  NULL,         bli_hemm3mh,  bli_herk3mh,  bli_her2k3mh, bli_symm3mh,
             bli_syrk3mh,  bli_syr2k3mh, bli_trmm33mh, NULL,         NULL         },
-/* 3m1  */ { bli_gemm3m1,  bli_hemm3m1,  bli_herk3m1,  bli_her2k3m1, bli_symm3m1,
+/* 3m1  */ { bli_gemm3m1,  NULL,         bli_hemm3m1,  bli_herk3m1,  bli_her2k3m1, bli_symm3m1,
             bli_syrk3m1,  bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1,  bli_trsm3m1  },
-/* 4mh  */ { bli_gemm4mh,  bli_hemm4mh,  bli_herk4mh,  bli_her2k4mh, bli_symm4mh,
+/* 4mh  */ { bli_gemm4mh,  NULL,         bli_hemm4mh,  bli_herk4mh,  bli_her2k4mh, bli_symm4mh,
             bli_syrk4mh,  bli_syr2k4mh, bli_trmm34mh, NULL,         NULL         },
-/* 4mb  */ { bli_gemm4mb,  NULL,         NULL,         NULL,         NULL,         
+/* 4mb  */ { bli_gemm4mb,  NULL,         NULL,         NULL,         NULL,         NULL,
             NULL,         NULL,         NULL,         NULL,         NULL         },
-/* 4m1  */ { bli_gemm4m1,  bli_hemm4m1,  bli_herk4m1,  bli_her2k4m1, bli_symm4m1,
+/* 4m1  */ { bli_gemm4m1,  NULL,         bli_hemm4m1,  bli_herk4m1,  bli_her2k4m1, bli_symm4m1,
             bli_syrk4m1,  bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1,  bli_trsm4m1  },
-/* 1m   */ { bli_gemm1m,   bli_hemm1m,   bli_herk1m,   bli_her2k1m,  bli_symm1m,
+/* 1m   */ { bli_gemm1m,   NULL,         bli_hemm1m,   bli_herk1m,   bli_her2k1m,  bli_symm1m,
             bli_syrk1m,   bli_syr2k1m,  bli_trmm31m,  bli_trmm1m,   bli_trsm1m   },
-/* nat  */ { bli_gemmnat,  bli_hemmnat,  bli_herknat,  bli_her2knat, bli_symmnat,
+/* nat  */ { bli_gemmnat,  bli_gemmtnat, bli_hemmnat,  bli_herknat,  bli_her2knat, bli_symmnat,
             bli_syrknat,  bli_syr2knat, bli_trmm3nat, bli_trmmnat,  bli_trsmnat  },
 };

 //
 // NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
 //
-// BLIS provides APIs to modify this state during runtime. So, one application thread
-// can modify the state, before another starts the corresponding BLIS operation.
-// This is solved by making the induced method status array local to threads.
+// BLIS provides APIs to modify this state during runtime. So, it's possible for one
+// application thread to modify the state before another starts the corresponding
+// BLIS operation. This is solved by making the induced method status array local to
+// threads.

 static BLIS_THREAD_LOCAL
 bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
 {
-        /*   gemm   hemm   herk   her2k  symm   syrk,  syr2k  trmm3  trmm   trsm  */
+        /*   gemm  gemmt  hemm  herk  her2k  symm  syrk  syr2k  trmm3  trmm  trsm  */
        /*    c     z    */
-/* 3mh  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 3mh  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* 3m1  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 3m1  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* 4mh  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 4mh  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* 4mb  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 4mb  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* 4m1  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 4m1  */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* 1m   */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
+/* 1m   */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
             {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
-/* nat  */ { {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},
+/* nat  */ { {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},
             {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE},   {TRUE,TRUE}    },
 };

@@ -99,6 +100,7 @@ bool PASTEMAC(opname,ind_has_avail)( num_t dt )
 */

 GENFUNC( gemm, BLIS_GEMM )
+GENFUNC( gemmt, BLIS_GEMMT )
 GENFUNC( hemm, BLIS_HEMM )
 GENFUNC( herk, BLIS_HERK )
 GENFUNC( her2k, BLIS_HER2K )
--- a/frame/ind/bli_l3_ind.h
+++ b/frame/ind/bli_l3_ind.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -44,6 +45,7 @@ void_fp PASTEMAC(opname,ind_get_avail)( num_t dt );
 /*bool PASTEMAC(opname,ind_has_avail)( num_t dt ); */

 GENPROT( gemm )
+GENPROT( gemmt )
 GENPROT( hemm )
 GENPROT( herk )
 GENPROT( her2k )
--- a/frame/ind/oapi/bli_l3_ind_oapi.c
+++ b/frame/ind/oapi/bli_l3_ind_oapi.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -67,6 +67,7 @@ void PASTEMAC(opname,imeth) \
 }

 GENFRONT( gemm, ind )
+GENFRONT( gemmt, ind )
 GENFRONT( her2k, ind )
 GENFRONT( syr2k, ind )

--- a/frame/ind/oapi/bli_l3_ind_oapi.h
+++ b/frame/ind/oapi/bli_l3_ind_oapi.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -41,6 +42,7 @@
 #define GENPROT( imeth ) \
 \
 BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) (              obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
+BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth)(              obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
 BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
 BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) (              obj_t* alpha, obj_t* a,           obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
 BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)(              obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
--- a/frame/ind/oapi/bli_l3_nat_oapi.c
+++ b/frame/ind/oapi/bli_l3_nat_oapi.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
 // of executing one iteration of a for loop, plus the overhead of calling a
 // function that does nothing (ie: the _cntx_init_stage() function).

-// -- gemm/her2k/syr2k ---------------------------------------------------------
+// -- gemm/her2k/syr2k/gemmt ---------------------------------------------------

 #undef  GENFRONT
 #define GENFRONT( opname, cname, imeth ) \
@@ -80,6 +80,7 @@ void PASTEMAC(opname,imeth) \
 #ifndef BLIS_ENABLE_SANDBOX
 GENFRONT( gemm, gemm, nat )
 #endif
+GENFRONT( gemmt, gemm, nat )
 GENFRONT( her2k, gemm, nat )
 GENFRONT( syr2k, gemm, nat )

--- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
+++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c
@@ -80,46 +80,56 @@ void bli_sgemm_armv7a_int_4x4
 	// Vector for column 3
 	float32x4_t cv3;

-	if( rs_c == 1 )
+	if ( *beta != 0.0F )
 	{
-		// Load column 0
- 		cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c ); 
-	
-		// Load column 1
- 		cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c ); 
-	
-		// Load column 2
- 		cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c ); 
-	
-		// Load column 3
- 		cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c ); 
-	}	
+		if ( rs_c == 1 )
+		{
+			// Load column 0
+			cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
+
+			// Load column 1
+			cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
+
+			// Load column 2
+			cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
+
+			// Load column 3
+			cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
+		}
+		else
+		{
+			// Load column 0
+			cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
+			cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
+			cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
+			cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
+
+			// Load column 1
+			cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
+			cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
+			cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
+			cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
+
+			// Load column 2
+			cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
+			cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
+			cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
+			cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
+
+			// Load column 3
+			cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
+			cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
+			cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
+			cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
+
+		}
+	}
 	else
 	{
-		// Load column 0
-		cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
-		cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
-		cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
-		cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
-	
-		// Load column 1
-		cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
-		cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
-		cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
-		cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
-	
-		// Load column 2
-		cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
-		cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
-		cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
-		cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
-	
-		// Load column 3
-		cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
-		cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
-		cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
-		cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
-
+		cv0 = vmovq_n_f32( 0.0 );
+		cv1 = vmovq_n_f32( 0.0 );
+		cv2 = vmovq_n_f32( 0.0 );
+		cv3 = vmovq_n_f32( 0.0 );
 	}

 	// Vector for accummulating column 0
@@ -142,15 +152,15 @@ void bli_sgemm_armv7a_int_4x4
 	// Initialize vector to 0.0
 	abv3 = vmovq_n_f32( 0.0 );

-	for ( i = 0; i < k_iter; ++i ) 
-	{ 
+	for ( i = 0; i < k_iter; ++i )
+	{
 		// Begin iter 0
- 		av1 = vld1q_f32( a ); 
+			av1 = vld1q_f32( a );

 		__builtin_prefetch( a + 224 );
 		__builtin_prefetch( b + 224 );
-	
- 		bv1 = vld1q_f32( b ); 
+
+		bv1 = vld1q_f32( b );

 		abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
 		abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
@@ -158,24 +168,24 @@ void bli_sgemm_armv7a_int_4x4
 		abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );


-		av2 = vld1q_f32( a+4 ); 
+		av2 = vld1q_f32( a+4 );

 		//__builtin_prefetch( a + 116 );
 		//__builtin_prefetch( b + 116 );
-	
- 		bv2 = vld1q_f32( b+4 ); 
+
+		bv2 = vld1q_f32( b+4 );

 		abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 );
 		abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 );
 		abv2 = vmlaq_lane_f32( abv2, av2, vget_high_f32(bv2), 0 );
 		abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 );

-		av3 = vld1q_f32( a+8 ); 
+		av3 = vld1q_f32( a+8 );

 		//__builtin_prefetch( a + 120 );
 		//__builtin_prefetch( b + 120 );
-	
- 		bv3 = vld1q_f32( b+8 ); 
+
+		bv3 = vld1q_f32( b+8 );

 		abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 );
 		abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 );
@@ -183,12 +193,12 @@ void bli_sgemm_armv7a_int_4x4
 		abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 );


-		av4 = vld1q_f32( a+12); 
+		av4 = vld1q_f32( a+12);

 		//__builtin_prefetch( a + 124 );
 		//__builtin_prefetch( b + 124 );
-	
- 		bv4 = vld1q_f32( b+12); 
+
+		bv4 = vld1q_f32( b+12);

 		abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 );
 		abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 );
@@ -197,71 +207,85 @@ void bli_sgemm_armv7a_int_4x4



-		a += 16; 
-		b += 16; 
-	} 
+		a += 16;
+		b += 16;
+	}

-	for ( i = 0; i < k_left; ++i ) 
-	{ 
- 		av1 = vld1q_f32( a ); 
+	for ( i = 0; i < k_left; ++i )
+	{
+		av1 = vld1q_f32( a );

 		__builtin_prefetch( a + 112 );
 		__builtin_prefetch( b + 112 );
-	
- 		bv1 = vld1q_f32( b ); 
+
+		bv1 = vld1q_f32( b );

 		abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
 		abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
 		abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 );
 		abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );

-		a += 4; 
-		b += 4; 
+		a += 4;
+		b += 4;
 	}

 	__builtin_prefetch( a_next );
 	__builtin_prefetch( b_next );

-	cv0 = vmulq_n_f32( cv0, *beta );
-	cv1 = vmulq_n_f32( cv1, *beta );
-	cv2 = vmulq_n_f32( cv2, *beta );
-	cv3 = vmulq_n_f32( cv3, *beta );
+	if ( *beta != 0.0F )
+	{
+		// Multiply C by beta and then accumulate alpha * A * B.
+		cv0 = vmulq_n_f32( cv0, *beta );
+		cv1 = vmulq_n_f32( cv1, *beta );
+		cv2 = vmulq_n_f32( cv2, *beta );
+		cv3 = vmulq_n_f32( cv3, *beta );

-	cv0 = vmlaq_f32( cv0, abv0, alphav );
-	cv1 = vmlaq_f32( cv1, abv1, alphav );
-	cv2 = vmlaq_f32( cv2, abv2, alphav );
-	cv3 = vmlaq_f32( cv3, abv3, alphav );
+		cv0 = vmlaq_f32( cv0, abv0, alphav );
+		cv1 = vmlaq_f32( cv1, abv1, alphav );
+		cv2 = vmlaq_f32( cv2, abv2, alphav );
+		cv3 = vmlaq_f32( cv3, abv3, alphav );
+	}
+	else
+	{
+		// Since beta = 0, skip straight to accumulating alpha * A * B.
+		// Note: C (cv?) was initialized to zero above.
+		cv0 = vmlaq_f32( cv0, abv0, alphav );
+		cv1 = vmlaq_f32( cv1, abv1, alphav );
+		cv2 = vmlaq_f32( cv2, abv2, alphav );
+		cv3 = vmlaq_f32( cv3, abv3, alphav );
+	}

-	if( rs_c == 1 )
+	if ( rs_c == 1 )
 	{
 		// Store column 0
-  		vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 ); 
+		vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
 		// Store column 1
-  		vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 ); 
+		vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
 		// Store column 2
-  		vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 ); 
+		vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
 		// Store column 3
-  		vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 ); 
+		vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
 	}
-	else{
+	else
+	{
 		// Store column 0
 		vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
 		vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
 		vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
 		vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
-	
+
 		// Store column 1
 		vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
 		vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
 		vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
 		vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
-	
+
 		// Store column 2
 		vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
 		vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
 		vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
 		vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
-	
+
 		// Store column 3
 		vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
 		vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -0,0 +1,330 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+void bli_scopyv_zen_int // Copies n floats from x into y.
+     (
+       conj_t           conjx, // not referenced by this kernel
+       dim_t            n, // number of elements to copy
+       float*  restrict x, inc_t incx, // source vector and its element stride
+       float*  restrict y, inc_t incy, // destination vector and its element stride
+       cntx_t* restrict cntx // not referenced by this kernel
+     )
+{
+	const dim_t num_elem_per_reg = 8; // floats held by one 256-bit register
+	dim_t       i = 0;
+	__m256      xv[16]; // staging registers; the widest loop uses all 16
+
+	// If the vector dimension is zero, return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 && incy == 1 )
+	{
+#if 0
+	PRAGMA_SIMD
+	for (i = 0; i < n; i++)
+	{
+		y[i] = x[i];
+	}
+#endif
+#if 0
+	memcpy(y, x, n << 2);
+#endif
+#if 1
+
+		// Unit-stride path. Each loop bound clears the low bits of n, rounding it
+		// down to a multiple of that loop's width. For example, with n = 255:
+		//   n & ~0x7F = 128: the first loop copies elements [0, 128)
+		//   n & ~0x3F = 192: the second loop copies elements [128, 192)
+		//   n & ~0x1F = 224: the third loop copies elements [192, 224), and so on.
+		for ( i = 0; i < (n & (~0x7F)); i += 128 ) // 16 registers = 128 floats
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
+
+			y += 128;
+			x += 128;
+		}
+		for ( ; i < (n & (~0x3F)); i += 64 ) // 8 registers = 64 floats
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+
+			y += 64;
+			x += 64;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 ) // 4 registers = 32 floats
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+
+			y += 32;
+			x += 32;
+		}
+		for ( ; i < (n & (~0x0F)); i += 16 ) // 2 registers = 16 floats
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+
+			y += 16;
+			x += 16;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 ) // 1 register = 8 floats
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			y += 8;
+			x += 8;
+		}
+		for ( ; i < n; ++i ) // scalar tail: fewer than 8 elements remain
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else
+	{
+		for ( dim_t i = 0; i < n; ++i ) // strided path; this i shadows the outer i
+		{
+			*y = *x;
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dcopyv_zen_int // Copies n doubles from x into y.
+     (
+       conj_t           conjx, // not referenced by this kernel
+       dim_t            n, // number of elements to copy
+       double* restrict x, inc_t incx, // source vector and its element stride
+       double* restrict y, inc_t incy, // destination vector and its element stride
+       cntx_t* restrict cntx // not referenced by this kernel
+     )
+{
+	const dim_t num_elem_per_reg = 4; // doubles held by one 256-bit register
+	dim_t       i = 0;
+	__m256d     xv[16]; // staging registers; the widest loop uses all 16
+
+	// If the vector dimension is zero, return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 && incy == 1 )
+	{
+#if 0
+	PRAGMA_SIMD
+	for (i = 0; i < n; ++i)
+	{
+		y[i] = x[i];
+	}
+#endif
+#if 0
+	memcpy(y, x, n << 3);
+#endif
+#if 1
+		// Unit-stride path. n & (~0x3F) clears the low six bits of n, rounding it
+		// down to a multiple of 64; this first loop copies 64 doubles per iteration.
+		for ( i = 0; i < (n & (~0x3F)); i += 64 ) // 16 registers = 64 doubles
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
+			y += num_elem_per_reg * 16;
+			x += num_elem_per_reg * 16;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 ) // 8 registers = 32 doubles
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+
+			y += num_elem_per_reg * 8;
+			x += num_elem_per_reg * 8;
+		}
+		for ( ; i < (n & (~0xF)); i += 16 ) // 4 registers = 16 doubles
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+
+			y += num_elem_per_reg * 4;
+			x += num_elem_per_reg * 4;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 ) // 2 registers = 8 doubles
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+
+			y += num_elem_per_reg * 2;
+			x += num_elem_per_reg * 2;
+		}
+		for ( ; i < (n & (~0x03)); i += 4 ) // 1 register = 4 doubles
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			y += num_elem_per_reg;
+			x += num_elem_per_reg;
+		}
+		for ( ; i < n; ++i ) // scalar tail: fewer than 4 elements remain
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i ) // strided path: one element per iteration
+		{
+			*y = *x;
+
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -73,11 +73,11 @@ void bli_sdotv_zen_int10
 	float*  restrict x0;
 	float*  restrict y0;

-	float            rho0;
+	float            rho0 = 0.0;

 	__m256           xv[10];
 	__m256           yv[10];
-	v8sf_t           rhov[2];
+	v8sf_t           rhov[10];

 	// If the vector dimension is zero, or if alpha is zero, return early.
 	if ( bli_zero_dim1( n ) )
@@ -96,8 +96,16 @@ void bli_sdotv_zen_int10
 	{
 		rhov[0].v = _mm256_setzero_ps();
 		rhov[1].v = _mm256_setzero_ps();
+		rhov[2].v = _mm256_setzero_ps();
+		rhov[3].v = _mm256_setzero_ps();
+		rhov[4].v = _mm256_setzero_ps();
+		rhov[5].v = _mm256_setzero_ps();
+		rhov[6].v = _mm256_setzero_ps();
+		rhov[7].v = _mm256_setzero_ps();
+		rhov[8].v = _mm256_setzero_ps();
+		rhov[9].v = _mm256_setzero_ps();

-		for ( i = 0; (i + 79) < n; i += 80 )
+		for ( i = 0 ; (i + 79) < n; i += 80 )
 		{
 			// 80 elements will be processed per loop; 10 FMAs will run per loop.
 			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -124,19 +132,25 @@ void bli_sdotv_zen_int10

 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[1].v );
+			rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
+			rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
+			rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
+			rhov[5].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[5].v );
+			rhov[6].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[6].v );
+			rhov[7].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[7].v );
+			rhov[8].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[8].v );
+			rhov[9].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[9].v );

 			x0 += 10*n_elem_per_reg;
 			y0 += 10*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[5].v;
+		rhov[1].v += rhov[6].v;
+		rhov[2].v += rhov[7].v;
+		rhov[3].v += rhov[8].v;
+		rhov[4].v += rhov[9].v;
+
 		for ( ; (i + 39) < n; i += 40 )
 		{
 			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -153,34 +167,17 @@ void bli_sdotv_zen_int10

 			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
+			rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
+			rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
+			rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );

 			x0 += 5*n_elem_per_reg;
 			y0 += 5*n_elem_per_reg;
 		}

-		for ( ; (i + 31) < n; i += 32 )
-		{
-			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-			xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
-			xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
-			xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
-
-			yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-			yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
-			yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
-			yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
-
-			rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
-
-			x0 += 4*n_elem_per_reg;
-			y0 += 4*n_elem_per_reg;
-		}
+		rhov[0].v += rhov[2].v;
+		rhov[1].v += rhov[3].v;
+		rhov[0].v += rhov[4].v;

 		for ( ; (i + 15) < n; i += 16 )
 		{
@@ -197,6 +194,8 @@ void bli_sdotv_zen_int10
 			y0 += 2*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[1].v;
+
 		for ( ; (i + 7) < n; i += 8 )
 		{
 			xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -211,19 +210,15 @@ void bli_sdotv_zen_int10

 		for ( ; (i + 0) < n; i += 1 )
 		{
-			rhov[0].f[0] += x0[i] * y0[i];
+			rho0 += (*x0) * (*y0);
+			x0 += 1;
+			y0 += 1;
 		}

-		v8sf_t onev;
-
-		onev.v = _mm256_set1_ps( 1.0f );
-
-		rhov[0].v = _mm256_dp_ps( rhov[0].v, onev.v, 0xf1 );
-        rhov[1].v = _mm256_dp_ps( rhov[1].v, onev.v, 0xf1 );
-
-		// Manually add the results from above to finish the sum.
-		rho0   += rhov[0].f[0] + rhov[0].f[4];
-		rho0   += rhov[1].f[0] + rhov[1].f[4];
+		rho0 += rhov[0].f[0] + rhov[0].f[1] +
+		        rhov[0].f[2] + rhov[0].f[3] +
+		        rhov[0].f[4] + rhov[0].f[5] +
+		        rhov[0].f[6] + rhov[0].f[7];

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
@@ -269,11 +264,11 @@ void bli_ddotv_zen_int10
 	double* restrict x0;
 	double* restrict y0;

-	double           rho0;
+	double           rho0 = 0.0;

 	__m256d          xv[10];
 	__m256d          yv[10];
-	v4df_t           rhov[2];
+	v4df_t           rhov[10];

 	// If the vector dimension is zero, or if alpha is zero, return early.
 	if ( bli_zero_dim1( n ) )
@@ -292,6 +287,14 @@ void bli_ddotv_zen_int10
 	{
 		rhov[0].v = _mm256_setzero_pd();
 		rhov[1].v = _mm256_setzero_pd();
+		rhov[2].v = _mm256_setzero_pd();
+		rhov[3].v = _mm256_setzero_pd();
+		rhov[4].v = _mm256_setzero_pd();
+		rhov[5].v = _mm256_setzero_pd();
+		rhov[6].v = _mm256_setzero_pd();
+		rhov[7].v = _mm256_setzero_pd();
+		rhov[8].v = _mm256_setzero_pd();
+		rhov[9].v = _mm256_setzero_pd();

 		for ( i = 0; (i + 39) < n; i += 40 )
 		{
@@ -320,19 +323,25 @@ void bli_ddotv_zen_int10

 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[1].v );
+			rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
+			rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
+			rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
+			rhov[5].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[5].v );
+			rhov[6].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[6].v );
+			rhov[7].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[7].v );
+			rhov[8].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[8].v );
+			rhov[9].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[9].v );

 			x0 += 10*n_elem_per_reg;
 			y0 += 10*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[5].v;
+		rhov[1].v += rhov[6].v;
+		rhov[2].v += rhov[7].v;
+		rhov[3].v += rhov[8].v;
+		rhov[4].v += rhov[9].v;
+
 		for ( ; (i + 19) < n; i += 20 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -349,14 +358,16 @@ void bli_ddotv_zen_int10

 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
+			rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
+			rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
+			rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );

 			x0 += 5*n_elem_per_reg;
 			y0 += 5*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[4].v;
+
 		for ( ; (i + 15) < n; i += 16 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -371,13 +382,16 @@ void bli_ddotv_zen_int10

 			rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
 			rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
-			rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
-			rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
+			rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
+			rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );

 			x0 += 4*n_elem_per_reg;
 			y0 += 4*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[2].v;
+		rhov[1].v += rhov[3].v;
+
 		for ( ; (i + 7) < n; i += 8 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -393,6 +407,8 @@ void bli_ddotv_zen_int10
 			y0 += 2*n_elem_per_reg;
 		}

+		rhov[0].v += rhov[1].v;
+
 		for ( ; (i + 3) < n; i += 4 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -407,12 +423,14 @@ void bli_ddotv_zen_int10

 		for ( ; (i + 0) < n; i += 1 )
 		{
-			rhov[0].d[0] += x0[i] * y0[i];
+			rho0 += (*x0) * (*y0);
+
+			x0 += 1;
+			y0 += 1;
 		}

 		// Manually add the results from above to finish the sum.
-		rho0   += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
-		rho0   += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3];
+		rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];

 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -80,9 +80,18 @@ void bli_sscalv_zen_int10
 	// If alpha is zero, use setv.
 	if ( PASTEMAC(s,eq0)( *alpha ) )
 	{
-		float*       zero = bli_s0;
-		ssetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
-
+		float* zero = bli_s0;
+#ifdef BLIS_CONFIG_ZEN2
+		bli_ssetv_zen_int
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+#else
+		ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 		f
 		(
 		  BLIS_NO_CONJUGATE,
@@ -91,6 +100,7 @@ void bli_sscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
+#endif
 		return;
 	}

@@ -270,8 +280,18 @@ void bli_dscalv_zen_int10
 	// If alpha is zero, use setv.
 	if ( PASTEMAC(d,eq0)( *alpha ) )
 	{
-		double*      zero = bli_d0;
-		dsetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
+		double* zero = bli_d0;
+#ifdef BLIS_CONFIG_ZEN2
+		bli_dsetv_zen_int
+		(
+		  BLIS_NO_CONJUGATE,
+		  n,
+		  zero,
+		  x, incx,
+		  cntx
+		);
+#else
+		dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );

 		f
 		(
@@ -281,6 +301,7 @@ void bli_dscalv_zen_int10
 		  x, incx,
 		  cntx
 		);
+#endif
 		return;
 	}

--- a/kernels/zen/1/bli_setv_zen_int.c
+++ b/kernels/zen/1/bli_setv_zen_int.c
@@ -0,0 +1,228 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+void bli_ssetv_zen_int // Sets each of the n floats in x to *alpha.
+     (
+       conj_t           conjalpha, // not referenced by this kernel
+       dim_t            n, // number of elements to set
+       float*  restrict alpha, // pointer to the value to store
+       float*  restrict x, inc_t incx, // destination vector and its element stride
+       cntx_t* restrict cntx // not referenced by this kernel
+     )
+{
+	const dim_t num_elem_per_reg = 8; // floats held by one 256-bit register
+	dim_t       i = 0;
+	__m256      alphav;
+
+	// If the vector dimension is zero, return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 )
+	{
+		alphav = _mm256_broadcast_ss( alpha ); // replicate *alpha into all 8 lanes
+
+		// Each loop bound clears the low bits of n, rounding it down to a
+		// multiple of that loop's width. For example, with n = 255:
+		//   n & ~0x7F = 128: the first loop sets elements [0, 128)
+		//   n & ~0x3F = 192: the second loop sets elements [128, 192)
+		//   n & ~0x1F = 224: the third loop sets elements [192, 224), and so on.
+		for ( i = 0; i < (n & (~0x7F)); i += 128 ) // 16 stores = 128 floats
+		{
+			_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 4, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 5, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 6, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 7, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 8, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 9, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 10, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 11, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 12, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 13, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 14, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 15, alphav);
+
+			x += 128;
+		}
+		for ( ; i < (n & (~0x3F)); i += 64 ) // 8 stores = 64 floats
+		{
+			_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 4, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 5, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 6, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 7, alphav);
+
+			x += 64;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 ) // 4 stores = 32 floats
+		{
+			_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
+
+			x += 32;
+		}
+		for ( ; i < (n & (~0x0F)); i += 16 ) // 2 stores = 16 floats
+		{
+			_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
+
+			x += 16;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 ) // 1 store = 8 floats
+		{
+			_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
+			x += 8;
+		}
+		for ( ; i < n; ++i ) // scalar tail: fewer than 8 elements remain
+		{
+			*x++ = *alpha;
+		}
+	}
+	else
+	{
+		for ( dim_t i = 0; i < n; ++i ) // strided path; this i shadows the outer i
+		{
+			*x = *alpha;
+			x += incx;
+		}
+	}
+}
+
+void  bli_dsetv_zen_int // Sets each of the n doubles in x to *alpha.
+     (
+       conj_t           conjalpha, // not referenced by this kernel
+       dim_t            n, // number of elements to set
+       double* restrict alpha, // pointer to the value to store
+       double* restrict x, inc_t incx, // destination vector and its element stride
+       cntx_t* restrict cntx // not referenced by this kernel
+     )
+{
+	const dim_t num_elem_per_reg = 4; // doubles held by one 256-bit register
+	dim_t       i = 0;
+	__m256d     alphav;
+
+	// If the vector dimension is zero, return early.
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( incx == 1 )
+	{
+		// Broadcast the alpha scalar to all elements of a vector register.
+		alphav = _mm256_broadcast_sd( alpha );
+
+		// n & (~0x3F) clears the low six bits of n, rounding it down to a
+		// multiple of 64; this first loop sets 64 doubles per iteration.
+		for ( i = 0; i < (n & (~0x3F)); i += 64 ) // 16 stores = 64 doubles
+		{
+			_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 4, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 5, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 6, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 7, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 8, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 9, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 10, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 11, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 12, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 13, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 14, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 15, alphav);
+
+			x += num_elem_per_reg * 16;
+		}
+		for ( ; i < (n & (~0x1F)); i += 32 ) // 8 stores = 32 doubles
+		{
+			_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 4, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 5, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 6, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 7, alphav);
+
+			x += num_elem_per_reg * 8;
+		}
+		for ( ; i < (n & (~0xF)); i += 16 ) // 4 stores = 16 doubles
+		{
+			_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
+
+			x += num_elem_per_reg * 4;
+		}
+		for ( ; i < (n & (~0x07)); i += 8 ) // 2 stores = 8 doubles
+		{
+			_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
+			_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
+
+			x += num_elem_per_reg * 2;
+		}
+		for ( ; i < (n & (~0x03)); i += 4 ) // 1 store = 4 doubles
+		{
+			_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
+			x += num_elem_per_reg;
+		}
+		for ( ; i < n; ++i ) // scalar tail: fewer than 4 elements remain
+		{
+			*x++ = *alpha;
+		}
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i ) // strided path: one element per iteration
+		{
+			*x = *alpha;
+
+			x += incx;
+		}
+	}
+}
+
--- a/kernels/zen/1/bli_swapv_zen_int8.c
+++ b/kernels/zen/1/bli_swapv_zen_int8.c
@@ -0,0 +1,344 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+
+/* Union data structure to access AVX registers.
+   One 256-bit AVX register holds 8 SP elements, so the same storage can
+   be viewed either as a whole vector (v) or as its eight individual
+   floats (f). The array member is declared with 64-byte alignment.
+   NOTE(review): neither swapv kernel in this file references this type. */
+typedef union
+{
+	__m256  v;
+	float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers.
+   One 256-bit AVX register holds 4 DP elements, so the same storage can
+   be viewed either as a whole vector (v) or as its four individual
+   doubles (d). The array member is declared with 64-byte alignment.
+   NOTE(review): neither swapv kernel in this file references this type. */
+typedef union
+{
+	__m256d v;
+	double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sswapv_zen_int8
+     (
+       dim_t            n,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+	// Exchanges the n single-precision elements of x with those of y.
+	// The cntx argument is not referenced by this kernel.
+
+	// Number of floats covered by one 256-bit load or store.
+	const dim_t     vlen = 8;
+	dim_t           k;
+
+	float* restrict xp = x;
+	float* restrict yp = y;
+
+	__m256          xr[8];
+	__m256          yr[8];
+
+	// An empty vector needs no work.
+	if ( bli_zero_dim1( n ) ) return;
+
+	// Non-unit stride: swap element by element, then we are done.
+	if ( incx != 1 || incy != 1 )
+	{
+		for ( k = 0; k < n; ++k )
+		{
+			PASTEMAC(s,swaps)( (*xp), (*yp) );
+
+			xp += incx;
+			yp += incy;
+		}
+
+		return;
+	}
+
+	// Unit stride from here on. Every loop below reads a chunk of both
+	// vectors into registers before writing either of them back, and the
+	// chunk size halves from loop to loop: 64, 32, 16, then 8 elements.
+	for ( k = 0; k + 64 <= n; k += 64 )
+	{
+		xr[0] = _mm256_loadu_ps( xp + 0*vlen );
+		yr[0] = _mm256_loadu_ps( yp + 0*vlen );
+		xr[1] = _mm256_loadu_ps( xp + 1*vlen );
+		yr[1] = _mm256_loadu_ps( yp + 1*vlen );
+		xr[2] = _mm256_loadu_ps( xp + 2*vlen );
+		yr[2] = _mm256_loadu_ps( yp + 2*vlen );
+		xr[3] = _mm256_loadu_ps( xp + 3*vlen );
+		yr[3] = _mm256_loadu_ps( yp + 3*vlen );
+		xr[4] = _mm256_loadu_ps( xp + 4*vlen );
+		yr[4] = _mm256_loadu_ps( yp + 4*vlen );
+		xr[5] = _mm256_loadu_ps( xp + 5*vlen );
+		yr[5] = _mm256_loadu_ps( yp + 5*vlen );
+		xr[6] = _mm256_loadu_ps( xp + 6*vlen );
+		yr[6] = _mm256_loadu_ps( yp + 6*vlen );
+		xr[7] = _mm256_loadu_ps( xp + 7*vlen );
+		yr[7] = _mm256_loadu_ps( yp + 7*vlen );
+
+		_mm256_storeu_ps( xp + 0*vlen, yr[0] );
+		_mm256_storeu_ps( yp + 0*vlen, xr[0] );
+		_mm256_storeu_ps( xp + 1*vlen, yr[1] );
+		_mm256_storeu_ps( yp + 1*vlen, xr[1] );
+		_mm256_storeu_ps( xp + 2*vlen, yr[2] );
+		_mm256_storeu_ps( yp + 2*vlen, xr[2] );
+		_mm256_storeu_ps( xp + 3*vlen, yr[3] );
+		_mm256_storeu_ps( yp + 3*vlen, xr[3] );
+		_mm256_storeu_ps( xp + 4*vlen, yr[4] );
+		_mm256_storeu_ps( yp + 4*vlen, xr[4] );
+		_mm256_storeu_ps( xp + 5*vlen, yr[5] );
+		_mm256_storeu_ps( yp + 5*vlen, xr[5] );
+		_mm256_storeu_ps( xp + 6*vlen, yr[6] );
+		_mm256_storeu_ps( yp + 6*vlen, xr[6] );
+		_mm256_storeu_ps( xp + 7*vlen, yr[7] );
+		_mm256_storeu_ps( yp + 7*vlen, xr[7] );
+
+		xp += 8*vlen;
+		yp += 8*vlen;
+	}
+
+	for ( ; k + 32 <= n; k += 32 )
+	{
+		xr[0] = _mm256_loadu_ps( xp + 0*vlen );
+		yr[0] = _mm256_loadu_ps( yp + 0*vlen );
+		xr[1] = _mm256_loadu_ps( xp + 1*vlen );
+		yr[1] = _mm256_loadu_ps( yp + 1*vlen );
+		xr[2] = _mm256_loadu_ps( xp + 2*vlen );
+		yr[2] = _mm256_loadu_ps( yp + 2*vlen );
+		xr[3] = _mm256_loadu_ps( xp + 3*vlen );
+		yr[3] = _mm256_loadu_ps( yp + 3*vlen );
+
+		_mm256_storeu_ps( xp + 0*vlen, yr[0] );
+		_mm256_storeu_ps( yp + 0*vlen, xr[0] );
+		_mm256_storeu_ps( xp + 1*vlen, yr[1] );
+		_mm256_storeu_ps( yp + 1*vlen, xr[1] );
+		_mm256_storeu_ps( xp + 2*vlen, yr[2] );
+		_mm256_storeu_ps( yp + 2*vlen, xr[2] );
+		_mm256_storeu_ps( xp + 3*vlen, yr[3] );
+		_mm256_storeu_ps( yp + 3*vlen, xr[3] );
+
+		xp += 4*vlen;
+		yp += 4*vlen;
+	}
+
+	for ( ; k + 16 <= n; k += 16 )
+	{
+		xr[0] = _mm256_loadu_ps( xp + 0*vlen );
+		yr[0] = _mm256_loadu_ps( yp + 0*vlen );
+		xr[1] = _mm256_loadu_ps( xp + 1*vlen );
+		yr[1] = _mm256_loadu_ps( yp + 1*vlen );
+
+		_mm256_storeu_ps( xp + 0*vlen, yr[0] );
+		_mm256_storeu_ps( yp + 0*vlen, xr[0] );
+		_mm256_storeu_ps( xp + 1*vlen, yr[1] );
+		_mm256_storeu_ps( yp + 1*vlen, xr[1] );
+
+		xp += 2*vlen;
+		yp += 2*vlen;
+	}
+
+	for ( ; k + 8 <= n; k += 8 )
+	{
+		xr[0] = _mm256_loadu_ps( xp );
+		yr[0] = _mm256_loadu_ps( yp );
+
+		_mm256_storeu_ps( xp, yr[0] );
+		_mm256_storeu_ps( yp, xr[0] );
+
+		xp += vlen;
+		yp += vlen;
+	}
+
+	// Fewer than 8 elements remain; xp and yp already point at element k.
+	for ( ; k < n; ++k )
+	{
+		PASTEMAC(s,swaps)( (*xp), (*yp) );
+
+		++xp;
+		++yp;
+	}
+}
+
+//--------------------------------------------------------------------------------
+
+void bli_dswapv_zen_int8
+     (
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+	// Exchanges the n double-precision elements of x with those of y.
+	// The cntx argument is not referenced by this kernel.
+
+	// Number of doubles covered by one 256-bit load or store.
+	const dim_t      vlen = 4;
+	dim_t            k;
+
+	double* restrict xp = x;
+	double* restrict yp = y;
+
+	__m256d          xr[8];
+	__m256d          yr[8];
+
+	// An empty vector needs no work.
+	if ( bli_zero_dim1( n ) ) return;
+
+	// Non-unit stride: swap element by element, then we are done.
+	if ( incx != 1 || incy != 1 )
+	{
+		for ( k = 0; k < n; ++k )
+		{
+			PASTEMAC(d,swaps)( (*xp), (*yp) );
+
+			xp += incx;
+			yp += incy;
+		}
+
+		return;
+	}
+
+	// Unit stride from here on. Every loop below reads a chunk of both
+	// vectors into registers before writing either of them back, and the
+	// chunk size halves from loop to loop: 32, 16, 8, then 4 elements.
+	for ( k = 0; k + 32 <= n; k += 32 )
+	{
+		xr[0] = _mm256_loadu_pd( xp + 0*vlen );
+		yr[0] = _mm256_loadu_pd( yp + 0*vlen );
+		xr[1] = _mm256_loadu_pd( xp + 1*vlen );
+		yr[1] = _mm256_loadu_pd( yp + 1*vlen );
+		xr[2] = _mm256_loadu_pd( xp + 2*vlen );
+		yr[2] = _mm256_loadu_pd( yp + 2*vlen );
+		xr[3] = _mm256_loadu_pd( xp + 3*vlen );
+		yr[3] = _mm256_loadu_pd( yp + 3*vlen );
+		xr[4] = _mm256_loadu_pd( xp + 4*vlen );
+		yr[4] = _mm256_loadu_pd( yp + 4*vlen );
+		xr[5] = _mm256_loadu_pd( xp + 5*vlen );
+		yr[5] = _mm256_loadu_pd( yp + 5*vlen );
+		xr[6] = _mm256_loadu_pd( xp + 6*vlen );
+		yr[6] = _mm256_loadu_pd( yp + 6*vlen );
+		xr[7] = _mm256_loadu_pd( xp + 7*vlen );
+		yr[7] = _mm256_loadu_pd( yp + 7*vlen );
+
+		_mm256_storeu_pd( xp + 0*vlen, yr[0] );
+		_mm256_storeu_pd( yp + 0*vlen, xr[0] );
+		_mm256_storeu_pd( xp + 1*vlen, yr[1] );
+		_mm256_storeu_pd( yp + 1*vlen, xr[1] );
+		_mm256_storeu_pd( xp + 2*vlen, yr[2] );
+		_mm256_storeu_pd( yp + 2*vlen, xr[2] );
+		_mm256_storeu_pd( xp + 3*vlen, yr[3] );
+		_mm256_storeu_pd( yp + 3*vlen, xr[3] );
+		_mm256_storeu_pd( xp + 4*vlen, yr[4] );
+		_mm256_storeu_pd( yp + 4*vlen, xr[4] );
+		_mm256_storeu_pd( xp + 5*vlen, yr[5] );
+		_mm256_storeu_pd( yp + 5*vlen, xr[5] );
+		_mm256_storeu_pd( xp + 6*vlen, yr[6] );
+		_mm256_storeu_pd( yp + 6*vlen, xr[6] );
+		_mm256_storeu_pd( xp + 7*vlen, yr[7] );
+		_mm256_storeu_pd( yp + 7*vlen, xr[7] );
+
+		xp += 8*vlen;
+		yp += 8*vlen;
+	}
+
+	for ( ; k + 16 <= n; k += 16 )
+	{
+		xr[0] = _mm256_loadu_pd( xp + 0*vlen );
+		yr[0] = _mm256_loadu_pd( yp + 0*vlen );
+		xr[1] = _mm256_loadu_pd( xp + 1*vlen );
+		yr[1] = _mm256_loadu_pd( yp + 1*vlen );
+		xr[2] = _mm256_loadu_pd( xp + 2*vlen );
+		yr[2] = _mm256_loadu_pd( yp + 2*vlen );
+		xr[3] = _mm256_loadu_pd( xp + 3*vlen );
+		yr[3] = _mm256_loadu_pd( yp + 3*vlen );
+
+		_mm256_storeu_pd( xp + 0*vlen, yr[0] );
+		_mm256_storeu_pd( yp + 0*vlen, xr[0] );
+		_mm256_storeu_pd( xp + 1*vlen, yr[1] );
+		_mm256_storeu_pd( yp + 1*vlen, xr[1] );
+		_mm256_storeu_pd( xp + 2*vlen, yr[2] );
+		_mm256_storeu_pd( yp + 2*vlen, xr[2] );
+		_mm256_storeu_pd( xp + 3*vlen, yr[3] );
+		_mm256_storeu_pd( yp + 3*vlen, xr[3] );
+
+		xp += 4*vlen;
+		yp += 4*vlen;
+	}
+
+	for ( ; k + 8 <= n; k += 8 )
+	{
+		xr[0] = _mm256_loadu_pd( xp + 0*vlen );
+		yr[0] = _mm256_loadu_pd( yp + 0*vlen );
+		xr[1] = _mm256_loadu_pd( xp + 1*vlen );
+		yr[1] = _mm256_loadu_pd( yp + 1*vlen );
+
+		_mm256_storeu_pd( xp + 0*vlen, yr[0] );
+		_mm256_storeu_pd( yp + 0*vlen, xr[0] );
+		_mm256_storeu_pd( xp + 1*vlen, yr[1] );
+		_mm256_storeu_pd( yp + 1*vlen, xr[1] );
+
+		xp += 2*vlen;
+		yp += 2*vlen;
+	}
+
+	for ( ; k + 4 <= n; k += 4 )
+	{
+		xr[0] = _mm256_loadu_pd( xp );
+		yr[0] = _mm256_loadu_pd( yp );
+
+		_mm256_storeu_pd( xp, yr[0] );
+		_mm256_storeu_pd( yp, xr[0] );
+
+		xp += vlen;
+		yp += vlen;
+	}
+
+	// Fewer than 4 elements remain; xp and yp already point at element k.
+	for ( ; k < n; ++k )
+	{
+		PASTEMAC(d,swaps)( (*xp), (*yp) );
+
+		++xp;
+		++yp;
+	}
+}
+
--- a/kernels/zen/1f/bli_axpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c
@@ -4,8 +4,8 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin
+   Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
--- a/kernels/zen/1f/bli_dotxf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c
@@ -4,8 +4,8 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin
+   Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
--- a/kernels/zen/3/bli_trsm_small.c
+++ b/kernels/zen/3/bli_trsm_small.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c
--- a/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -32,6 +33,13 @@

 */

+// -- level-1m --
+
+// packm (double precision; 8xk and 6xk shapes, each as a "gen" and an
+// "nn" kernel)
+PACKM_KER_PROT( double,   d, packm_8xk_gen_zen )
+PACKM_KER_PROT( double,   d, packm_6xk_gen_zen )
+PACKM_KER_PROT( double,   d, packm_8xk_nn_zen )
+PACKM_KER_PROT( double,   d, packm_6xk_nn_zen )
+
+
 // -- level-1v --

 // amaxv (intrinsics)
@@ -42,17 +50,17 @@ AMAXV_KER_PROT( double,   d, amaxv_zen_int )
 AXPYV_KER_PROT( float,    s, axpyv_zen_int )
 AXPYV_KER_PROT( double,   d, axpyv_zen_int )

-	// axpyv (intrinsics unrolled x10)
-	AXPYV_KER_PROT( float,    s, axpyv_zen_int10 )
-	AXPYV_KER_PROT( double,   d, axpyv_zen_int10 )
+// axpyv (intrinsics unrolled x10)
+AXPYV_KER_PROT( float,    s, axpyv_zen_int10 )
+AXPYV_KER_PROT( double,   d, axpyv_zen_int10 )

 // dotv (intrinsics)
 DOTV_KER_PROT( float,    s, dotv_zen_int )
 DOTV_KER_PROT( double,   d, dotv_zen_int )

-	// dotv (intrinsics, unrolled x10)
-	DOTV_KER_PROT( float,    s, dotv_zen_int10 )
-	DOTV_KER_PROT( double,   d, dotv_zen_int10 )
+// dotv (intrinsics, unrolled x10)
+DOTV_KER_PROT( float,    s, dotv_zen_int10 )
+DOTV_KER_PROT( double,   d, dotv_zen_int10 )

 // dotxv (intrinsics)
 DOTXV_KER_PROT( float,    s, dotxv_zen_int )
@@ -62,9 +70,21 @@ DOTXV_KER_PROT( double,   d, dotxv_zen_int )
 SCALV_KER_PROT( float,    s, scalv_zen_int )
 SCALV_KER_PROT( double,   d, scalv_zen_int )

-	// scalv (intrinsics unrolled x10)
-	SCALV_KER_PROT( float,    s, scalv_zen_int10 )
-	SCALV_KER_PROT( double,   d, scalv_zen_int10 )
+// scalv (intrinsics unrolled x10)
+SCALV_KER_PROT( float,    s, scalv_zen_int10 )
+SCALV_KER_PROT( double,   d, scalv_zen_int10 )
+
+// swapv (intrinsics unrolled x8)
+SWAPV_KER_PROT( float,    s, swapv_zen_int8 )
+SWAPV_KER_PROT( double,   d, swapv_zen_int8 )
+
+// copyv (intrinsics)
+COPYV_KER_PROT( float,    s, copyv_zen_int )
+COPYV_KER_PROT( double,   d, copyv_zen_int )
+
+// setv (intrinsics)
+SETV_KER_PROT( float,    s, setv_zen_int )
+SETV_KER_PROT( double,   d, setv_zen_int )

 // -- level-1f --

@@ -76,3 +96,106 @@ AXPYF_KER_PROT( double,   d, axpyf_zen_int_8 )
 DOTXF_KER_PROT( float,    s, dotxf_zen_int_8 )
 DOTXF_KER_PROT( double,   d, dotxf_zen_int_8 )

+// -- level-3 sup --------------------------------------------------------------
+
+// gemmsup_rv (float)
+
+// NOTE: the plain 6x16 prototype is disabled here; only its "m" and "n"
+// counterparts further down are declared.
+//GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x16 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x8 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x4 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x2 )
+
+// gemmsup_r ("ref"-named kernels for the ?x1 shapes)
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_6x1 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_5x1 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_4x1 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_3x1 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_2x1 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_r_zen_ref_1x1 )
+
+// gemmsup_rv (mkernel in m dim)
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x16m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x8m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x4m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x2m )
+
+// gemmsup_rv (mkernel in n dim)
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x16n )
+
+// gemmsup_rd (float; plain shapes first, then the "m" and "n" kernels)
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x8 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x16 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x4 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x2 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x16m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x8m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x4m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x2m )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x16n )
+
+// gemmsup_rv (scomplex and dcomplex; "m" kernels and plain 2-/1-row shapes)
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x8m )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x4m )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x2m )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_2x8 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_1x8 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_2x4 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_1x4 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_2x2 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_1x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x4m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x2m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_2x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_2x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x2 )
+
+// gemmsup_rv (scomplex and dcomplex; "n" kernels plus the remaining plain
+// 3-row shapes)
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x8n )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_2x8n )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_1x8n )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x4 )
+GEMMSUP_KER_PROT( scomplex,   c, gemmsup_rv_zen_asm_3x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x4n )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_2x4n )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x4n )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_3x1 )
--- a/kernels/zen2/.gitignore
+++ b/kernels/zen2/.gitignore
@@ -1,4 +0,0 @@
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
--- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c
+++ b/kernels/zen2/1f/bli_axpyf_zen_int_5.c
@@ -0,0 +1,599 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* Union data structure to access AVX registers.
+   One 256-bit AVX register holds 8 SP elements, so the same storage can
+   be viewed either as a whole vector (v) or as its eight individual
+   floats (f). The array member is declared with 64-byte alignment. */
+typedef union
+{
+    __m256  v;
+    float   f[8] __attribute__((aligned(64)));
+} v8sf_t;
+
+/* Union data structure to access AVX registers.
+   One 256-bit AVX register holds 4 DP elements, so the same storage can
+   be viewed either as a whole vector (v) or as its four individual
+   doubles (d). The array member is declared with 64-byte alignment. */
+typedef union
+{
+    __m256d v;
+    double  d[4] __attribute__((aligned(64)));
+} v4df_t;
+
+
+void bli_saxpyf_zen_int_5
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       float* restrict alpha,
+       float* restrict a, inc_t inca, inc_t lda,
+       float* restrict x, inc_t incx,
+       float* restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+    // Updates the m-element vector y with b_n scaled columns of a:
+    //
+    //   y := y + sum_j ( alpha * x[j*incx] ) * a_j
+    //
+    // where a_j starts at a + j*lda and is walked with stride inca.
+    // The fused code path handles exactly five columns; any other b_n is
+    // delegated, one column at a time, to an axpyv kernel. conja and
+    // conjx are only consulted on that delegated path.
+
+    const dim_t      fuse_fac = 5;
+
+    // Floats per 256-bit register, and registers of y updated per
+    // iteration of the main vector loop.
+    const dim_t      vlen     = 8;
+    const dim_t      unroll   = 2;
+
+    dim_t            i;
+
+    float* restrict  col0;
+    float* restrict  col1;
+    float* restrict  col2;
+    float* restrict  col3;
+    float* restrict  col4;
+
+    float* restrict  yp;
+
+    __m256           c0, c1, c2, c3, c4;
+    __m256           acc0, acc1;
+
+    float            chi0, chi1, chi2, chi3, chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
+
+    // b_n differs from the fusing factor: run the whole operation as a
+    // sequence of axpyv calls, one per column of a.
+    if ( b_n != fuse_fac )
+    {
+        // zen2 builds call the x10-unrolled axpyv kernel directly; all
+        // other builds look the kernel up in the context.
+#ifdef BLIS_CONFIG_ZEN2
+        saxpyv_ker_ft axpyv_fp = bli_saxpyv_zen_int10;
+#else
+        saxpyv_ker_ft axpyv_fp = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+#endif
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            float* a_col = a + i*lda;
+            float* chi   = x + i*incx;
+            float  alpha_chi;
+
+            // alpha_chi = alpha * conjx( chi )
+            bli_scopycjs( conjx, *chi, alpha_chi );
+            bli_sscals( *alpha, alpha_chi );
+
+            axpyv_fp
+            (
+              conja,
+              m,
+              &alpha_chi,
+              a_col, inca,
+              y, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+    // From here on b_n is exactly the fusing factor.
+
+    col0 = a + 0*lda;
+    col1 = a + 1*lda;
+    col2 = a + 2*lda;
+    col3 = a + 3*lda;
+    col4 = a + 4*lda;
+    yp   = y;
+
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+    // Fold alpha into each chi scalar once, up front.
+    bli_sscals( *alpha, chi0 );
+    bli_sscals( *alpha, chi1 );
+    bli_sscals( *alpha, chi2 );
+    bli_sscals( *alpha, chi3 );
+    bli_sscals( *alpha, chi4 );
+
+    // Replicate every (alpha*chi?) value across a full vector register.
+    c0 = _mm256_broadcast_ss( &chi0 );
+    c1 = _mm256_broadcast_ss( &chi1 );
+    c2 = _mm256_broadcast_ss( &chi2 );
+    c3 = _mm256_broadcast_ss( &chi3 );
+    c4 = _mm256_broadcast_ss( &chi4 );
+
+    i = 0;
+
+    // Vector instructions are only used when both a and y have unit
+    // stride.
+    if ( inca == 1 && incy == 1 )
+    {
+        // Sixteen elements of y per iteration (two registers).
+        for ( ; i + 16 <= m; i += 16 )
+        {
+            acc0 = _mm256_loadu_ps( yp + 0*vlen );
+            acc1 = _mm256_loadu_ps( yp + 1*vlen );
+
+            // y += (alpha*chi_j) * a_j, accumulated column by column.
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col0 + 0*vlen ), c0, acc0 );
+            acc1 = _mm256_fmadd_ps( _mm256_loadu_ps( col0 + 1*vlen ), c0, acc1 );
+
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col1 + 0*vlen ), c1, acc0 );
+            acc1 = _mm256_fmadd_ps( _mm256_loadu_ps( col1 + 1*vlen ), c1, acc1 );
+
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col2 + 0*vlen ), c2, acc0 );
+            acc1 = _mm256_fmadd_ps( _mm256_loadu_ps( col2 + 1*vlen ), c2, acc1 );
+
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col3 + 0*vlen ), c3, acc0 );
+            acc1 = _mm256_fmadd_ps( _mm256_loadu_ps( col3 + 1*vlen ), c3, acc1 );
+
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col4 + 0*vlen ), c4, acc0 );
+            acc1 = _mm256_fmadd_ps( _mm256_loadu_ps( col4 + 1*vlen ), c4, acc1 );
+
+            _mm256_storeu_ps( yp + 0*vlen, acc0 );
+            _mm256_storeu_ps( yp + 1*vlen, acc1 );
+
+            yp   += unroll * vlen;
+            col0 += unroll * vlen;
+            col1 += unroll * vlen;
+            col2 += unroll * vlen;
+            col3 += unroll * vlen;
+            col4 += unroll * vlen;
+        }
+
+        // Eight elements of y per iteration (one register).
+        for ( ; i + 8 <= m; i += 8 )
+        {
+            acc0 = _mm256_loadu_ps( yp );
+
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col0 ), c0, acc0 );
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col1 ), c1, acc0 );
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col2 ), c2, acc0 );
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col3 ), c3, acc0 );
+            acc0 = _mm256_fmadd_ps( _mm256_loadu_ps( col4 ), c4, acc0 );
+
+            _mm256_storeu_ps( yp, acc0 );
+
+            yp   += vlen;
+            col0 += vlen;
+            col1 += vlen;
+            col2 += vlen;
+            col3 += vlen;
+            col4 += vlen;
+        }
+    }
+
+    // Scalar code for whatever is left: the tail of the unit-stride case
+    // (where inca and incy are both 1), or all m elements otherwise. The
+    // running sum is kept in a double and converted back to float on store.
+    for ( ; i < m; ++i )
+    {
+        double sum = *yp;
+
+        sum += chi0 * (*col0);
+        sum += chi1 * (*col1);
+        sum += chi2 * (*col2);
+        sum += chi3 * (*col3);
+        sum += chi4 * (*col4);
+
+        *yp = sum;
+
+        col0 += inca;
+        col1 += inca;
+        col2 += inca;
+        col3 += inca;
+        col4 += inca;
+        yp   += incy;
+    }
+}
+
+
+// -----------------------------------------------------------------------------
+
+void bli_daxpyf_zen_int_5
+     (
+       conj_t           conja,  // only forwarded to axpyv in the b_n != fuse_fac fallback
+       conj_t           conjx,  // only applied (bli_dcopycjs) in the b_n != fuse_fac fallback
+       dim_t            m,      // number of rows of a / elements of y that are updated
+       dim_t            b_n,    // number of columns of a / elements of x that are consumed
+       double* restrict alpha,  // scalar multiplied into every x element before use
+       double* restrict a, inc_t inca, inc_t lda,  // matrix; inca = row stride, lda = column stride
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       cntx_t* restrict cntx    // queried for the axpyv kernel (non-zen2 fallback) / passed through
+     )
+{
+    const dim_t      fuse_fac       = 5;  // columns of a handled per call by the fused path
+
+    const dim_t      n_elem_per_reg = 4;  // doubles moved per 256-bit load/store below
+    const dim_t      n_iter_unroll  = 2;  // vectors of y processed per main-loop iteration
+
+    dim_t            i;
+
+    double* restrict a0;
+    double* restrict a1;
+    double* restrict a2;
+    double* restrict a3;
+    double* restrict a4;
+
+    double* restrict y0;
+
+    v4df_t           chi0v, chi1v, chi2v, chi3v;
+    v4df_t           chi4v;
+
+    v4df_t           a00v, a01v, a02v, a03v;
+    v4df_t           a04v;
+
+    v4df_t           a10v, a11v, a12v, a13v;
+    v4df_t           a14v;
+
+    v4df_t           y0v, y1v;
+
+    double           chi0, chi1, chi2, chi3;
+    double           chi4;
+
+    // If either dimension is zero, or if alpha is zero, return early.
+    if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, then perform the entire
+    // operation as a loop over axpyv (zen kernel called directly on zen2, cntx lookup otherwise).
+    if ( b_n != fuse_fac )
+    {
+#ifdef BLIS_CONFIG_ZEN2
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;  // column i of a (shadows the outer a1)
+            double* chi1 = x + (i  )*incx;              // element i of x (shadows the outer chi1)
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );
+
+            bli_daxpyv_zen_int10
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#else
+        daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            double* a1   = a + (0  )*inca + (i  )*lda;  // column i of a (shadows the outer a1)
+            double* chi1 = x + (i  )*incx;              // element i of x (shadows the outer chi1)
+            double* y1   = y + (0  )*incy;
+            double  alpha_chi1;
+
+            bli_dcopycjs( conjx, *chi1, alpha_chi1 );
+            bli_dscals( *alpha, alpha_chi1 );
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+#endif
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+    // Point a0..a4 at the five columns of a (column j starts at a + j*lda).
+    a0   = a + 0*lda;
+    a1   = a + 1*lda;
+    a2   = a + 2*lda;
+    a3   = a + 3*lda;
+    a4   = a + 4*lda;
+    y0   = y;
+    // Fetch the five x elements (conjx is not applied on this path).
+    chi0 = *( x + 0*incx );
+    chi1 = *( x + 1*incx );
+    chi2 = *( x + 2*incx );
+    chi3 = *( x + 3*incx );
+    chi4 = *( x + 4*incx );
+
+
+    // Scale each chi scalar by alpha.
+    bli_dscals( *alpha, chi0 );
+    bli_dscals( *alpha, chi1 );
+    bli_dscals( *alpha, chi2 );
+    bli_dscals( *alpha, chi3 );
+    bli_dscals( *alpha, chi4 );
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_sd( &chi0 );
+    chi1v.v = _mm256_broadcast_sd( &chi1 );
+    chi2v.v = _mm256_broadcast_sd( &chi2 );
+    chi3v.v = _mm256_broadcast_sd( &chi3 );
+    chi4v.v = _mm256_broadcast_sd( &chi4 );
+
+    // Unit-stride case (inca == 1 and incy == 1): process 8 rows, then 4 rows,
+    // then single rows. Any other stride takes the scalar loop in the else branch.
+    if ( inca == 1 && incy == 1 )
+    {
+        for ( i = 0; (i + 7) < m; i += 8 )  // two 4-double vectors of y per iteration
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
+
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
+
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
+
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
+
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+            a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
+
+            // perform : y += a0*chi0 + a1*chi1 + ... + a4*chi4 (alpha already folded into chi?);
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
+
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+            y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
+
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
+
+            y0 += n_iter_unroll * n_elem_per_reg;
+            a0 += n_iter_unroll * n_elem_per_reg;
+            a1 += n_iter_unroll * n_elem_per_reg;
+            a2 += n_iter_unroll * n_elem_per_reg;
+            a3 += n_iter_unroll * n_elem_per_reg;
+            a4 += n_iter_unroll * n_elem_per_reg;
+        }
+
+        for( ; (i + 3) < m; i += 4 )  // one 4-double vector of y per iteration
+        {
+            // Load the input values.
+            y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
+            a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
+            a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
+            a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
+            a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
+
+
+            // perform : y += a0*chi0 + a1*chi1 + ... + a4*chi4 (alpha already folded into chi?);
+            y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
+            y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
+
+            // Store the output.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+            a1 += n_elem_per_reg;
+            a2 += n_elem_per_reg;
+            a3 += n_elem_per_reg;
+            a4 += n_elem_per_reg;
+        }
+    
+        // If there are leftover iterations, perform them with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            a1 += 1;
+            a2 += 1;
+            a3 += 1;
+            a4 += 1;
+            y0 += 1;
+        }
+    }
+    else  // non-unit inca and/or incy: one row per iteration, stepping by the given strides
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            double       y0c = *y0;
+
+            const double a0c = *a0;
+            const double a1c = *a1;
+            const double a2c = *a2;
+            const double a3c = *a3;
+            const double a4c = *a4;
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            a1 += inca;
+            a2 += inca;
+            a3 += inca;
+            a4 += inca; 
+            y0 += incy;
+        }
+
+    }
+}
+
--- a/kernels/zen2/bli_kernels_zen2.h
+++ b/kernels/zen2/bli_kernels_zen2.h
@@ -0,0 +1,40 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// -- level-1f -- (axpyf kernels, one per real datatype)
+// NOTE(review): AXPYF_KER_PROT is defined elsewhere; presumably it declares bli_<ch>axpyf_zen_int_5 — confirm.
+AXPYF_KER_PROT( float,    s, axpyf_zen_int_5 )
+AXPYF_KER_PROT( double,   d, axpyf_zen_int_5 )
+
--- a/ref_kernels/bli_cntx_ref.c
+++ b/ref_kernels/bli_cntx_ref.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -463,7 +463,8 @@ void GENBARNAME(cntx_init)
 	// operation.

 	// Set the gemm slot to the default gemm sup handler.
-	vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref;
+	vfuncs[ BLIS_GEMM ]  = bli_gemmsup_ref;
+	vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;


 	// -- Set level-3 small/unpacked micro-kernels and preferences -------------
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
+#  Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -97,16 +97,11 @@ endif
 BLAS_LIB_PATH  := $(HOME)/flame/lib
 #MKL_LIB_PATH   := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
 #MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
-MKL_LIB_PATH   := ${MKLROOT}/lib/intel64
-#ESSL_LIB_PATH  := $(HOME)/path/to/essl/changeme
+MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64

 # OpenBLAS
 OPENBLAS_LIB   := $(BLAS_LIB_PATH)/libopenblas.a

-# ATLAS
-ATLAS_LIB      := $(BLAS_LIB_PATH)/libf77blas.a \
-                  $(BLAS_LIB_PATH)/libatlas.a
-
 # MKL
 MKL_LIB        := -L$(MKL_LIB_PATH) \
                  -lmkl_intel_lp64 \
@@ -114,18 +109,6 @@ MKL_LIB        := -L$(MKL_LIB_PATH) \
                  -lmkl_sequential \
                  -lpthread -lm -ldl

-# ESSL
-# Note: ESSL is named differently for SMP and/or BG
-#ESSL_TYPE      :=       # This is the 32b library on POWER
-#ESSL_TYPE      := 6464  # This is the 64b library on POWER
-#ESSL_TYPE      := bg    # This is the 32b single-threaded library on Blue Gene
-#ESSL_TYPE      := smpbg # This is the 32b multi-threaded library on Blue Gene
-#ESSL_LIB       := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
-
-# Accelerate
-MAC_LIB        := -framework Accelerate
-
-

 #
 # --- General build definitions ------------------------------------------------
@@ -159,121 +142,32 @@ CFLAGS         += -I$(TEST_SRC_PATH)
 # --- Targets/rules ------------------------------------------------------------
 #

-# Complete list of possible targets when defining 'all':
-#
-#   blis openblas atlas mkl mac essl
-#
-#all: blis openblas atlas mkl
+# Operations for which a test driver executable is built.
+TEST_OPS := dotv axpyv
+TEST_OPS += gemv ger hemv her her2 trmv trsv
+TEST_OPS += gemm hemm herk her2k trmm trsm
+
+# gemmt is opt-in (BUILD_GEMMT=yes) because some libraries might not implement it.
+ifeq ($(BUILD_GEMMT),yes)
+TEST_OPS += gemmt
+endif
+
+# test-bins: expands to test_<op>_$(1).x for every op in TEST_OPS.
+test-bins = $(addsuffix _$(1).x,$(addprefix test_,$(TEST_OPS)))
+
+# Executable lists, one per implementation.
+TEST_BINS_BLIS     := $(call test-bins,blis)
+TEST_BINS_OPENBLAS := $(call test-bins,openblas)
+TEST_BINS_MKL      := $(call test-bins,mkl)
+
+
 all: blis openblas mkl

-blis: check-env \
-       test_dotv_blis.x \
-       test_axpyv_blis.x \
-       test_gemv_blis.x \
-       test_ger_blis.x \
-       test_hemv_blis.x \
-       test_her_blis.x \
-       test_her2_blis.x \
-       test_trmv_blis.x \
-       test_trsv_blis.x \
-       \
-       test_gemm_blis.x \
-       test_hemm_blis.x \
-       test_herk_blis.x \
-       test_her2k_blis.x \
-       test_trmm_blis.x \
-       test_trsm_blis.x
+blis: check-env $(TEST_BINS_BLIS)

-openblas: check-env \
-      test_dotv_openblas.x \
-      test_axpyv_openblas.x \
-      test_gemv_openblas.x \
-      test_ger_openblas.x \
-      test_hemv_openblas.x \
-      test_her_openblas.x \
-      test_her2_openblas.x \
-      test_trmv_openblas.x \
-      test_trsv_openblas.x \
-      \
-      test_gemm_openblas.x \
-      test_hemm_openblas.x \
-      test_herk_openblas.x \
-      test_her2k_openblas.x \
-      test_trmm_openblas.x \
-      test_trsm_openblas.x
-
-atlas: check-env \
-      test_dotv_atlas.x \
-      test_axpyv_atlas.x \
-      test_gemv_atlas.x \
-      test_ger_atlas.x \
-      test_hemv_atlas.x \
-      test_her_atlas.x \
-      test_her2_atlas.x \
-      test_trmv_atlas.x \
-      test_trsv_atlas.x \
-      \
-      test_gemm_atlas.x \
-      test_hemm_atlas.x \
-      test_herk_atlas.x \
-      test_her2k_atlas.x \
-      test_trmm_atlas.x \
-      test_trsm_atlas.x
-
-mkl:  check-env \
-      test_dotv_mkl.x \
-      test_axpyv_mkl.x \
-      test_gemv_mkl.x \
-      test_ger_mkl.x \
-      test_hemv_mkl.x \
-      test_her_mkl.x \
-      test_her2_mkl.x \
-      test_trmv_mkl.x \
-      test_trsv_mkl.x \
-      \
-      test_gemm_mkl.x \
-      test_hemm_mkl.x \
-      test_herk_mkl.x \
-      test_her2k_mkl.x \
-      test_trmm_mkl.x \
-      test_trsm_mkl.x
-
-essl: check-env \
-      test_dotv_essl.x \
-      test_axpyv_essl.x \
-      test_gemv_essl.x \
-      test_ger_essl.x \
-      test_hemv_essl.x \
-      test_her_essl.x \
-      test_her2_essl.x \
-      test_trmv_essl.x \
-      test_trsv_essl.x \
-      \
-      test_gemm_essl.x \
-      test_hemm_essl.x \
-      test_herk_essl.x \
-      test_her2k_essl.x \
-      test_trmm_essl.x \
-      test_trsm_essl.x
-
-mac:  check-env \
-      test_dotv_mac.x \
-      test_axpyv_mac.x \
-      test_gemv_mac.x \
-      test_ger_mac.x \
-      test_hemv_mac.x \
-      test_her_mac.x \
-      test_her2_mac.x \
-      test_trmv_mac.x \
-      test_trsv_mac.x \
-      \
-      test_gemm_mac.x \
-      test_hemm_mac.x \
-      test_herk_mac.x \
-      test_her2k_mac.x \
-      test_trmm_mac.x \
-      test_trsm_mac.x
+openblas: check-env $(TEST_BINS_OPENBLAS)

+mkl: check-env $(TEST_BINS_MKL)


 # --Object file rules --
@@ -281,21 +175,13 @@ mac:  check-env \
 $(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
 	$(CC) $(CFLAGS) -c $< -o $@

+
 test_%_openblas.o: test_%.c
 	$(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@

-test_%_atlas.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@
-
 test_%_mkl.o: test_%.c
 	$(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@

-test_%_essl.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@
-
-test_%_mac.o: test_%.c
-	$(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@
-
 test_%_blis.o: test_%.c
 	$(CC) $(CFLAGS) -DBLIS -c $< -o $@

@@ -310,18 +196,9 @@ test_%_blis.o: test_%.c
 test_%_openblas.x: test_%_openblas.o $(LIBBLIS_LINK)
 	$(LINKER) $<             $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@

-test_%_atlas.x: test_%_atlas.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(ATLAS_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
 test_%_mkl.x: test_%_mkl.o $(LIBBLIS_LINK)
 	$(LINKER) $<             $(MKL_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@

-test_%_essl.x: test_%_essl.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(ESSL_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
-test_%_mac.x: test_%_mac.o $(LIBBLIS_LINK)
-	$(LINKER) $<             $(MAC_LIB)      $(LIBBLIS_LINK) $(LDFLAGS) -o $@
-
 test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK)
 	$(LINKER) $<                             $(LIBBLIS_LINK) $(LDFLAGS) -o $@

--- a/test/other/test_copyv.c
+++ b/test/other/test_copyv.c
@@ -0,0 +1,218 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+
+
+//#define BLIS_ACCURACY_TEST
+#ifdef BLIS_ACCURACY_TEST
+
+bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
+	// Walk both strided float vectors in lockstep; report and stop at the first mismatch.
+	for (int idx = 0; idx < n; ++idx, x += incx, y += incy) {
+		if (*x == *y)
+			continue;
+		printf("%4f != %4f at location %d\n", *x, *y, idx);
+		return FALSE;
+	}
+	// All n element pairs compared equal.
+	return TRUE;
+}
+
+bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
+	// Walk both strided double vectors in lockstep; report and stop at the first mismatch.
+	for (int idx = 0; idx < n; ++idx, x += incx, y += incy) {
+		if (*x == *y)
+			continue;
+		printf("%4f != %4f at location %d\n", *x, *y, idx);
+		return FALSE;
+	}
+	// All n element pairs compared equal.
+	return TRUE;
+}
+#endif
+
+
+int main(int argc, char** argv)
+{
+	obj_t x, y;
+	dim_t n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input, sizeof_dt;
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double Gbps;
+
+	//bli_init();
+	// Driver: times copyv over a sweep of vector lengths and prints one GB/s figure per length.
+	n_repeats = 100000;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end = 100000;
+	p_inc = 200;
+
+	n_input = -1;	// negative: n = p * |n_input|; non-negative: n is fixed at n_input
+#else
+	p_begin = 16;
+	p_end = 16;
+	p_inc = 1;
+
+	n_input = 16;
+#endif
+
+#if 1
+	 // dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	// Bytes per element, queried from BLIS so that it is defined for every
+	// datatype selectable above (the previous float/double-only chain left
+	// sizeof_dt uninitialized when a complex dt was chosen).
+	sizeof_dt = (int)bli_dt_size(dt);
+
+	printf("executable\t n\t GBs per sec\n");
+	for (p = p_begin; p <= p_end; p += p_inc)
+	{
+
+		if (n_input < 0) n = p * (dim_t)abs(n_input);
+		else               n = (dim_t)n_input;
+
+		bli_obj_create(dt, n, 1, 0, 0, &x);
+		bli_obj_create(dt, n, 1, 0, 0, &y);
+		bli_randm(&x);
+
+
+		dtime_save = DBL_MAX;
+
+		for (r = 0; r < n_repeats; ++r)
+		{
+			dtime = bli_clock();
+
+#ifdef BLIS
+			bli_copyv(&x,
+				&y
+			);
+#else
+			if (bli_is_float(dt))
+			{
+				f77_int nn = bli_obj_length(&x);
+				f77_int incx = bli_obj_vector_inc(&x);
+				float*  xp = bli_obj_buffer(&x);
+				f77_int incy = bli_obj_vector_inc(&y);
+				float*  yp = bli_obj_buffer(&y);
+
+				scopy_(&nn,
+					xp, &incx,
+					yp, &incy);
+
+			}
+			else if (bli_is_double(dt))
+			{
+
+				f77_int  nn = bli_obj_length(&x);
+				f77_int  incx = bli_obj_vector_inc(&x);
+				double*  xp = bli_obj_buffer(&x);
+				f77_int incy = bli_obj_vector_inc(&y);
+				double*  yp = bli_obj_buffer(&y);
+
+				dcopy_(&nn,
+					xp, &incx,
+					yp, &incy
+				);
+			}
+#endif
+			dtime_save = bli_clock_min_diff(dtime_save, dtime);
+#ifdef BLIS_ACCURACY_TEST
+			if (dt == BLIS_FLOAT) {
+				int nn = bli_obj_length(&x);
+				int incx = bli_obj_vector_inc(&x);
+				float*  xp = bli_obj_buffer(&x);
+				int incy = bli_obj_vector_inc(&y);
+				float*  yp = bli_obj_buffer(&y);
+				if (scompare_result(nn, xp, incx, yp, incy))
+					printf("Copy Successful\n");
+				else
+					printf("ALERT!!! Copy Failed\n");
+			}
+			if (dt == BLIS_DOUBLE) {
+				int nn = bli_obj_length(&x);
+				int incx = bli_obj_vector_inc(&x);
+				double*  xp = bli_obj_buffer(&x);
+				int incy = bli_obj_vector_inc(&y);
+				double*  yp = bli_obj_buffer(&y);
+				if (dcompare_result(nn, xp, incx, yp, incy))
+					printf("Copy Successful\n");
+				else
+					printf("ALERT!!! Copy Failed\n");
+			}
+#endif
+		}
+		// Coarsen the step (to 1000, then to 10000) as p grows, to cover a wide range of sizes.
+		if (p >= 1000)
+			p_inc = 1000;
+
+		if (p >= 10000)
+			p_inc = 10000;
+		Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);	// bytes of one vector / (dtime_save * 1e9)
+#ifdef BLIS
+		printf("data_copyv_blis\t");
+#else
+		printf("data_copyv_%s\t", BLAS);
+#endif
+		printf("%4lu\t %7.2f\n", 
+			(unsigned long)n, Gbps);
+
+		bli_obj_free(&x);
+		bli_obj_free(&y);
+	}
+
+	//	bli_finalize();
+
+	return 0;
+}
+
--- a/test/other/test_gemm.c
+++ b/test/other/test_gemm.c
@@ -0,0 +1,392 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//#define FILE_IN_OUT
+//#define PRINT
+//#define MATRIX_INITIALISATION
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input, k_input;
+	num_t dt;
+	int   r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+	f77_char f77_transa;
+	f77_char f77_transb;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+#ifdef FILE_IN_OUT
+	FILE* fin  = NULL;
+	FILE* fout = NULL;
+	char gemm = 's';
+
+#endif
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;	// timed runs per problem size
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end   = 2000;
+	p_inc   = 200;
+
+	m_input = -1;	// negative: dimension = p * |input|; non-negative: dimension fixed at input
+	n_input = -1;
+	k_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	m_input = 5;
+	k_input = 6;
+	n_input = 4;
+#endif
+
+#if 1
+	//dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+
+#ifdef FILE_IN_OUT
+	if (argc < 3)
+	  {
+	    printf("Usage: ./test_gemm_XX.x input.csv output.csv\n");
+	    exit(1);
+	  }
+	fin = fopen(argv[1], "r");
+	if (fin == NULL)
+	  {
+	    printf("Error opening the file %s\n", argv[1]);
+	    exit(1);
+	  }
+	fout = fopen(argv[2], "w");
+	if (fout == NULL)
+	  {
+	    printf("Error opening output file %s\n", argv[2]);
+	    exit(1);
+	  }
+	fprintf(fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n");
+
+
+	printf("~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n");
+
+	inc_t cs_a;
+	inc_t cs_b;
+	inc_t cs_c;
+	// NOTE(review): %lld assumes dim_t and inc_t are long long-compatible 64-bit integers — confirm.
+	while (fscanf(fin, "%lld %lld %lld %lld %lld %lld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6)
+	  {
+	    if ((m > cs_a) || (k > cs_b) || (m > cs_c)) continue; // skip inputs whose column stride is smaller than the number of rows
+	    
+	    bli_obj_create( dt, 1, 1, 0, 0, &alpha);
+	    bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+	    bli_obj_create( dt, m, k, 1, cs_a, &a );
+	    bli_obj_create( dt, k, n, 1, cs_b, &b );
+	    bli_obj_create( dt, m, n, 1, cs_c, &c );
+	    bli_obj_create( dt, m, n, 1, cs_c, &c_save );
+#ifdef MATRIX_INITIALISATION
+	    bli_randm( &a );
+	    bli_randm( &b );
+	    bli_randm( &c );
+#endif
+	    bli_obj_set_conjtrans( transa, &a);
+	    bli_obj_set_conjtrans( transb, &b);
+
+	    //bli_setsc( 0.0, -1, &alpha );
+	    //bli_setsc( 0.0, 1, &beta );
+
+	    bli_setsc( -1, 0.0, &alpha );
+	    bli_setsc( 1, 0.0, &beta );
+
+#else
+	for ( p = p_begin; p <= p_end; p += p_inc )
+	{
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt, m, k, 0, 0, &a );
+		bli_obj_create( dt, k, n, 0, 0, &b );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (0.9/1.0), 0.2, &alpha );
+		bli_setsc( -(1.1/1.0), 0.3, &beta );
+
+#endif
+		bli_copym( &c, &c_save );
+	
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );	// restore the saved c before each timed run
+
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_gemm( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#else
+
+		if ( bli_is_float( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			float*   alphap = bli_obj_buffer( &alpha );
+			float*   ap     = bli_obj_buffer( &a );
+			float*   bp     = bli_obj_buffer( &b );
+			float*   betap  = bli_obj_buffer( &beta );
+			float*   cp     = bli_obj_buffer( &c );
+
+			sgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+		else if ( bli_is_double( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			double*  alphap = bli_obj_buffer( &alpha );
+			double*  ap     = bli_obj_buffer( &a );
+			double*  bp     = bli_obj_buffer( &b );
+			double*  betap  = bli_obj_buffer( &beta );
+			double*  cp     = bli_obj_buffer( &c );
+
+			dgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+		else if ( bli_is_scomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			scomplex*  alphap = bli_obj_buffer( &alpha );
+			scomplex*  ap     = bli_obj_buffer( &a );
+			scomplex*  bp     = bli_obj_buffer( &b );
+			scomplex*  betap  = bli_obj_buffer( &beta );
+			scomplex*  cp     = bli_obj_buffer( &c );
+
+			cgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+		else if ( bli_is_dcomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			dcomplex*  alphap = bli_obj_buffer( &alpha );
+			dcomplex*  ap     = bli_obj_buffer( &a );
+			dcomplex*  bp     = bli_obj_buffer( &b );
+			dcomplex*  betap  = bli_obj_buffer( &beta );
+			dcomplex*  cp     = bli_obj_buffer( &c );
+
+			zgemm_( &f77_transa,
+			        &f77_transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+			// NOTE(review): presumably keeps the smallest elapsed time seen so far — confirm against bli_clock_min_diff.
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+		// 2*m*k*n operations are counted per gemm; the count is scaled by 4 below for complex datatypes.
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_gemm_blis" );
+#else
+		printf( "data_gemm_%s", BLAS );
+#endif
+
+
+#ifdef FILE_IN_OUT
+
+		if ( bli_is_double( dt ) ) {
+
+		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES/4))  || ((m  < (BLIS_SMALL_M_RECT_MATRIX_THRES/2) ) && (k < (BLIS_SMALL_K_RECT_MATRIX_THRES/2) )))
+		    gemm = 'S';      // small gemm
+		  else gemm = 'N';   // Normal blis gemm
+		  
+		}
+		else if (bli_is_float( dt )) {
+		  if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES))  || ((m  < BLIS_SMALL_M_RECT_MATRIX_THRES) && (k < BLIS_SMALL_K_RECT_MATRIX_THRES)))
+		    gemm = 'S';    // small gemm
+		  else gemm = 'N'; // normal blis gemm
+		}
+		
+		
+
+		printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
+			( unsigned long )m,
+		        ( unsigned long )k,
+		       ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
+		
+		
+		fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
+			( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c,  gflops, gemm );
+		fflush(fout);
+
+#else
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+#endif
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+#ifdef FILE_IN_OUT
+    fclose(fin);
+    fclose(fout);
+#endif
+	return 0;
+}
+
--- a/test/other/test_scalv.c
+++ b/test/other/test_scalv.c
@@ -0,0 +1,154 @@
+/*
+
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name of The University of Texas at Austin nor the names
+	  of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+
+//#define PRINT
+
+// Benchmark driver for scalv: times bli_scalm() when BLIS is defined, or the
+// BLAS sscal_()/dscal_() routines otherwise; prints one line per vector length.
+int main(int argc, char** argv)
+{
+	obj_t a, alpha;
+	dim_t n, p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input;
+	num_t dt;
+	int   r, n_repeats;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	//bli_init();
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 100000;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end = 100000;
+	p_inc = 200;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end = 16;
+	p_inc = 1;
+
+	n_input = 4;
+#endif
+
+#if 1
+    dt = BLIS_FLOAT;
+    //dt = BLIS_DOUBLE;
+#else
+    //dt = BLIS_SCOMPLEX;
+      dt = BLIS_DCOMPLEX;
+#endif
+#ifdef BLIS
+	printf( "data_scalv_blis\t n\t gflops\n" );
+#else
+	printf( "data_scalv_%s\t n\t gflops\n", BLAS );
+#endif
+
+	for (p = p_begin; p <= p_end; p += p_inc)
+	{
+		if (n_input < 0) n = p * (dim_t)abs(n_input);
+		else               n = (dim_t)n_input;
+
+		bli_obj_create(dt, 1, 1, 0, 0, &alpha);
+		bli_obj_create(dt, 1, n, 0, 0, &a);
+
+		bli_randm(&a);
+		bli_setsc((2.0), 0.0, &alpha);
+		dtime_save = DBL_MAX;
+
+		for (r = 0; r < n_repeats; ++r)
+		{
+			dtime = bli_clock();
+#ifdef BLIS
+			bli_scalm(&BLIS_TWO, &a);
+#else
+			// a is 1 x n, so bli_obj_length() would be 1; pass the vector dimension.
+			if ( bli_is_float( dt ) )
+			{
+				f77_int nn     = bli_obj_vector_dim( &a );
+				f77_int inca   = bli_obj_vector_inc( &a );
+				float*  scalar = bli_obj_buffer( &alpha );
+				float*  ap     = bli_obj_buffer( &a );
+
+				sscal_( &nn, scalar,
+				        ap, &inca );
+			}
+			else if ( bli_is_double( dt ) )
+			{
+				f77_int  nn     = bli_obj_vector_dim( &a );
+				f77_int  inca   = bli_obj_vector_inc( &a );
+				double*  scalar = bli_obj_buffer( &alpha );
+				double*  ap     = bli_obj_buffer( &a );
+
+				dscal_( &nn, scalar,
+				        ap, &inca );
+			}
+#endif
+			dtime_save = bli_clock_min_diff(dtime_save, dtime);
+		}
+		// Coarsen the step as the sizes grow (200 -> 1000 -> 10000) so that a
+		// wide range of vector lengths is covered by the sweep.
+		if (p == 10000) p_inc = 10000;
+		if (p == 1000)  p_inc = 1000;
+
+		gflops = n / (dtime_save * 1.0e9);
+#ifdef BLIS
+		printf( "data_scalv_blis\t" );
+#else
+		printf( "data_scalv_%s\t", BLAS );
+#endif
+		printf(" %4lu\t %7.2f \n",
+			(unsigned long)n, gflops);
+
+		bli_obj_free(&alpha);
+		bli_obj_free(&a);
+	}
+	return 0;
+}
+
+
+
--- a/test/other/test_swapv.c
+++ b/test/other/test_swapv.c
@@ -0,0 +1,185 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//               n     x      incx      y        incy
+//void  dswap_( int*, double*, int*, double*,   int* );
+//#define PRINT
+
+// Benchmark driver for the vector swap operation. Builds that define BLIS time
+// bli_swapv(); all other builds time the BLAS routines sswap_() and dswap_().
+int main( int argc, char** argv )
+{
+    obj_t  vx, vy;
+    num_t  dt;
+    dim_t  len;
+    dim_t  psize, p_begin, p_end, p_inc;
+    int    n_input;
+    int    rep;
+    int    n_repeats = 3;
+
+    double t_start;
+    double t_best;
+    double rate;
+
+    bli_init();
+
+    // Sweep parameters. A negative n_input scales the sweep value by
+    // |n_input|; a non-negative one would be used as the vector length.
+#ifdef PRINT
+    p_begin = 16;
+    p_end   = 16;
+    p_inc   = 1;
+
+    n_input = -1;
+#else
+    p_begin = 40;
+    p_end   = 8000;
+    p_inc   = 40;
+
+    n_input = -1;
+#endif
+
+    // The datatype is chosen at compile time by editing this block.
+#if 1
+    dt = BLIS_FLOAT;
+    //dt = BLIS_DOUBLE;
+#else
+    //dt = BLIS_SCOMPLEX;
+    dt = BLIS_DCOMPLEX;
+#endif
+
+    // Print the highest-indexed entry first, zero-filled, so that matlab
+    // allocates space for the entire array once up-front.
+    psize = p_begin;
+    while ( psize + p_inc <= p_end ) psize += p_inc;
+
+#ifdef BLIS
+    printf( "data_swapv_blis" );
+#else
+    printf( "data_swapv_%s", BLAS );
+#endif
+    printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+            ( unsigned long )(psize - p_begin)/p_inc + 1,
+            ( unsigned long )0, 0.0 );
+
+    // Visit the problem sizes from largest to smallest.
+    for ( psize = p_end; psize >= p_begin; psize -= p_inc )
+    {
+        len = ( n_input < 0 ) ? psize * ( dim_t )abs( n_input )
+                              : ( dim_t )n_input;
+
+        // Create two len x 1 objects and fill them with random values.
+        bli_obj_create( dt, len, 1, 0, 0, &vx );
+        bli_obj_create( dt, len, 1, 0, 0, &vy );
+
+        bli_randm( &vx );
+        bli_randm( &vy );
+
+        // Fold each repetition's timing into t_best via bli_clock_min_diff().
+        t_best = 1.0e9;
+        rep    = 0;
+
+        while ( rep < n_repeats )
+        {
+            t_start = bli_clock();
+
+#ifdef PRINT
+            bli_printm( "x", &vx, "%4.1f", "" );
+            bli_printm( "y", &vy, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+            bli_swapv( &vx, &vy );
+#else
+            if ( bli_is_float( dt ) )
+            {
+                f77_int count = bli_obj_length( &vx );
+                f77_int xstep = bli_obj_vector_inc( &vx );
+                f77_int ystep = bli_obj_vector_inc( &vy );
+                float*  xbuf  = bli_obj_buffer( &vx );
+                float*  ybuf  = bli_obj_buffer( &vy );
+
+                sswap_( &count, xbuf, &xstep, ybuf, &ystep );
+            }
+            else if ( bli_is_double( dt ) )
+            {
+                f77_int count = bli_obj_length( &vx );
+                f77_int xstep = bli_obj_vector_inc( &vx );
+                f77_int ystep = bli_obj_vector_inc( &vy );
+                double* xbuf  = bli_obj_buffer( &vx );
+                double* ybuf  = bli_obj_buffer( &vy );
+
+                dswap_( &count, xbuf, &xstep, ybuf, &ystep );
+            }
+#endif
+
+#ifdef PRINT
+            bli_printm( "X after", &vx, "%4.1f", "" );
+            bli_printm( "Y after", &vy, "%4.1f", "" );
+
+            // Print mode stops after the first swap has been shown.
+            exit(1);
+#endif
+
+            t_best = bli_clock_min_diff( t_best, t_start );
+            ++rep;
+        }
+
+        // One unit of work is counted per vector element.
+        rate = ( len ) / ( t_best * 1.0e9 );
+
+#ifdef BLIS
+        printf( "data_swapv_blis" );
+#else
+        printf( "data_swapv_%s", BLAS );
+#endif
+        printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+                ( unsigned long )(psize - p_begin)/p_inc + 1,
+                ( unsigned long )len, rate );
+
+        bli_obj_free( &vx );
+        bli_obj_free( &vy );
+    }
+
+    bli_finalize();
+
+    return 0;
+}
--- a/test/other/test_trsm.c
+++ b/test/other/test_trsm.c
@@ -0,0 +1,443 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//#define FILE_IN_OUT
+#ifdef FILE_IN_OUT
+//#define READ_ALL_PARAMS_FROM_FILE
+#endif
+//#define PRINT
+
+// Benchmark driver for trsm: times bli_trsm() when BLIS is defined, or the BLAS
+// ?trsm_() routine matching dt otherwise, and prints one result line per problem.
+int main( int argc, char** argv )
+{
+	obj_t a, c;
+	obj_t c_save;
+	obj_t alpha;
+	dim_t m, n;
+	num_t dt;
+	int   r, n_repeats;
+	side_t side;
+	uplo_t uploa;
+	trans_t transa;
+	diag_t diaga;
+	f77_char f77_side;
+	f77_char f77_uploa;
+	f77_char f77_transa;
+	f77_char f77_diaga;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+#ifdef FILE_IN_OUT
+	FILE* fin = NULL;
+	FILE* fout = NULL;
+#else
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input;
+
+	//bli_init();
+
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end   = 2000;
+	p_inc   = 200;
+
+	m_input = -1;
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	m_input = 4;
+	n_input = 4;
+#endif
+#endif
+
+	n_repeats = 3;
+
+#if 1
+	//dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+#ifdef FILE_IN_OUT
+	if(argc < 3)
+	{
+		printf("Usage: ./test_trsm_XX.x input.csv output.csv\n");
+		exit(1);
+	}
+	fin = fopen(argv[1], "r");
+	if(fin == NULL)
+	{
+		printf("Error opening the file %s\n", argv[1]);
+		exit(1);
+	}
+
+	fout = fopen(argv[2], "w");
+	if(fout == NULL)
+	{
+		printf("Error opening the file %s\n", argv[2]);
+		exit(1);
+	}
+	inc_t cs_a;
+	inc_t cs_b;
+#ifdef READ_ALL_PARAMS_FROM_FILE
+	char side_c, uploa_c, transa_c, diaga_c;
+
+	fprintf(fout, "side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");
+
+	printf("~~~~~~~_BLAS\t side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");
+
+	while(fscanf(fin, "%c %c %c %c %ld %ld %ld %ld\n", &side_c, &uploa_c, &transa_c, &diaga_c, &m, &n, &cs_a, &cs_b) == 8)
+	{
+
+	if( 'l' == side_c|| 'L' == side_c)
+		side = BLIS_LEFT;
+	else if('r' == side_c || 'R' == side_c)
+		side = BLIS_RIGHT;
+	else
+	{
+		printf("Invalid entry for the argument 'side':%c\n",side_c);
+		continue;
+	}
+
+	if('l' == uploa_c || 'L' == uploa_c)
+		uploa = BLIS_LOWER;
+	else if('u' == uploa_c || 'U' == uploa_c)
+		uploa = BLIS_UPPER;
+	else
+	{
+		printf("Invalid entry for the argument 'uplo':%c\n",uploa_c);
+		continue;
+	}
+
+	if('t' == transa_c || 'T' == transa_c)
+		transa = BLIS_TRANSPOSE;
+	else if('n' == transa_c || 'N' == transa_c)
+		transa = BLIS_NO_TRANSPOSE;
+	else
+	{
+		printf("Invalid entry for the argument 'transa':%c\n",transa_c);
+		continue;
+	}
+
+	if('u' == diaga_c || 'U' == diaga_c)
+		diaga = BLIS_UNIT_DIAG;
+	else if('n' == diaga_c || 'N' == diaga_c)
+		diaga = BLIS_NONUNIT_DIAG;
+	else
+	{
+		printf("Invalid entry for the argument 'diaga':%c\n", diaga_c);
+		continue;
+	}
+#else
+
+	fprintf(fout, "m\t n\t cs_a\t cs_b\t gflops\n");
+
+	printf("~~~~~~~_BLAS\t m\t n\t cs_a\t cs_b\t gflops\n");
+
+	while(fscanf(fin, "%ld %ld %ld %ld\n", &m, &n, &cs_a, &cs_b) == 4)
+	{
+
+	side = BLIS_LEFT;
+	//side = BLIS_RIGHT;
+
+	uploa = BLIS_LOWER;
+	//uploa = BLIS_UPPER;
+
+	transa = BLIS_NO_TRANSPOSE;
+
+	diaga = BLIS_NONUNIT_DIAG;
+
+
+#endif
+	// Map the BLIS parameter enums to the characters passed to the ?trsm_() calls.
+	bli_param_map_blis_to_netlib_side( side, &f77_side );
+	bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
+		// Skip inputs whose leading dimensions are too small for the operands.
+		if(bli_is_left(side) && ((m > cs_a) || (m > cs_b))) continue; // A is m x m, C is m x n
+
+		if(bli_is_right(side) && ((n > cs_a) || (m > cs_b))) continue; // A is n x n, C is m x n
+		// Create the operands with the leading dimensions read from the input file.
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt, m, m, 1, cs_a, &a );
+		else
+			bli_obj_create( dt, n, n, 1, cs_a, &a );
+		bli_obj_create( dt, m, n, 1, cs_b, &c );
+		bli_obj_create( dt, m, n, 1, cs_b, &c_save );
+
+#else
+
+	for ( p = p_end; p >= p_begin; p -= p_inc )
+	{
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+
+
+		side = BLIS_LEFT;
+		//side = BLIS_RIGHT;
+
+		uploa = BLIS_LOWER;
+		//uploa = BLIS_UPPER;
+
+		transa = BLIS_NO_TRANSPOSE;
+
+		diaga = BLIS_NONUNIT_DIAG;
+
+		bli_param_map_blis_to_netlib_side( side, &f77_side );
+		bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
+		bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+		bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
+
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt, m, m, 0, 0, &a );
+		else
+			bli_obj_create( dt, n, n, 0, 0, &a );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+#endif
+
+		bli_randm( &a );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
+		bli_obj_set_uplo( uploa, &a );
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_diag( diaga, &a );
+
+		// Randomize A and zero the unstored triangle to ensure the
+		// implementation reads only from the stored region.
+		bli_randm( &a );
+		bli_mktrim( &a );
+
+		// Load the diagonal of A to make it more likely to be invertible.
+		bli_shiftd( &BLIS_TWO, &a );
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_setsc(  (2.0/1.0), 1.0, &alpha );
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+			bli_invertd( &a );
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_invertd( &a );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_trsm( side,
+			          &alpha,
+			          &a,
+			          &c );
+#else
+
+		if ( bli_is_float( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			float*   alphap = bli_obj_buffer( &alpha );
+			float*   ap     = bli_obj_buffer( &a );
+			float*   cp     = bli_obj_buffer( &c );
+
+			strsm_( &f77_side,
+			        &f77_uploa,
+			        &f77_transa,
+			        &f77_diaga,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+		else if ( bli_is_double( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			double*  alphap = bli_obj_buffer( &alpha );
+			double*  ap     = bli_obj_buffer( &a );
+			double*  cp     = bli_obj_buffer( &c );
+
+			dtrsm_( &f77_side,
+			        &f77_uploa,
+			        &f77_transa,
+			        &f77_diaga,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+		else if ( bli_is_scomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			scomplex*  alphap = bli_obj_buffer( &alpha );
+			scomplex*  ap     = bli_obj_buffer( &a );
+			scomplex*  cp     = bli_obj_buffer( &c );
+
+			ctrsm_( &f77_side,
+			        &f77_uploa,
+			        &f77_transa,
+			        &f77_diaga,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+		else if ( bli_is_dcomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  nn     = bli_obj_width( &c );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			dcomplex*  alphap = bli_obj_buffer( &alpha );
+			dcomplex*  ap     = bli_obj_buffer( &a );
+			dcomplex*  cp     = bli_obj_buffer( &c );
+
+			ztrsm_( &f77_side,
+			        &f77_uploa,
+			        &f77_transa,
+			        &f77_diaga,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%9.5f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		if ( bli_is_left( side ) )
+			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+		else
+			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_trsm_blis" );
+#else
+		printf( "data_trsm_%s", BLAS );
+#endif
+
+#ifdef FILE_IN_OUT
+#ifdef READ_ALL_PARAMS_FROM_FILE
+
+		printf("%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n",side_c, uploa_c, transa_c, diaga_c,
+		       (unsigned long )m, (unsigned long ) n,
+		       (unsigned long )cs_a, (unsigned long )cs_b,
+		       gflops);
+
+		fprintf(fout,"%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", side_c, uploa_c, transa_c, diaga_c,
+		        (unsigned long )m, (unsigned long ) n,
+		        (unsigned long )cs_a, (unsigned long )cs_b,
+		        gflops);
+#else
+		printf("%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
+		       (unsigned long )cs_a, (unsigned long )cs_b,
+		       gflops);
+		fprintf(fout,"%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
+		        (unsigned long )cs_a, (unsigned long )cs_b,
+		        gflops);
+#endif
+		fflush(fout);
+
+#else
+		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n, gflops );
+#endif
+		bli_obj_free( &alpha );
+
+		bli_obj_free( &a );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+#ifdef FILE_IN_OUT
+    fclose(fin);
+    fclose(fout);
+#endif
+	//bli_finalize();
+
+	return 0;
+}
+
--- a/test/test_axpyv.c
+++ b/test/test_axpyv.c
@@ -33,7 +33,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //                n    alpha     x      incx      y        incy
--- a/test/test_dotv.c
+++ b/test/test_dotv.c
@@ -33,7 +33,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //        res             n     x      incx      y        incy
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"


--- a/test/test_gemmt.c
+++ b/test/test_gemmt.c
@@ -0,0 +1,483 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+#include "blis.h"
+
+//#define CBLAS
+//#define C_STOR_R
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, k_input;
+	num_t dt;
+	int   r, n_repeats;
+	uplo_t   uploc;
+	trans_t  transa;
+	trans_t  transb;
+	f77_char f77_uploc;
+	f77_char f77_transa;
+	f77_char f77_transb;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+	// Benchmark driver for gemmt: times bli_gemmt() (BLIS), ?gemmt_(), or cblas_?gemmt() (CBLAS).
+	//bli_init();
+	// One result line is printed per problem size, from the value bli_clock_min_diff() accumulates.
+	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;
+	// A negative *_input scales the sweep value p by its magnitude; otherwise it is the dimension.
+#ifndef PRINT
+	p_begin = 200;
+	p_end   = 2000;
+	p_inc   = 200;
+
+	m_input = -1;
+	k_input = -1;
+#else
+	p_begin = 16;
+	p_end   = 16;
+	p_inc   = 1;
+
+	m_input = 5;
+	k_input = 4;
+#endif
+	// The datatype is selected at compile time by editing this block.
+#if 1
+	//dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	uploc  = BLIS_LOWER;
+	//uploc  = BLIS_UPPER;
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+	// Translate the BLIS parameter enums into the characters passed to the ?gemmt_() calls.
+	bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+	char uplocl  = tolower( f77_uploc );
+	char transal = tolower( f77_transa );
+	char transbl = tolower( f77_transb );
+	// Derive the CBLAS enum arguments from the lowercased characters.
+	f77_int cbla_uploc  = ( uplocl  == 'l' ? CblasLower : CblasUpper );
+	f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
+	f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
+	// NOTE(review): these casts presumably silence unused-variable warnings when CBLAS is not defined.
+	( void )cbla_uploc;
+	( void )cbla_transa;
+	( void )cbla_transb;
+
+	// Begin with initializing the last entry to zero so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
+#ifdef BLIS
+	printf( "data_gemmt_blis" );
+#else
+	printf( "data_gemmt_%s", BLAS );
+#endif
+	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
+	        ( unsigned long )0,
+	        ( unsigned long )0, 0.0 );
+	// Run the sweep from the largest problem size down to the smallest.
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
+	{
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+		// A is m x k and B is k x m (dimensions swapped when transposed); C and c_save are m x m.
+#ifndef C_STOR_R
+		if ( bli_does_trans( transa ) )
+			bli_obj_create( dt, k, m, 0, 0, &a );
+		else
+			bli_obj_create( dt, m, k, 0, 0, &a );
+
+		if ( bli_does_trans( transb ) )
+			bli_obj_create( dt, m, k, 0, 0, &b );
+		else
+			bli_obj_create( dt, k, m, 0, 0, &b );
+
+		bli_obj_create( dt, m, m, 0, 0, &c );
+		bli_obj_create( dt, m, m, 0, 0, &c_save );
+#else
+		if ( bli_does_trans( transa ) )
+			bli_obj_create( dt, k, m, -1, -1, &a );
+		else
+			bli_obj_create( dt, m, k, -1, -1, &a );
+		// NOTE(review): the -1 strides presumably request row storage; confirm against bli_obj_create().
+		if ( bli_does_trans( transb ) )
+			bli_obj_create( dt, m, k, -1, -1, &b );
+		else
+			bli_obj_create( dt, k, m, -1, -1, &b );
+
+		bli_obj_create( dt, m, m, -1, -1, &c );
+		bli_obj_create( dt, m, m, -1, -1, &c_save );
+#endif
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+		// Record the uplo setting on C and the conjtrans settings on A and B.
+		bli_obj_set_uplo( uploc, &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (0.9/1.0), 0.2, &alpha );
+		bli_setsc( -(1.1/1.0), 0.3, &beta );
+
+		// Save the initial C so that every repetition can start from the same values.
+		bli_copym( &c, &c_save );
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+			// C has just been restored from c_save, before the clock is started.
+
+			dtime = bli_clock();
+
+			// NOTE: in PRINT mode the printing below happens after the clock has started.
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			bli_gemmt( &alpha,
+			           &a,
+			           &b,
+			           &beta,
+			           &c );
+
+#else
+
+#ifndef CBLAS
+
+		if ( bli_is_float( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			float*   alphap = bli_obj_buffer( &alpha );
+			float*   ap     = bli_obj_buffer( &a );
+			float*   bp     = bli_obj_buffer( &b );
+			float*   betap  = bli_obj_buffer( &beta );
+			float*   cp     = bli_obj_buffer( &c );
+
+			sgemmt_( &f77_uploc,
+			         &f77_transa,
+			         &f77_transb,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+		}
+		else if ( bli_is_double( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			double*  alphap = bli_obj_buffer( &alpha );
+			double*  ap     = bli_obj_buffer( &a );
+			double*  bp     = bli_obj_buffer( &b );
+			double*  betap  = bli_obj_buffer( &beta );
+			double*  cp     = bli_obj_buffer( &c );
+
+			dgemmt_( &f77_uploc,
+			         &f77_transa,
+			         &f77_transb,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+		}
+		else if ( bli_is_scomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			scomplex*  alphap = bli_obj_buffer( &alpha );
+			scomplex*  ap     = bli_obj_buffer( &a );
+			scomplex*  bp     = bli_obj_buffer( &b );
+			scomplex*  betap  = bli_obj_buffer( &beta );
+			scomplex*  cp     = bli_obj_buffer( &c );
+
+			cgemmt_( &f77_uploc,
+			         &f77_transa,
+			         &f77_transb,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+		}
+		else if ( bli_is_dcomplex( dt ) )
+		{
+			f77_int  mm     = bli_obj_length( &c );
+			f77_int  kk     = bli_obj_width_after_trans( &a );
+			f77_int  lda    = bli_obj_col_stride( &a );
+			f77_int  ldb    = bli_obj_col_stride( &b );
+			f77_int  ldc    = bli_obj_col_stride( &c );
+			dcomplex*  alphap = bli_obj_buffer( &alpha );
+			dcomplex*  ap     = bli_obj_buffer( &a );
+			dcomplex*  bp     = bli_obj_buffer( &b );
+			dcomplex*  betap  = bli_obj_buffer( &beta );
+			dcomplex*  cp     = bli_obj_buffer( &c );
+
+			zgemmt_( &f77_uploc,
+			         &f77_transa,
+			         &f77_transb,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+		}
+
+#else // #ifdef CBLAS
+		// The CBLAS storage order follows how C is stored.
+		f77_int cbla_storage = ( bli_obj_is_row_stored( &c ) ? CblasRowMajor
+		                                                     : CblasColMajor );
+
+		if ( bli_is_float( dt ) )
+		{
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+#ifdef C_STOR_R
+			f77_int   lda    = bli_obj_row_stride( &a );
+			f77_int   ldb    = bli_obj_row_stride( &b );
+			f77_int   ldc    = bli_obj_row_stride( &c );
+#else
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+#endif
+			float*    alphap = bli_obj_buffer( &alpha );
+			float*    ap     = bli_obj_buffer( &a );
+			float*    bp     = bli_obj_buffer( &b );
+			float*    betap  = bli_obj_buffer( &beta );
+			float*    cp     = bli_obj_buffer( &c );
+
+			cblas_sgemmt( cbla_storage,
+						 cbla_uploc,
+						 cbla_transa,
+						 cbla_transb,
+						 mm,
+						 kk,
+						 *alphap,
+						 ap, lda,
+						 bp, ldb,
+						 *betap,
+						 cp, ldc );
+		}
+		else if ( bli_is_double( dt ) )
+		{
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+#ifdef C_STOR_R
+			f77_int   lda    = bli_obj_row_stride( &a );
+			f77_int   ldb    = bli_obj_row_stride( &b );
+			f77_int   ldc    = bli_obj_row_stride( &c );
+#else
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+#endif
+			double*   alphap = bli_obj_buffer( &alpha );
+			double*   ap     = bli_obj_buffer( &a );
+			double*   bp     = bli_obj_buffer( &b );
+			double*   betap  = bli_obj_buffer( &beta );
+			double*   cp     = bli_obj_buffer( &c );
+
+			cblas_dgemmt( cbla_storage,
+						 cbla_uploc,
+						 cbla_transa,
+						 cbla_transb,
+						 mm,
+						 kk,
+						 *alphap,
+						 ap, lda,
+						 bp, ldb,
+						 *betap,
+						 cp, ldc );
+		}
+		else if ( bli_is_scomplex( dt ) )
+		{
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+#ifdef C_STOR_R
+			f77_int   lda    = bli_obj_row_stride( &a );
+			f77_int   ldb    = bli_obj_row_stride( &b );
+			f77_int   ldc    = bli_obj_row_stride( &c );
+#else
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+#endif
+			scomplex* alphap = bli_obj_buffer( &alpha );
+			scomplex* ap     = bli_obj_buffer( &a );
+			scomplex* bp     = bli_obj_buffer( &b );
+			scomplex* betap  = bli_obj_buffer( &beta );
+			scomplex* cp     = bli_obj_buffer( &c );
+
+			cblas_cgemmt( cbla_storage,
+						 cbla_uploc,
+						 cbla_transa,
+						 cbla_transb,
+						 mm,
+						 kk,
+						 alphap,
+						 ap, lda,
+						 bp, ldb,
+						 betap,
+						 cp, ldc );
+		}
+		else if ( bli_is_dcomplex( dt ) )
+		{
+			f77_int   mm     = bli_obj_length( &c );
+			f77_int   kk     = bli_obj_width_after_trans( &a );
+#ifdef C_STOR_R
+			f77_int   lda    = bli_obj_row_stride( &a );
+			f77_int   ldb    = bli_obj_row_stride( &b );
+			f77_int   ldc    = bli_obj_row_stride( &c );
+#else
+			f77_int   lda    = bli_obj_col_stride( &a );
+			f77_int   ldb    = bli_obj_col_stride( &b );
+			f77_int   ldc    = bli_obj_col_stride( &c );
+#endif
+			dcomplex* alphap = bli_obj_buffer( &alpha );
+			dcomplex* ap     = bli_obj_buffer( &a );
+			dcomplex* bp     = bli_obj_buffer( &b );
+			dcomplex* betap  = bli_obj_buffer( &beta );
+			dcomplex* cp     = bli_obj_buffer( &c );
+
+			cblas_zgemmt( cbla_storage,
+						 cbla_uploc,
+						 cbla_transa,
+						 cbla_transb,
+						 mm,
+						 kk,
+						 alphap,
+						 ap, lda,
+						 bp, ldb,
+						 betap,
+						 cp, ldc );
+		}
+#endif // CBLAS
+
+#endif // BLIS
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+		// The operation count is taken as m*k*m, multiplied by 4 for complex datatypes.
+		gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_gemmt_blis" );
+#else
+		printf( "data_gemmt_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
+
--- a/test/test_gemv.c
+++ b/test/test_gemv.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //           transa m     n     alpha    a        lda   x        incx  beta     y        incy
--- a/test/test_ger.c
+++ b/test/test_ger.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //          m     n     alpha    x        incx  y        incy  a        lda
--- a/test/test_hemm.c
+++ b/test/test_hemm.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"


--- a/test/test_hemv.c
+++ b/test/test_hemv.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //           uploa  m     alpha    a        lda   x        incx  beta     y        incy
--- a/test/test_her.c
+++ b/test/test_her.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //          uplo   m     alpha    x        incx  a        lda
--- a/test/test_her2.c
+++ b/test/test_her2.c
@@ -32,7 +32,11 @@

 */

+#ifdef WIN32
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include "blis.h"

 //           uplo   m     alpha    x        incx  y        incy  a        lda
--- a/Show More
+++ b/Show More