Added template implementations and other tweaks.

Details: - Added a 'template' configuration, which contains stub implementations of the level 1, 1f, and 3 kernels with one datatype implemented in C for each, with lots of in-file comments and documentation. - Modified some variable/parameter names for some 1/1f operations. (e.g. renaming vector length parameter from m to n.) - Moved level-1f fusing factors from axpyf, dotxf, and dotxaxpyf header files to bli_kernel.h. - Modified test suite to print out fusing factors for axpyf, dotxf, and dotxaxpyf, as well as the default fusing factor (which are all equal in the reference and template implementations). - Cleaned up some sloppiness in the level-1f unb_var1.c files whereby these reference variants were implemented in terms of front-end routines rather than directly in terms of the kernels. (For example, axpy2v was implemented as two calls to axpyv rather than two calls to AXPYV_KERNEL.) - Changed the interface to dotxf so that it matches that of axpyf, in that A is assumed to be m x b_n in both cases, and for dotxf A is actually used as A^T. - Minor variable naming and comment changes to reference micro-kernels in frame/3/gemm/ukernels and frame/3/trsm/ukernels.
2026-07-01 19:57:31 +00:00 · 2013-09-30 12:58:18 -05:00
parent 97aaf220a8
commit 5e54f46ccb
80 changed files with 6343 additions and 799 deletions
--- a/config/bgq/bli_config.h
+++ b/config/bgq/bli_config.h
@@ -97,6 +97,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             32
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        32
--- a/config/bgq/bli_kernel.h
+++ b/config/bgq/bli_kernel.h
@@ -226,10 +226,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/dunnington/bli_config.h
+++ b/config/dunnington/bli_config.h
@@ -97,6 +97,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        16
--- a/config/dunnington/bli_kernel.h
+++ b/config/dunnington/bli_kernel.h
@@ -216,10 +216,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/loongson3a/bli_config.h
+++ b/config/loongson3a/bli_config.h
@@ -97,6 +97,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        16
--- a/config/loongson3a/bli_kernel.h
+++ b/config/loongson3a/bli_kernel.h
@@ -220,10 +220,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/mic/bli_config.h
+++ b/config/mic/bli_config.h
@@ -97,6 +97,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             32
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_CACHE_LINE_SIZE
--- a/config/mic/bli_kernel.h
+++ b/config/mic/bli_kernel.h
@@ -220,10 +220,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/piledriver/bli_config.h
+++ b/config/piledriver/bli_config.h
@@ -97,6 +97,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_CACHE_LINE_SIZE
--- a/config/piledriver/bli_kernel.h
+++ b/config/piledriver/bli_kernel.h
@@ -220,10 +220,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/power7/bli_config.h
+++ b/config/power7/bli_config.h
@@ -95,6 +95,10 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
 #define BLIS_STACK_BUF_ALIGN_SIZE        16
--- a/config/power7/bli_kernel.h
+++ b/config/power7/bli_kernel.h
@@ -220,10 +220,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/reference/bli_config.h
+++ b/config/reference/bli_config.h
@@ -97,14 +97,18 @@
 #define BLIS_CACHE_LINE_SIZE             64
 #define BLIS_PAGE_SIZE                   4096

+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
 // Alignment size used to align local stack buffers within macro-kernel
 // functions.
-#define BLIS_STACK_BUF_ALIGN_SIZE        16
+#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE

 // Alignment size used when allocating memory dynamically from the operating
 // system (eg: posix_memalign()). To disable heap alignment and just use
 // malloc() instead, set this to 1.
-#define BLIS_HEAP_ADDR_ALIGN_SIZE        16
+#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE

 // Alignment size used when sizing leading dimensions of dynamically
 // allocated memory.
@@ -116,7 +120,7 @@

 // Alignment size used when sizing strides (eg: of packed micro-panels)
 // within a block of contiguous memory.
-#define BLIS_CONTIG_STRIDE_ALIGN_SIZE    16
+#define BLIS_CONTIG_STRIDE_ALIGN_SIZE    BLIS_SIMD_ALIGN_SIZE



--- a/config/reference/bli_kernel.h
+++ b/config/reference/bli_kernel.h
@@ -54,21 +54,21 @@
 //     (b) NR (for triangular operations such as trmm and trsm).
 // 

-#define BLIS_DEFAULT_MC_S              256
-#define BLIS_DEFAULT_KC_S              256
-#define BLIS_DEFAULT_NC_S              8192
+#define BLIS_DEFAULT_MC_S              64 
+#define BLIS_DEFAULT_KC_S              128
+#define BLIS_DEFAULT_NC_S              4096

-#define BLIS_DEFAULT_MC_D              128
-#define BLIS_DEFAULT_KC_D              256
+#define BLIS_DEFAULT_MC_D              64 
+#define BLIS_DEFAULT_KC_D              128
 #define BLIS_DEFAULT_NC_D              4096

-#define BLIS_DEFAULT_MC_C              128
-#define BLIS_DEFAULT_KC_C              256
+#define BLIS_DEFAULT_MC_C              64 
+#define BLIS_DEFAULT_KC_C              128
 #define BLIS_DEFAULT_NC_C              4096

 #define BLIS_DEFAULT_MC_Z              64
-#define BLIS_DEFAULT_KC_Z              256
-#define BLIS_DEFAULT_NC_Z              2048
+#define BLIS_DEFAULT_KC_Z              128
+#define BLIS_DEFAULT_NC_Z              4096

 // -- Cache blocksize extensions (for optimizing edge cases) --

@@ -220,10 +220,25 @@
 // of level-1f operations. They are here only for use when these operations
 // are optimized.

-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z



--- a/config/template/bli_config.h
+++ b/config/template/bli_config.h
@@ -0,0 +1,169 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE               32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_MAX_NUM_THREADS             1
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+#define BLIS_NUM_MC_X_KC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_KC_X_NC_BLOCKS          1
+#define BLIS_NUM_MC_X_NC_BLOCKS          0
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET     128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE             64
+#define BLIS_PAGE_SIZE                   4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             16
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE      BLIS_PAGE_SIZE
+
+// Alignment size used when sizing strides (eg: of packed micro-panels)
+// within a block of contiguous memory.
+#define BLIS_CONTIG_STRIDE_ALIGN_SIZE    BLIS_SIMD_ALIGN_SIZE
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name)                        name ## _
+#define PASTEF77(ch1,name)       ch1        ## name ## _
+#define PASTEF772(ch1,ch2,name)  ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
--- a/config/template/bli_kernel.h
+++ b/config/template/bli_kernel.h
@@ -0,0 +1,391 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Default cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of:
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of:
+//     (a) MR and
+//     (b) NR (for triangular operations such as trmm and trsm).
+// 
+
+#define BLIS_DEFAULT_MC_S              64 
+#define BLIS_DEFAULT_KC_S              128
+#define BLIS_DEFAULT_NC_S              4096
+
+#define BLIS_DEFAULT_MC_D              64
+#define BLIS_DEFAULT_KC_D              128
+#define BLIS_DEFAULT_NC_D              4096
+
+#define BLIS_DEFAULT_MC_C              64
+#define BLIS_DEFAULT_KC_C              128
+#define BLIS_DEFAULT_NC_C              4096
+
+#define BLIS_DEFAULT_MC_Z              64
+#define BLIS_DEFAULT_KC_Z              128
+#define BLIS_DEFAULT_NC_Z              4096
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+#define BLIS_EXTEND_MC_S               0 //(BLIS_DEFAULT_MC_S/4)
+#define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
+#define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)
+
+#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/4)
+#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/4)
+#define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)
+
+#define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
+#define BLIS_EXTEND_KC_C               0 //(BLIS_DEFAULT_KC_C/4)
+#define BLIS_EXTEND_NC_C               0 //(BLIS_DEFAULT_NC_C/4)
+
+#define BLIS_EXTEND_MC_Z               0 //(BLIS_DEFAULT_MC_Z/4)
+#define BLIS_EXTEND_KC_Z               0 //(BLIS_DEFAULT_KC_Z/4)
+#define BLIS_EXTEND_NC_Z               0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Default register blocksizes for micro-kernel --
+
+// NOTE: When using the reference configuration, these register blocksizes
+// in the m and n dimensions should all be equal to the size expected by
+// the reference micro-kernel(s).
+
+#define BLIS_DEFAULT_MR_S              8
+#define BLIS_DEFAULT_NR_S              4
+
+#define BLIS_DEFAULT_MR_D              8
+#define BLIS_DEFAULT_NR_D              4
+
+#define BLIS_DEFAULT_MR_C              8
+#define BLIS_DEFAULT_NR_C              4
+
+#define BLIS_DEFAULT_MR_Z              8
+#define BLIS_DEFAULT_NR_Z              4
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+#define BLIS_DEFAULT_KR_S              1
+#define BLIS_DEFAULT_KR_D              1
+#define BLIS_DEFAULT_KR_C              1
+#define BLIS_DEFAULT_KR_Z              1
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+#define BLIS_EXTEND_MR_S               0
+#define BLIS_EXTEND_NR_S               0
+
+#define BLIS_EXTEND_MR_D               0
+#define BLIS_EXTEND_NR_D               0
+
+#define BLIS_EXTEND_MR_C               0
+#define BLIS_EXTEND_NR_C               0
+
+#define BLIS_EXTEND_MR_Z               0
+#define BLIS_EXTEND_NR_Z               0
+
+// Register blocksize extensions in the k dimension are not used.
+
+#define BLIS_EXTEND_KR_S               0
+#define BLIS_EXTEND_KR_D               0
+#define BLIS_EXTEND_KR_C               0
+#define BLIS_EXTEND_KR_Z               0
+
+// -- Number of elements per vector register --
+
+// NOTE: These constants are typically only used to determine the amount
+// of duplication needed when configuring level-3 macro-kernels that
+// copy and duplicate elements of B to a temporary duplication buffer
+// (so that element-wise vector multiplication and addition instructions
+// can be used).
+
+#define BLIS_NUM_ELEM_PER_REG_S        4
+#define BLIS_NUM_ELEM_PER_REG_D        2
+#define BLIS_NUM_ELEM_PER_REG_C        2
+#define BLIS_NUM_ELEM_PER_REG_Z        1
+
+// -- Default switch for duplication of B --
+
+// NOTE: Setting these values to 1 disables duplication. Any value
+// d > 1 results in d-1 duplicates being created within a special
+// macro-kernel buffer of dimension k x NR*d.
+
+//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
+//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
+//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
+//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
+#define BLIS_DEFAULT_NUM_DUPL_S        1
+#define BLIS_DEFAULT_NUM_DUPL_D        1
+#define BLIS_DEFAULT_NUM_DUPL_C        1
+#define BLIS_DEFAULT_NUM_DUPL_Z        1
+
+// -- Default incremental packing blocksizes (n dimension) --
+
+// NOTE: These incremental packing blocksizes (for the n dimension) are only
+// used by certain blocked variants. But when they *are* used, they MUST be
+// an integer multiple of NR!
+
+#define BLIS_DEFAULT_NI_FAC            16
+#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
+#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
+#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
+#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+// NOTE: These values determine high-level cache blocking for level-2
+// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
+// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
+// gemv subproblems are called. The blocked algorithms are only useful in
+// that they provide the opportunity for packing vectors. (Matrices can also
+// be packed here, but this tends to be much too expensive in practice to
+// actually employ.)
+
+#define BLIS_DEFAULT_L2_MC_S           1000
+#define BLIS_DEFAULT_L2_NC_S           1000
+
+#define BLIS_DEFAULT_L2_MC_D           1000
+#define BLIS_DEFAULT_L2_NC_D           1000
+
+#define BLIS_DEFAULT_L2_MC_C           1000
+#define BLIS_DEFAULT_L2_NC_C           1000
+
+#define BLIS_DEFAULT_L2_MC_Z           1000
+#define BLIS_DEFAULT_L2_NC_Z           1000
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default fusing factors for level-1f operations --
+
+// NOTE: Default fusing factors are not used by the reference implementations
+// of level-1f operations. They are here only for use when these operations
+// are optimized.
+
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z
+
+
+
+// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default register blocksizes for vectors --
+
+// NOTE: Register blocksizes for vectors are used when packing
+// non-contiguous vectors. Similar to that of KR, they can
+// typically be set to 1.
+
+#define BLIS_DEFAULT_VR_S              1
+#define BLIS_DEFAULT_VR_D              1
+#define BLIS_DEFAULT_VR_C              1
+#define BLIS_DEFAULT_VR_Z              1
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+#include "bli_gemm_opt_mxn.h"
+#include "bli_trsm_l_opt_mxn.h"
+#include "bli_trsm_u_opt_mxn.h"
+#include "bli_gemmtrsm_l_opt_mxn.h"
+#include "bli_gemmtrsm_u_opt_mxn.h"
+
+// -- dupl --
+
+#define DUPL_KERNEL          dupl_unb_var1
+
+// -- gemm --
+
+#define GEMM_UKERNEL         gemm_opt_mxn
+
+// -- trsm-related --
+
+#define GEMMTRSM_L_UKERNEL   gemmtrsm_l_opt_mxn
+#define GEMMTRSM_U_UKERNEL   gemmtrsm_u_opt_mxn
+
+#define TRSM_L_UKERNEL       trsm_l_opt_mxn
+#define TRSM_U_UKERNEL       trsm_u_opt_mxn
+
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+#define PACKM_2XK_KERNEL     packm_ref_2xk
+#define PACKM_4XK_KERNEL     packm_ref_4xk
+#define PACKM_6XK_KERNEL     packm_ref_6xk
+#define PACKM_8XK_KERNEL     packm_ref_8xk
+#define PACKM_10XK_KERNEL    packm_ref_10xk
+#define PACKM_12XK_KERNEL    packm_ref_12xk
+#define PACKM_14XK_KERNEL    packm_ref_14xk
+#define PACKM_16XK_KERNEL    packm_ref_16xk
+
+// -- unpackm --
+
+#define UNPACKM_2XK_KERNEL   unpackm_ref_2xk
+#define UNPACKM_4XK_KERNEL   unpackm_ref_4xk
+#define UNPACKM_6XK_KERNEL   unpackm_ref_6xk
+#define UNPACKM_8XK_KERNEL   unpackm_ref_8xk
+#define UNPACKM_10XK_KERNEL  unpackm_ref_10xk
+#define UNPACKM_12XK_KERNEL  unpackm_ref_12xk
+#define UNPACKM_14XK_KERNEL  unpackm_ref_14xk
+#define UNPACKM_16XK_KERNEL  unpackm_ref_16xk
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+#include "bli_axpy2v_opt_var1.h"
+#include "bli_dotaxpyv_opt_var1.h"
+#include "bli_axpyf_opt_var1.h"
+#include "bli_dotxf_opt_var1.h"
+#include "bli_dotxaxpyf_opt_var1.h"
+
+// -- axpy2v --
+
+#define AXPY2V_KERNEL        axpy2v_opt_var1
+
+// -- dotaxpyv --
+
+#define DOTAXPYV_KERNEL      dotaxpyv_opt_var1
+
+// -- axpyf --
+
+#define AXPYF_KERNEL         axpyf_opt_var1
+
+// -- dotxf --
+
+#define DOTXF_KERNEL         dotxf_opt_var1
+
+// -- dotxaxpyf --
+
+#define DOTXAXPYF_KERNEL     dotxaxpyf_opt_var1
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+#define ADDV_KERNEL          addv_unb_var1
+
+// -- axpyv --
+
+#define AXPYV_KERNEL         axpyv_unb_var1
+
+// -- copyv --
+
+#define COPYV_KERNEL         copyv_unb_var1
+
+// -- dotv --
+
+#define DOTV_KERNEL          dotv_unb_var1
+
+// -- dotxv --
+
+#define DOTXV_KERNEL         dotxv_unb_var1
+
+// -- invertv --
+
+#define INVERTV_KERNEL       invertv_unb_var1
+
+// -- scal2v --
+
+#define SCAL2V_KERNEL        scal2v_unb_var1
+
+// -- scalv --
+
+#define SCALV_KERNEL         scalv_unb_var1
+
+// -- setv --
+
+#define SETV_KERNEL          setv_unb_var1
+
+// -- subv --
+
+#define SUBV_KERNEL          subv_unb_var1
+
+// -- swapv --
+
+#define SWAPV_KERNEL         swapv_unb_var1
+
+
+
+#endif
+
--- a/config/template/kernels/1/bli_axpyv_opt_var1.c
+++ b/config/template/kernels/1/bli_axpyv_opt_var1.c
@@ -0,0 +1,308 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sssaxpyv_opt_var1( conj_t             conjx,
+                            dim_t              n,
+                            float*    restrict alpha,
+                            float*    restrict x, inc_t incx,
+                            float*    restrict y, inc_t incy )
+{
+	// This template provides no optimized single-precision real kernel,
+	// so hand every argument, unchanged, to the reference variant.
+	bli_sssaxpyv_unb_var1
+	(
+	  conjx, n, alpha, x, incx, y, incy
+	);
+}
+
+
+
+void bli_dddaxpyv_opt_var1( conj_t             conjx,
+                            dim_t              n,
+                            double*   restrict alpha,
+                            double*   restrict x, inc_t incx,
+                            double*   restrict y, inc_t incy )
+{
+	// This template provides no optimized double-precision real kernel,
+	// so hand every argument, unchanged, to the reference variant.
+	bli_dddaxpyv_unb_var1
+	(
+	  conjx, n, alpha, x, incx, y, incy
+	);
+}
+
+
+
+void bli_cccaxpyv_opt_var1( conj_t             conjx,
+                            dim_t              n,
+                            scomplex* restrict alpha,
+                            scomplex* restrict x, inc_t incx,
+                            scomplex* restrict y, inc_t incy )
+{
+	// This template provides no optimized single-precision complex kernel,
+	// so hand every argument, unchanged, to the reference variant.
+	bli_cccaxpyv_unb_var1
+	(
+	  conjx, n, alpha, x, incx, y, incy
+	);
+}
+
+
+
+void bli_zzzaxpyv_opt_var1( conj_t             conjx,
+                            dim_t              n,
+                            dcomplex* restrict alpha,
+                            dcomplex* restrict x, inc_t incx,
+                            dcomplex* restrict y, inc_t incy )
+{
+/*
+  Template axpyv kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel performs a vector scale and accumulate (axpy) operation:
+
+    y := y + alpha * conjx( x )
+
+  where x and y are vectors of length n and alpha is a scalar.
+
+  Parameters:
+
+  - conjx:  Compute with conjugated values of x?
+  - n:      The number of elements in vectors x and y.
+  - alpha:  The address of a scalar.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. incx should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y. incy should be unit unless the
+            implementation makes special accommodation for non-unit values.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Either of the strides incx or incy is non-unit.
+  - x and y are unaligned with different (or non-element-multiple) offsets.
+
+  If the vectors are aligned, or unaligned by the same offset, then optimized
+  code can be used for the bulk of the computation. This template shows how
+  the front-edge case can be handled so that the remaining computation is
+  aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
+  which is defined in bli_config.h.)
+
+  Additional things to consider:
+
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *x );
+
+	dcomplex*   xp;
+	dcomplex*   yp;
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       n_pre           = 0;
+	dim_t       n_iter;
+	dim_t       n_left;
+
+	dim_t       off_x, off_y;
+	dim_t       i;
+
+
+	if ( bli_zero_dim1( n ) ) return;
+
+	if ( bli_zeq0( *alpha ) ) return;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( bli_has_nonunit_inc2( incx, incy ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If x and y are unaligned by the same offset, and that offset is a
+		// whole number of elements, we can peel front-edge elements until
+		// both pointers reach the next aligned address.
+		off_x  = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
+		off_y  = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
+
+		if ( ( off_x == off_y ) && ( off_x % type_size == 0 ) )
+		{
+			use_ref = FALSE;
+			n_pre   = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;
+			if ( n_pre > n ) n_pre = n; // never peel more than n elements
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzaxpyv_unb_var1( conjx,
+		                       n,
+		                       alpha,
+		                       x, incx,
+		                       y, incy );
+		return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations.
+	n_iter = ( n - n_pre ) / n_elem_per_iter;
+	n_left = ( n - n_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into x and y.
+	xp = x;
+	yp = y;
+
+
+	// Iterate over elements of x and y to compute:
+	//  y += alpha * conjx( x );
+	if ( bli_is_noconj( conjx ) )
+	{
+		// Compute front edge cases if x and y were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpys( *alpha, *xp, *yp );
+
+			xp += 1; yp += 1;
+		}
+
+		// The bulk of the operation is executed here. The addresses xp and
+		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpys( *alpha, *xp, *yp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpys( *alpha, *xp, *yp );
+
+			xp += 1; yp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conjx ) )
+	{
+		// Compute front edge cases if x and y were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpyjs( *alpha, *xp, *yp );
+
+			xp += 1; yp += 1;
+		}
+
+		// The bulk of the operation is executed here. The addresses xp and
+		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpyjs( *alpha, *xp, *yp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpyjs( *alpha, *xp, *yp );
+
+			xp += 1; yp += 1;
+		}
+	}
+}
+
+
+
+// Define BLAS-like axpyv_opt_var1 interfaces with heterogeneous-typed
+// operands. Each generated function only forwards its arguments to the
+// variant named by varname (axpyv_unb_var1 in the instantiations below).
+#undef  GENTFUNC3
+#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
+\
+void PASTEMAC3(cha,chx,chy,opname)( \
+                                    conj_t            conjx, \
+                                    dim_t             n, \
+                                    ctype_a* restrict alpha, \
+                                    ctype_x* restrict x, inc_t incx, \
+                                    ctype_y* restrict y, inc_t incy \
+                                  ) \
+{ \
+	/* Just call the reference implementation. */ \
+	PASTEMAC3(cha,chx,chy,varname)( conjx, \
+	                                n, \
+	                                alpha, \
+	                                x, incx, \
+	                                y, incy ); \
+}
+
+
+// Instantiate the mixed-type variants only when the matching macro is defined.
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
+#endif
+
--- a/config/template/kernels/1/bli_axpyv_opt_var1.h
+++ b/config/template/kernels/1/bli_axpyv_opt_var1.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the axpyv_opt_var1 kernel interfaces: the basic instances
+// always, and the mixed-domain / mixed-precision instances only when the
+// corresponding BLIS_ENABLE_* macro is defined.
+#undef  GENTPROT3
+#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
+\
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t            conjx, \
+                                     dim_t             n, \
+                                     ctype_a* restrict alpha, \
+                                     ctype_x* restrict x, inc_t incx, \
+                                     ctype_y* restrict y, inc_t incy \
+                                   );
+
+INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
+#endif
+
--- a/config/template/kernels/1/bli_dotv_opt_var1.c
+++ b/config/template/kernels/1/bli_dotv_opt_var1.c
@@ -0,0 +1,345 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sssdotv_opt_var1( conj_t             conjx,
+                           conj_t             conjy,
+                           dim_t              n,
+                           float*    restrict x, inc_t incx,
+                           float*    restrict y, inc_t incy,
+                           float*    restrict rho )
+{
+	// This template provides no optimized single-precision real kernel.
+	// Hand every argument, unchanged, to the reference variant and let
+	// it carry out the operation.
+	bli_sssdotv_unb_var1
+	(
+	  conjx, conjy, n, x, incx, y, incy, rho
+	);
+}
+
+
+
+void bli_ddddotv_opt_var1( conj_t             conjx,
+                           conj_t             conjy,
+                           dim_t              n,
+                           double*   restrict x, inc_t incx,
+                           double*   restrict y, inc_t incy,
+                           double*   restrict rho )
+{
+	// This template provides no optimized double-precision real kernel.
+	// Hand every argument, unchanged, to the reference variant and let
+	// it carry out the operation.
+	bli_ddddotv_unb_var1
+	(
+	  conjx, conjy, n, x, incx, y, incy, rho
+	);
+}
+
+
+
+void bli_cccdotv_opt_var1( conj_t             conjx,
+                           conj_t             conjy,
+                           dim_t              n,
+                           scomplex* restrict x, inc_t incx,
+                           scomplex* restrict y, inc_t incy,
+                           scomplex* restrict rho )
+{
+	// This template provides no optimized single-precision complex kernel.
+	// Hand every argument, unchanged, to the reference variant and let
+	// it carry out the operation.
+	bli_cccdotv_unb_var1
+	(
+	  conjx, conjy, n, x, incx, y, incy, rho
+	);
+}
+
+
+
+void bli_zzzdotv_opt_var1( conj_t             conjx,
+                           conj_t             conjy,
+                           dim_t              n,
+                           dcomplex* restrict x, inc_t incx,
+                           dcomplex* restrict y, inc_t incy,
+                           dcomplex* restrict rho )
+{
+/*
+  Template dotv kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel performs an inner (dot) product operation:
+
+    rho := conjx( x^T ) * conjy( y )
+
+  where x and y are vectors of length n and rho is a scalar.
+
+  Parameters:
+
+  - conjx:  Compute with conjugated values of x?
+  - conjy:  Compute with conjugated values of y?
+  - n:      The number of elements in vectors x and y.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. incx should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y. incy should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - rho:    The address of the output scalar.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Either of the strides incx or incy is non-unit.
+  - x and y are unaligned with different (or non-element-multiple) offsets.
+
+  If the vectors are aligned, or unaligned by the same offset, then optimized
+  code can be used for the bulk of the computation. This template shows how
+  the front-edge case can be handled so that the remaining computation is
+  aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE,
+  which is defined in bli_config.h.)
+
+  Additional things to consider:
+
+  - While four combinations of possible values of conjx and conjy exist, we
+    implement only conjugation on x explicitly; we induce the other two cases
+    by toggling the effective conjugation on x and then conjugating the dot
+    product result.
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *x );
+
+	dcomplex*   xp;
+	dcomplex*   yp;
+	dcomplex    dotxy;
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       n_pre           = 0;
+	dim_t       n_iter;
+	dim_t       n_left;
+
+	dim_t       off_x, off_y;
+	dim_t       i;
+
+	conj_t      conjx_use;
+
+
+	// If the vector lengths are zero, set rho to zero and return.
+	if ( bli_zero_dim1( n ) )
+	{
+		bli_zset0s( *rho );
+		return;
+	}
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( bli_has_nonunit_inc2( incx, incy ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If x and y are unaligned by the same offset, and that offset is a
+		// whole number of elements, we can peel front-edge elements until
+		// both pointers reach the next aligned address.
+		off_x  = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
+		off_y  = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
+
+		if ( ( off_x == off_y ) && ( off_x % type_size == 0 ) )
+		{
+			use_ref = FALSE;
+			n_pre   = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;
+			if ( n_pre > n ) n_pre = n; // never peel more than n elements
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzdotv_unb_var1( conjx,
+		                      conjy,
+		                      n,
+		                      x, incx,
+		                      y, incy,
+		                      rho );
+		return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations.
+	n_iter = ( n - n_pre ) / n_elem_per_iter;
+	n_left = ( n - n_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into x and y.
+	xp = x;
+	yp = y;
+
+
+	// Initialize accumulator to zero.
+	bli_zset0s( dotxy );
+
+	conjx_use = conjx;
+
+	// If y must be conjugated, we compute the result indirectly by first
+	// toggling the effective conjugation of x and then conjugating the
+	// resulting dot product.
+	if ( bli_is_conj( conjy ) )
+		bli_toggle_conj( conjx_use );
+
+
+	// Iterate over elements of x and y to compute:
+	//  rho = conjx( x^T ) * conjy( y );
+	if ( bli_is_noconj( conjx_use ) )
+	{
+		// Compute front edge cases if x and y were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+
+			xp += 1; yp += 1;
+		}
+
+		// The bulk of the operation is executed here. The addresses xp and
+		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+
+			xp += 1; yp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conjx_use ) )
+	{
+		// Compute front edge cases if x and y were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+
+			xp += 1; yp += 1;
+		}
+
+		// The bulk of the operation is executed here. The addresses xp and
+		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+
+			xp += 1; yp += 1;
+		}
+	}
+
+	// If conjugation on y was requested, we induce it by conjugating
+	// the contents of dotxy.
+	if ( bli_is_conj( conjy ) )
+		bli_zconjs( dotxy );
+
+	bli_zzcopys( dotxy, *rho );
+}
+
+
+
+// Define BLAS-like dotv_opt_var1 interfaces with heterogeneous-typed
+// operands. Each generated function only forwards its arguments to varname;
+// instances are emitted only when the matching BLIS_ENABLE_* macro is defined.
+#undef  GENTFUNC3
+#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
+\
+void PASTEMAC3(chx,chy,chr,opname)( \
+                                    conj_t            conjx, \
+                                    conj_t            conjy, \
+                                    dim_t             n, \
+                                    ctype_x* restrict x, inc_t incx, \
+                                    ctype_y* restrict y, inc_t incy, \
+                                    ctype_r* restrict rho \
+                                  ) \
+{ \
+	/* Just call the reference implementation. */ \
+    PASTEMAC3(chx,chy,chr,varname)( conjx, \
+                                    conjy, \
+                                    n, \
+                                    x, incx, \
+                                    y, incy, \
+                                    rho ); \
+}
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
+#endif
--- a/config/template/kernels/1/bli_dotv_opt_var1.h
+++ b/config/template/kernels/1/bli_dotv_opt_var1.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the dotv_opt_var1 kernel interfaces: the basic instances
+// always, and the mixed-domain / mixed-precision instances only when the
+// corresponding BLIS_ENABLE_* macro is defined.
+#undef  GENTPROT3
+#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
+\
+void PASTEMAC3(chx,chy,chr,varname)( \
+                                     conj_t            conjx, \
+                                     conj_t            conjy, \
+                                     dim_t             n, \
+                                     ctype_x* restrict x, inc_t incx, \
+                                     ctype_y* restrict y, inc_t incy, \
+                                     ctype_r* restrict rho \
+                                   );
+
+INSERT_GENTPROT3_BASIC( dotv_opt_var1 )
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTPROT3_MIX_D( dotv_opt_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTPROT3_MIX_P( dotv_opt_var1 )
+#endif
--- a/config/template/kernels/1f/bli_axpy2v_opt_var1.c
+++ b/config/template/kernels/1f/bli_axpy2v_opt_var1.c
@@ -0,0 +1,436 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+// Single-precision real axpy2v kernel stub for the template configuration.
+// No optimized code is supplied for this datatype: every argument is passed
+// through, unchanged, to the reference implementation
+// bli_sssaxpy2v_unb_var1().
+void bli_sssaxpy2v_opt_var1
+     (
+       conj_t          conjx,
+       conj_t          conjy,
+       dim_t           n,
+       float* restrict alpha1,
+       float* restrict alpha2,
+       float* restrict x, inc_t incx,
+       float* restrict y, inc_t incy,
+       float* restrict z, inc_t incz
+     )
+{
+	bli_sssaxpy2v_unb_var1( conjx, conjy, n,
+	                        alpha1, alpha2,
+	                        x, incx,
+	                        y, incy,
+	                        z, incz );
+}
+
+
+
+// Double-precision real axpy2v kernel stub for the template configuration.
+// No optimized code is supplied for this datatype: every argument is passed
+// through, unchanged, to the reference implementation
+// bli_dddaxpy2v_unb_var1().
+void bli_dddaxpy2v_opt_var1
+     (
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       double* restrict alpha1,
+       double* restrict alpha2,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict z, inc_t incz
+     )
+{
+	bli_dddaxpy2v_unb_var1( conjx, conjy, n,
+	                        alpha1, alpha2,
+	                        x, incx,
+	                        y, incy,
+	                        z, incz );
+}
+
+
+
+// Single-precision complex axpy2v kernel stub for the template
+// configuration. No optimized code is supplied for this datatype: every
+// argument is passed through, unchanged, to the reference implementation
+// bli_cccaxpy2v_unb_var1().
+void bli_cccaxpy2v_opt_var1
+     (
+       conj_t             conjx,
+       conj_t             conjy,
+       dim_t              n,
+       scomplex* restrict alpha1,
+       scomplex* restrict alpha2,
+       scomplex* restrict x, inc_t incx,
+       scomplex* restrict y, inc_t incy,
+       scomplex* restrict z, inc_t incz
+     )
+{
+	bli_cccaxpy2v_unb_var1( conjx, conjy, n,
+	                        alpha1, alpha2,
+	                        x, incx,
+	                        y, incy,
+	                        z, incz );
+}
+
+
+
+void bli_zzzaxpy2v_opt_var1(
+                             conj_t             conjx,
+                             conj_t             conjy,
+                             dim_t              n,
+                             dcomplex* restrict alpha1,
+                             dcomplex* restrict alpha2,
+                             dcomplex* restrict x, inc_t incx,
+                             dcomplex* restrict y, inc_t incy,
+                             dcomplex* restrict z, inc_t incz
+                           )
+{
+/*
+  Template axpy2v kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel fuses two axpyv operations:
+
+    z := z + alpha1 * conjx( x )
+    z := z + alpha2 * conjy( y )
+
+  where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars.
+
+  Parameters:
+
+  - conjx:  Compute with conjugated values of x?
+  - conjy:  Compute with conjugated values of y?
+  - n:      The number of elements in vectors x, y, and z.
+  - alpha1: The address of the scalar to be applied to x.
+  - alpha2: The address of the scalar to be applied to y.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. incx should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y. incy should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - z:      The address of vector z.
+  - incz:   The vector increment of z. incz should be unit unless the
+            implementation makes special accommodation for non-unit values.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Any of the strides incx, incy, or incz is non-unit.
+  - Vectors x, y, and z are unaligned with different offsets.
+  - Vectors x, y, and z share the same offset, but the distance from their
+    first element to the next aligned address is not a whole number of
+    elements (so the first aligned address falls in the middle of an
+    element).
+
+  If the vectors are aligned, or unaligned by the same offset, then optimized
+  code can be used for the bulk of the computation. This template shows how
+  the front-edge case can be handled so that the remaining computation is
+  aligned: the front edge consists of the elements that lie between the start
+  of each vector and the next aligned address, and it never covers more than
+  n elements. (This template guarantees alignment in the main loops to be
+  BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)
+
+  Here are a few additional things to consider:
+
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *x );
+
+	dcomplex*   xp;
+	dcomplex*   yp;
+	dcomplex*   zp;
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       n_pre           = 0;
+	dim_t       n_iter;
+	dim_t       n_left;
+
+	dim_t       off_x, off_y, off_z;
+	dim_t       i;
+
+
+	// Return early if possible.
+	if ( bli_zero_dim1( n ) ) return;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( bli_has_nonunit_inc3( incx, incy, incz ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If x, y, and z are unaligned by the same offset, then we can
+		// still use an implementation that depends on alignment for most
+		// of the operation.
+		off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
+		off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
+		off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );
+
+		if ( off_x == off_y && off_x == off_z )
+		{
+			// Number of bytes between the first element and the next
+			// aligned address.
+			// NOTE(review): this assumes bli_offset_from_alignment()
+			// yields the address modulo the alignment -- confirm against
+			// its definition.
+			const dim_t gap = BLIS_SIMD_ALIGN_SIZE - off_x;
+
+			// The front edge is usable only if it spans a whole number of
+			// elements; otherwise the first aligned address would fall in
+			// the middle of an element and we stay with the reference code.
+			if ( gap % ( dim_t )type_size == 0 )
+			{
+				use_ref = FALSE;
+				n_pre   = gap / ( dim_t )type_size;
+
+				// Short vectors may end before alignment is reached; never
+				// let the front edge run past the n elements we were given.
+				if ( n_pre > n ) n_pre = n;
+			}
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzaxpy2v_unb_var1( conjx,
+		                        conjy,
+		                        n,
+		                        alpha1,
+		                        alpha2,
+		                        x, incx,
+		                        y, incy,
+		                        z, incz );
+		return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations that
+	// remain once the front edge has been peeled off.
+	n_iter = ( n - n_pre ) / n_elem_per_iter;
+	n_left = ( n - n_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into x, y, and z.
+	xp = x;
+	yp = y;
+	zp = z;
+
+
+	// Iterate over the elements of x, y, and z to compute:
+	//   z += alpha1 * conjx( x ) + alpha2 * conjy( y );
+	// One branch is provided per combination of conjx and conjy so that the
+	// conjugation test stays outside of the loops.
+	if ( bli_is_noconj( conjx ) && bli_is_noconj( conjy ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpys( *alpha1, *xp, *zp );
+			bli_zzzaxpys( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha1 and alpha2 should be loaded once prior to the n_iter
+		// loop and the elements of z should be loaded and stored only once
+		// each. The addresses xp, yp, and zp are guaranteed to be aligned
+		// to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpys( *alpha1, *xp, *zp );
+			bli_zzzaxpys( *alpha2, *yp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpys( *alpha1, *xp, *zp );
+			bli_zzzaxpys( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_noconj( conjx ) && bli_is_conj( conjy ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpys(  *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha1 and alpha2 should be loaded once prior to the n_iter
+		// loop and the elements of z should be loaded and stored only once
+		// each. The addresses xp, yp, and zp are guaranteed to be aligned
+		// to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpys(  *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpys(  *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_conj( conjx ) && bli_is_noconj( conjy ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpys(  *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha1 and alpha2 should be loaded once prior to the n_iter
+		// loop and the elements of z should be loaded and stored only once
+		// each. The addresses xp, yp, and zp are guaranteed to be aligned
+		// to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpys(  *alpha2, *yp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpys(  *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conjx ) && bli_is_conj( conjy ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha1 and alpha2 should be loaded once prior to the n_iter
+		// loop and the elements of z should be loaded and stored only once
+		// each. The addresses xp, yp, and zp are guaranteed to be aligned
+		// to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzaxpyjs( *alpha1, *xp, *zp );
+			bli_zzzaxpyjs( *alpha2, *yp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+}
+
+
+
+//
+// Define BLAS-like axpy2v interfaces with heterogeneous-typed operands.
+//
+// No optimized mixed-type code is provided: each function generated from
+// GENTFUNC3U12 under the name varname simply forwards all of its arguments
+// to the function generated under the name kername (here, the reference
+// variant axpy2v_unb_var1). The homogeneous s/d/c/z instances are written
+// out by hand above, so only the mixed-domain and mixed-precision sets are
+// instantiated below.
+//
+#undef  GENTFUNC3U12
+#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
+\
+void PASTEMAC3(chx,chy,chz,varname) \
+     ( \
+       conj_t             conjx, \
+       conj_t             conjy, \
+       dim_t              n, \
+       ctype_xy* restrict alpha1, \
+       ctype_xy* restrict alpha2, \
+       ctype_x*  restrict x, inc_t incx, \
+       ctype_y*  restrict y, inc_t incy, \
+       ctype_z*  restrict z, inc_t incz \
+     ) \
+{ \
+	/* Forward everything to the reference implementation. */ \
+	PASTEMAC3(chx,chy,chz,kername)( conjx, conjy, n, \
+	                                alpha1, alpha2, \
+	                                x, incx, \
+	                                y, incy, \
+	                                z, incz ); \
+}
+
+#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
+#endif
+
+#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_axpy2v_opt_var1.h
+++ b/config/template/kernels/1f/bli_axpy2v_opt_var1.h
@@ -0,0 +1,58 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype axpy2v kernel interfaces.
+//
+// The vector length parameter is named n, matching the axpy2v_opt_var1
+// definitions (the parameter was renamed from m to n for level-1/1f
+// operations).
+//
+#undef  GENTPROT3U12
+#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
+\
+void PASTEMAC3(chx,chy,chz,varname)( \
+                                     conj_t             conjx, \
+                                     conj_t             conjy, \
+                                     dim_t              n, \
+                                     ctype_xy* restrict alpha1, \
+                                     ctype_xy* restrict alpha2, \
+                                     ctype_x*  restrict x, inc_t incx, \
+                                     ctype_y*  restrict y, inc_t incy, \
+                                     ctype_z*  restrict z, inc_t incz \
+                                   );
+
+INSERT_GENTPROT3U12_BASIC( axpy2v_opt_var1 )
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTPROT3U12_MIX_D( axpy2v_opt_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTPROT3U12_MIX_P( axpy2v_opt_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_axpyf_opt_var1.c
+++ b/config/template/kernels/1f/bli_axpyf_opt_var1.c
@@ -0,0 +1,416 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+// Single-precision real axpyf kernel stub for the template configuration.
+// No optimized code is supplied for this datatype: every argument is passed
+// through, unchanged, to the reference implementation
+// bli_sssaxpyf_unb_var1().
+void bli_sssaxpyf_opt_var1
+     (
+       conj_t          conja,
+       conj_t          conjx,
+       dim_t           m,
+       dim_t           b_n,
+       float* restrict alpha,
+       float* restrict a, inc_t inca, inc_t lda,
+       float* restrict x, inc_t incx,
+       float* restrict y, inc_t incy
+     )
+{
+	bli_sssaxpyf_unb_var1( conja, conjx, m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       y, incy );
+}
+
+
+
+// Double-precision real axpyf kernel stub for the template configuration.
+// No optimized code is supplied for this datatype: every argument is passed
+// through, unchanged, to the reference implementation
+// bli_dddaxpyf_unb_var1().
+void bli_dddaxpyf_opt_var1
+     (
+       conj_t           conja,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy
+     )
+{
+	bli_dddaxpyf_unb_var1( conja, conjx, m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       y, incy );
+}
+
+
+
+// Single-precision complex axpyf kernel stub for the template
+// configuration. No optimized code is supplied for this datatype: every
+// argument is passed through, unchanged, to the reference implementation
+// bli_cccaxpyf_unb_var1().
+void bli_cccaxpyf_opt_var1
+     (
+       conj_t             conja,
+       conj_t             conjx,
+       dim_t              m,
+       dim_t              b_n,
+       scomplex* restrict alpha,
+       scomplex* restrict a, inc_t inca, inc_t lda,
+       scomplex* restrict x, inc_t incx,
+       scomplex* restrict y, inc_t incy
+     )
+{
+	bli_cccaxpyf_unb_var1( conja, conjx, m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       y, incy );
+}
+
+
+void bli_zzzaxpyf_opt_var1(
+                            conj_t             conja,
+                            conj_t             conjx,
+                            dim_t              m,
+                            dim_t              b_n,
+                            dcomplex* restrict alpha,
+                            dcomplex* restrict a, inc_t inca, inc_t lda,
+                            dcomplex* restrict x, inc_t incx,
+                            dcomplex* restrict y, inc_t incy
+                          )
+{
+/*
+  Template axpyf kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel performs the following gemv-like operation:
+
+    y := y + alpha * conja( A ) * conjx( x )
+
+  where A is an m x b_n matrix, x is a vector of length b_n, y is a vector
+  of length m, and alpha is a scalar. The operation is performed as a series
+  of fused axpyv operations, and therefore A should be column-stored.
+
+  Parameters:
+
+  - conja:  Compute with conjugated values of A?
+  - conjx:  Compute with conjugated values of x?
+  - m:      The number of rows in matrix A.
+  - b_n:    The number of columns in matrix A. Must be equal to or less than
+            the fusing factor.
+  - alpha:  The address of a scalar.
+  - a:      The address of matrix A.
+  - inca:   The row stride of A. inca should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - lda:    The column stride of A.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. (This template also falls back to the
+            reference implementation when incx is non-unit, even though x is
+            only read through incx while forming alpha * conjx( x ).)
+  - y:      The address of vector y.
+  - incy:   The vector increment of y. incy should be unit unless the
+            implementation makes special accommodation for non-unit values.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - b_n is not equal to the fusing factor.
+  - Any of the strides inca, incx, or incy is non-unit.
+  - The address of A, the second column of A, and y are unaligned with
+    different offsets.
+  - Those three addresses share the same offset, but the distance from them
+    to the next aligned address is not a whole number of elements (so the
+    first aligned address falls in the middle of an element).
+
+  If the first/second columns of A and address of y are aligned, or unaligned
+  by the same offset, then optimized code can be used for the bulk of the
+  computation. This template shows how the front-edge case can be handled so
+  that the remaining computation is aligned: the front edge consists of the
+  rows that lie between the start of A/y and the next aligned address, and it
+  never covers more than m rows. (This template guarantees alignment in the
+  main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.)
+
+  Additional things to consider:
+
+  - When optimizing, you should fully unroll the loops over b_n. This is the
+    dimension across which we are fusing axpyv operations.
+  - This template code chooses to call the reference implementation whenever
+    b_n is less than the fusing factor, so as to avoid having to handle edge
+    cases. One may choose to optimize this edge case, if desired. (A b_n
+    greater than the fusing factor violates the requirement above; it is
+    routed to the reference implementation as well, because the local arrays
+    below only have room for fusing-factor columns.)
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *a );
+
+	dcomplex*   ap[ bli_zaxpyf_fusefac ];
+	dcomplex*   xp[ bli_zaxpyf_fusefac ];
+	dcomplex*   yp;
+
+	dcomplex    alpha_x[ bli_zaxpyf_fusefac ];
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       m_pre           = 0;
+	dim_t       m_iter;
+	dim_t       m_left;
+
+	dim_t       off_a, off_a2, off_y;
+	dim_t       i, j;
+
+
+	// Return early if possible.
+	if ( bli_zero_dim2( m, b_n ) ) return;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( b_n != bli_zaxpyf_fusefac )
+	{
+		// Fewer columns than the fusing factor is an edge case this
+		// template does not optimize. More columns than the fusing factor
+		// would index past the end of ap[], xp[], and alpha_x[], so that
+		// case must never reach the loops below either.
+		use_ref = TRUE;
+	}
+	else if ( bli_has_nonunit_inc3( inca, incx, incy ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( y,     BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If a, the second column of a, and y are unaligned by the same
+		// offset, then we can still use an implementation that depends on
+		// alignment for most of the operation. (If the first two columns
+		// share an offset, the column stride is a multiple of the alignment
+		// in bytes, and so every later column shares that offset too.)
+		off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
+		off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
+		off_y  = bli_offset_from_alignment( y,     BLIS_SIMD_ALIGN_SIZE );
+
+		if ( off_a == off_y && off_a == off_a2 )
+		{
+			// Number of bytes between the first row and the next aligned
+			// address.
+			// NOTE(review): this assumes bli_offset_from_alignment()
+			// yields the address modulo the alignment -- confirm against
+			// its definition.
+			const dim_t gap = BLIS_SIMD_ALIGN_SIZE - off_a;
+
+			// The front edge is usable only if it spans a whole number of
+			// elements; otherwise the first aligned address would fall in
+			// the middle of an element and we stay with the reference code.
+			if ( gap % ( dim_t )type_size == 0 )
+			{
+				use_ref = FALSE;
+				m_pre   = gap / ( dim_t )type_size;
+
+				// Short columns may end before alignment is reached; never
+				// let the front edge run past the m rows we were given.
+				if ( m_pre > m ) m_pre = m;
+			}
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzaxpyf_unb_var1( conja,
+		                       conjx,
+		                       m,
+		                       b_n,
+		                       alpha,
+		                       a, inca, lda,
+		                       x, incx,
+		                       y, incy );
+		return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations that
+	// remain once the front edge has been peeled off.
+	m_iter = ( m - m_pre ) / n_elem_per_iter;
+	m_left = ( m - m_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into the columns of A and elements of x.
+	for ( j = 0; j < b_n; ++j )
+	{
+		ap[ j ] = a + (j  )*lda;
+		xp[ j ] = x + (j  )*incx;
+	}
+	yp = y;
+
+
+	// Load elements of x or conj(x) into alpha_x and scale by alpha.
+	if ( bli_is_noconj( conjx ) )
+	{
+		for ( j = 0; j < b_n; ++j )
+		{
+			bli_zzcopys( *xp[ j ], alpha_x[ j ] );
+			bli_zzscals( *alpha, alpha_x[ j ] );
+		}
+	}
+	else // if ( bli_is_conj( conjx ) )
+	{
+		for ( j = 0; j < b_n; ++j )
+		{
+			bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
+			bli_zzscals( *alpha, alpha_x[ j ] );
+		}
+	}
+
+	// Iterate over rows of A and y to compute:
+	//   y += alpha * conja( A ) * conjx( x );
+	// where alpha and conjx( x ) have already been folded into alpha_x.
+	if ( bli_is_noconj( conja ) )
+	{
+		// Compute front edge cases if a and y were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += 1;
+			}
+			yp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, and the b_n loop should be fully unrolled. The addresses in
+		// ap[] and yp are guaranteed to be aligned to
+		// BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpys( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += 1;
+			}
+			yp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conja ) )
+	{
+		// Compute front edge cases if a and y were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += 1;
+			}
+			yp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, and the b_n loop should be fully unrolled. The addresses in
+		// ap[] and yp are guaranteed to be aligned to
+		// BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			yp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzaxpyjs( alpha_x[ j ], *ap[ j ], *yp );
+
+				ap[ j ] += 1;
+			}
+			yp += 1;
+		}
+	}
+
+}
+
+
+
+//
+// Define BLAS-like axpyf interfaces with heterogeneous-typed operands.
+//
+// No optimized mixed-type code is provided: each function generated from
+// GENTFUNC3U12 under the name varname simply forwards all of its arguments
+// to the function generated under the name kername (here, the reference
+// variant axpyf_unb_var1). The homogeneous s/d/c/z instances are written
+// out by hand above, so only the mixed-domain and mixed-precision sets are
+// instantiated below.
+//
+#undef  GENTFUNC3U12
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
+\
+void PASTEMAC3(cha,chx,chy,varname) \
+     ( \
+       conj_t             conja, \
+       conj_t             conjx, \
+       dim_t              m, \
+       dim_t              b_n, \
+       ctype_ax* restrict alpha, \
+       ctype_a*  restrict a, inc_t inca, inc_t lda, \
+       ctype_x*  restrict x, inc_t incx, \
+       ctype_y*  restrict y, inc_t incy \
+     ) \
+{ \
+	/* Forward everything to the reference implementation. */ \
+	PASTEMAC3(cha,chx,chy,kername)( conja, conjx, m, b_n, \
+	                                alpha, \
+	                                a, inca, lda, \
+	                                x, incx, \
+	                                y, incy ); \
+}
+
+#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
+#endif
+
+#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_axpyf_opt_var1.h
+++ b/config/template/kernels/1f/bli_axpyf_opt_var1.h
@@ -0,0 +1,62 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype axpyf kernel interfaces.
+//
+#undef  GENTPROT3U12
+#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
+\
+void PASTEMAC3(cha,chx,chy,varname) \
+     ( \
+       conj_t             conja, \
+       conj_t             conjx, \
+       dim_t              m, \
+       dim_t              b_n, \
+       ctype_ax* restrict alpha, \
+       ctype_a*  restrict a, inc_t inca, inc_t lda, \
+       ctype_x*  restrict x, inc_t incx, \
+       ctype_y*  restrict y, inc_t incy \
+     );
+
+// Homogeneous-type prototypes are always declared.
+INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 )
+
+// Mixed-domain prototypes are declared only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
+INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 )
+#endif
+
+// Mixed-precision prototypes are declared only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
+INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c
+++ b/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c
@@ -0,0 +1,470 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sssdotaxpyv_opt_var1
+     (
+       conj_t          conjxt,
+       conj_t          conjx,
+       conj_t          conjy,
+       dim_t           n,
+       float* restrict alpha,
+       float* restrict x, inc_t incx,
+       float* restrict y, inc_t incy,
+       float* restrict rho,
+       float* restrict z, inc_t incz
+     )
+{
+	// No single-precision real code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_sssdotaxpyv_unb_var1
+	(
+	  conjxt, conjx, conjy,
+	  n,
+	  alpha,
+	  x, incx,
+	  y, incy,
+	  rho,
+	  z, incz
+	);
+}
+
+
+
+void bli_ddddotaxpyv_opt_var1
+     (
+       conj_t           conjxt,
+       conj_t           conjx,
+       conj_t           conjy,
+       dim_t            n,
+       double* restrict alpha,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict rho,
+       double* restrict z, inc_t incz
+     )
+{
+	// No double-precision real code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_ddddotaxpyv_unb_var1
+	(
+	  conjxt, conjx, conjy,
+	  n,
+	  alpha,
+	  x, incx,
+	  y, incy,
+	  rho,
+	  z, incz
+	);
+}
+
+
+
+void bli_cccdotaxpyv_opt_var1
+     (
+       conj_t             conjxt,
+       conj_t             conjx,
+       conj_t             conjy,
+       dim_t              n,
+       scomplex* restrict alpha,
+       scomplex* restrict x, inc_t incx,
+       scomplex* restrict y, inc_t incy,
+       scomplex* restrict rho,
+       scomplex* restrict z, inc_t incz
+     )
+{
+	// No single-precision complex code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_cccdotaxpyv_unb_var1
+	(
+	  conjxt, conjx, conjy,
+	  n,
+	  alpha,
+	  x, incx,
+	  y, incy,
+	  rho,
+	  z, incz
+	);
+}
+
+
+
+void bli_zzzdotaxpyv_opt_var1( conj_t             conjxt,
+                               conj_t             conjx,
+                               conj_t             conjy,
+                               dim_t              n,
+                               dcomplex* restrict alpha,
+                               dcomplex* restrict x, inc_t incx,
+                               dcomplex* restrict y, inc_t incy,
+                               dcomplex* restrict rho,
+                               dcomplex* restrict z, inc_t incz )
+{
+/*
+  Template dotaxpyv kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel fuses a dotv and axpyv operation:
+
+    rho := conjxt( x^T ) * conjy( y )
+    z   := z + alpha * conjx( x )
+
+  where x, y, and z are vectors of length n and alpha and rho are scalars.
+
+  Parameters:
+
+  - conjxt: Compute with conjugated values of x^T?
+  - conjx:  Compute with conjugated values of x?
+  - conjy:  Compute with conjugated values of y?
+  - n:      The number of elements in vectors x, y, and z.
+  - alpha:  The address of the scalar to be applied to x.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. incx should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y. incy should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - rho:    The address of the output scalar of the dotv subproblem.
+  - z:      The address of vector z.
+  - incz:   The vector increment of z. incz should be unit unless the
+            implementation makes special accommodation for non-unit values.
+
+  If n is zero, rho is set to zero and z is left untouched.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Any of the strides incx, incy, or incz is non-unit.
+  - Vectors x, y, and z are unaligned with different offsets.
+  - Vectors x, y, and z share a misalignment offset that is not a whole
+    number of elements, so that no amount of front-edge peeling could ever
+    reach an aligned address.
+
+  If the vectors are aligned, or unaligned by the same offset, then optimized
+  code can be used for the bulk of the computation. This template shows how
+  the front-edge case can be handled so that the remaining computation is
+  aligned. (This template intends alignment in the main loops to be
+  BLIS_SIMD_ALIGN_SIZE, which is defined in bli_config.h.) The number of
+  front-edge elements never exceeds n, so short vectors are processed
+  entirely by the front-edge loop.
+
+  Here are a few additional things to consider:
+
+  - Conjugation of y is never implemented directly. Instead, it is induced
+    by toggling the effective conjugation of x^T and then conjugating the
+    dot product result. The four code paths below therefore cover the
+    combinations of conjx and the effective conjugation of x^T.
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *x );
+
+	dcomplex*   xp;
+	dcomplex*   yp;
+	dcomplex*   zp;
+	dcomplex    dotxy;
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       n_pre           = 0;
+	dim_t       n_iter;
+	dim_t       n_left;
+
+	dim_t       off_x, off_y, off_z;
+	dim_t       i;
+
+	conj_t      conjxt_use;
+
+
+	// If the vector lengths are zero, set rho to zero and return.
+	if ( bli_zero_dim1( n ) )
+	{
+		bli_zset0s( *rho );
+		return;
+	}
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( bli_has_nonunit_inc3( incx, incy, incz ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If x, y, and z are unaligned by the same offset, then we can
+		// still use an implementation that depends on alignment for most
+		// of the operation.
+		off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
+		off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
+		off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );
+
+		// NOTE(review): the computation of n_pre below assumes that
+		// bli_offset_from_alignment() returns the number of bytes by which
+		// the address lies past the previous aligned boundary -- confirm
+		// against its definition. The numerical result does not depend on
+		// this assumption since all three loops below perform the same
+		// scalar updates; only the alignment of the main loop does.
+		if ( off_x == off_y && off_x == off_z &&
+		     off_x % type_size == 0 )
+		{
+			use_ref = FALSE;
+
+			// Peel as many elements as are needed to reach the next
+			// aligned address (not the number of elements already past
+			// the previous one).
+			n_pre   = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;
+
+			// Never peel more elements than the vectors contain.
+			if ( n_pre > n ) n_pre = n;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzdotaxpyv_unb_var1( conjxt,
+		                          conjx,
+		                          conjy,
+		                          n,
+		                          alpha,
+		                          x, incx,
+		                          y, incy,
+		                          rho,
+		                          z, incz );
+		return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations.
+	// Since n_pre <= n, neither quantity can be negative.
+	n_iter = ( n - n_pre ) / n_elem_per_iter;
+	n_left = ( n - n_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into x, y, and z.
+	xp = x;
+	yp = y;
+	zp = z;
+
+
+	// Initialize accumulator to zero.
+	bli_zset0s( dotxy );
+
+
+	conjxt_use = conjxt;
+
+	// If y must be conjugated, we compute the result indirectly by first
+	// toggling the effective conjugation of xt and then conjugating the
+	// resulting dot product.
+	if ( bli_is_conj( conjy ) )
+		bli_toggle_conj( conjxt_use );
+
+
+	// Iterate over elements of x, y, and z to compute:
+	//   rho = conjxt( x^T ) * conjy( y );
+	//   z  += alpha * conjx( x );
+	if ( bli_is_noconj( conjx ) && bli_is_noconj( conjxt_use ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha should be loaded once prior to the n_iter loop, dotxy
+		// should be initialized and kept in registers, and each element of
+		// x should be loaded only once. The addresses xp, yp, and zp are
+		// intended to be aligned to BLIS_SIMD_ALIGN_SIZE at this point.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_noconj( conjx ) && bli_is_conj( conjxt_use ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha should be loaded once prior to the n_iter loop, dotxy
+		// should be initialized and kept in registers, and each element of
+		// x should be loaded only once. The addresses xp, yp, and zp are
+		// intended to be aligned to BLIS_SIMD_ALIGN_SIZE at this point.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpys( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_conj( conjx ) && bli_is_noconj( conjxt_use ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha should be loaded once prior to the n_iter loop, dotxy
+		// should be initialized and kept in registers, and each element of
+		// x should be loaded only once. The addresses xp, yp, and zp are
+		// intended to be aligned to BLIS_SIMD_ALIGN_SIZE at this point.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdots( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conjx ) && bli_is_conj( conjxt_use ) )
+	{
+		// Compute front edge cases if x, y, and z were unaligned.
+		for ( i = 0; i < n_pre; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// alpha should be loaded once prior to the n_iter loop, dotxy
+		// should be initialized and kept in registers, and each element of
+		// x should be loaded only once. The addresses xp, yp, and zp are
+		// intended to be aligned to BLIS_SIMD_ALIGN_SIZE at this point.
+		for ( i = 0; i < n_iter; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += n_elem_per_iter;
+			yp += n_elem_per_iter;
+			zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < n_left; ++i )
+		{
+			bli_zzzdotjs( *xp, *yp, dotxy );
+			bli_zzzaxpyjs( *alpha, *xp, *zp );
+
+			xp += 1; yp += 1; zp += 1;
+		}
+	}
+
+	// If conjugation on y was requested, we induce it by conjugating
+	// the accumulated dot product before it is stored to rho.
+	if ( bli_is_conj( conjy ) )
+		bli_zconjs( dotxy );
+
+	bli_zzcopys( dotxy, *rho );
+}
+
+
+//
+// Define BLAS-like interfaces with heterogeneous-typed operands.
+//
+#undef  GENTFUNC3U12
+#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
+\
+/* Mixed-type dotaxpyv entry point. No specialized code is provided here; */ \
+/* every argument is forwarded, unchanged and in order, to kername.       */ \
+void PASTEMAC3(chx,chy,chz,varname) \
+     ( \
+       conj_t             conjxt, \
+       conj_t             conjx, \
+       conj_t             conjy, \
+       dim_t              n, \
+       ctype_x*  restrict alpha, \
+       ctype_x*  restrict x, inc_t incx, \
+       ctype_y*  restrict y, inc_t incy, \
+       ctype_xy* restrict rho, \
+       ctype_z*  restrict z, inc_t incz \
+     ) \
+{ \
+	PASTEMAC3(chx,chy,chz,kername) \
+	( \
+	  conjxt, conjx, conjy, \
+	  n, \
+	  alpha, \
+	  x, incx, \
+	  y, incy, \
+	  rho, \
+	  z, incz \
+	); \
+}
+
+// Instantiate the mixed-domain wrappers only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
+#endif
+
+// Instantiate the mixed-precision wrappers only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
+INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.h
+++ b/config/template/kernels/1f/bli_dotaxpyv_opt_var1.h
@@ -0,0 +1,60 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype dotaxpyv kernel interfaces.
+//
+#undef  GENTPROT3U12
+#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \
+\
+void PASTEMAC3(chx,chy,chz,varname) \
+     ( \
+       conj_t             conjxt, \
+       conj_t             conjx, \
+       conj_t             conjy, \
+       dim_t              n, \
+       ctype_x*  restrict alpha, \
+       ctype_x*  restrict x, inc_t incx, \
+       ctype_y*  restrict y, inc_t incy, \
+       ctype_xy* restrict rho, \
+       ctype_z*  restrict z, inc_t incz \
+     );
+
+// Homogeneous-type prototypes are always declared.
+INSERT_GENTPROT3U12_BASIC( dotaxpyv_opt_var1 )
+
+// Mixed-domain prototypes are declared only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
+INSERT_GENTPROT3U12_MIX_D( dotaxpyv_opt_var1 )
+#endif
+
+// Mixed-precision prototypes are declared only when that support is enabled.
+#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
+INSERT_GENTPROT3U12_MIX_P( dotaxpyv_opt_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c
+++ b/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c
@@ -0,0 +1,610 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sssdotxaxpyf_opt_var1
+     (
+       conj_t          conjat,
+       conj_t          conja,
+       conj_t          conjw,
+       conj_t          conjx,
+       dim_t           m,
+       dim_t           b_n,
+       float* restrict alpha,
+       float* restrict a, inc_t inca, inc_t lda,
+       float* restrict w, inc_t incw,
+       float* restrict x, inc_t incx,
+       float* restrict beta,
+       float* restrict y, inc_t incy,
+       float* restrict z, inc_t incz
+     )
+{
+	// No single-precision real code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_sssdotxaxpyf_unb_var1
+	(
+	  conjat, conja, conjw, conjx,
+	  m, b_n,
+	  alpha,
+	  a, inca, lda,
+	  w, incw,
+	  x, incx,
+	  beta,
+	  y, incy,
+	  z, incz
+	);
+}
+
+
+
+void bli_ddddotxaxpyf_opt_var1
+     (
+       conj_t           conjat,
+       conj_t           conja,
+       conj_t           conjw,
+       conj_t           conjx,
+       dim_t            m,
+       dim_t            b_n,
+       double* restrict alpha,
+       double* restrict a, inc_t inca, inc_t lda,
+       double* restrict w, inc_t incw,
+       double* restrict x, inc_t incx,
+       double* restrict beta,
+       double* restrict y, inc_t incy,
+       double* restrict z, inc_t incz
+     )
+{
+	// No double-precision real code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_ddddotxaxpyf_unb_var1
+	(
+	  conjat, conja, conjw, conjx,
+	  m, b_n,
+	  alpha,
+	  a, inca, lda,
+	  w, incw,
+	  x, incx,
+	  beta,
+	  y, incy,
+	  z, incz
+	);
+}
+
+
+
+void bli_cccdotxaxpyf_opt_var1
+     (
+       conj_t             conjat,
+       conj_t             conja,
+       conj_t             conjw,
+       conj_t             conjx,
+       dim_t              m,
+       dim_t              b_n,
+       scomplex* restrict alpha,
+       scomplex* restrict a, inc_t inca, inc_t lda,
+       scomplex* restrict w, inc_t incw,
+       scomplex* restrict x, inc_t incx,
+       scomplex* restrict beta,
+       scomplex* restrict y, inc_t incy,
+       scomplex* restrict z, inc_t incz
+     )
+{
+	// No single-precision complex code is provided by this template; all
+	// arguments are handed, unchanged, to the reference implementation.
+	bli_cccdotxaxpyf_unb_var1
+	(
+	  conjat, conja, conjw, conjx,
+	  m, b_n,
+	  alpha,
+	  a, inca, lda,
+	  w, incw,
+	  x, incx,
+	  beta,
+	  y, incy,
+	  z, incz
+	);
+}
+
+
+
+void bli_zzzdotxaxpyf_opt_var1( conj_t             conjat,
+                                conj_t             conja,
+                                conj_t             conjw,
+                                conj_t             conjx,
+                                dim_t              m,
+                                dim_t              b_n,
+                                dcomplex* restrict alpha,
+                                dcomplex* restrict a, inc_t inca, inc_t lda,
+                                dcomplex* restrict w, inc_t incw,
+                                dcomplex* restrict x, inc_t incx,
+                                dcomplex* restrict beta,
+                                dcomplex* restrict y, inc_t incy,
+                                dcomplex* restrict z, inc_t incz )
+
+{
+/*
+  Template dotxaxpyf kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel performs the following two gemv-like operations:
+
+    y := beta * y + alpha * conjat( A^T ) * conjw( w )
+    z :=        z + alpha * conja( A )    * conjx( x )
+
+  where A is an m x b_n matrix, x and y are vector of length b_n, w and z
+  are vectors of length m, and alpha and beta are scalars. The operation
+  fuses a dotxf and an axpyf operation, and therefore A should be column-
+  stored.
+
+  Parameters:
+
+  - conjat: Compute with conjugated values of A^T?
+  - conja:  Compute with conjugated values of A?
+  - conjw:  Compute with conjugated values of w?
+  - conjx:  Compute with conjugated values of x?
+  - m:      The number of rows in matrix A.
+  - b_n:    The number of columns in matrix A. Must be equal to or less than
+            the fusing factor.
+  - alpha:  The address of the scalar to be applied to A^T*w and A*x.
+  - a:      The address of matrix A.
+  - inca:   The row stride of A. inca should be unit unless the
+            implementation makes special accomodation for non-unit values.
+  - lda:    The column stride of A.
+  - w:      The address of vector w.
+  - incw:   The vector increment of w. incw should be unit unless the
+            implementation makes special accomodation for non-unit values.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x.
+  - beta:   The address of the scalar to be applied to y.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y.
+  - z:      The address of vector z.
+  - incz:   The vector increment of z. incz should be unit unless the
+            implementation makes special accomodation for non-unit values.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Any of the strides inca, incw, or incz is non-unit.
+  - The address of A, the second column of A, w, and z are unaligned with
+    different offsets.
+
+  If the first/second rows of A and addresses of w and z are aligned, or
+  unaligned by the same offset, then optimized code can be used for the bulk
+  of the computation. This template shows how the front-edge case can be
+  handled so that the remaining computation is aligned. (This template
+  guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which
+  is defined in bli_config.h.)
+
+  Additional things to consider:
+
+  - When optimizing, you should fully unroll the loops over b_n. This is the
+    dimension across which we are fusing dotxv operations.
+  - This template code chooses to call the reference implementation whenever
+    b_n is less than the fusing factor, so as to avoid having to handle edge
+    cases. One may choose to optimize this edge case, if desired.
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *a );
+
+	dcomplex*   ap[ bli_zdotxaxpyf_fusefac ];
+	dcomplex*   xp[ bli_zdotxaxpyf_fusefac ];
+	dcomplex*   yp[ bli_zdotxaxpyf_fusefac ];
+	dcomplex*   wp;
+	dcomplex*   zp;
+
+	dcomplex    At_w[ bli_zdotxaxpyf_fusefac ];
+	dcomplex    alpha_x[ bli_zdotxaxpyf_fusefac ];
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       m_pre           = 0;
+	dim_t       m_iter;
+	dim_t       m_left;
+
+	dim_t       off_a, off_a2, off_w, off_z;
+	dim_t       i, j;
+
+	conj_t      conjat_use;
+
+
+	// Return early if possible.
+	if ( bli_zero_dim2( m, b_n ) ) return;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( b_n < bli_zdotxaxpyf_fusefac )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_has_nonunit_inc3( inca, incw, incz ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( w,     BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( z,     BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If a, the second column of a, w, and z are unaligned by the same
+		// offset, then we can still use an implementation that depends on
+		// alignment for most of the operation.
+		off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
+		off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
+		off_w  = bli_offset_from_alignment( w,     BLIS_SIMD_ALIGN_SIZE );
+		off_z  = bli_offset_from_alignment( z,     BLIS_SIMD_ALIGN_SIZE );
+
+		if ( off_a == off_a2 && off_a == off_w && off_a == off_z )
+		{
+			use_ref = FALSE;
+			m_pre   = off_a / type_size;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzdotxaxpyf_unb_var1( conjat,
+		                           conja,
+		                           conjw,
+		                           conjx,
+		                           m,
+		                           b_n,
+		                           alpha,
+		                           a, inca, lda,
+		                           w, incw,
+		                           x, incx,
+		                           beta,
+		                           y, incy,
+		                           z, incz );
+        return;
+	}
+
+
+	// Compute the number of unrolled and leftover (edge) iterations.
+	m_iter = ( m - m_pre ) / n_elem_per_iter;
+	m_left = ( m - m_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into the columns of A and elements of x.
+	for ( j = 0; j < b_n; ++j )
+	{
+		ap[ j ] = a + (j  )*lda;
+		xp[ j ] = x + (j  )*incx;
+		yp[ j ] = y + (j  )*incy;
+	}
+	wp = w;
+	zp = z;
+
+	// Load elements of x or conj(x) into alpha_x and scale by alpha.
+	if ( bli_is_noconj( conjx ) )
+	{
+		for ( j = 0; j < b_n; ++j )
+		{
+			bli_zzcopys( *xp[ j ], alpha_x[ j ] );
+			bli_zzscals( *alpha, alpha_x[ j ] );
+		}
+	}
+	else // if ( bli_is_conj( conjx ) )
+	{
+		for ( j = 0; j < b_n; ++j )
+		{
+			bli_zzcopyjs( *xp[ j ], alpha_x[ j ] );
+			bli_zzscals( *alpha, alpha_x[ j ] );
+		}
+	}
+
+	// Initialize our accumulators to zero.
+	for ( j = 0; j < b_n; ++j )
+	{
+		bli_zset0s( At_w[ j ] );
+	}
+
+
+	conjat_use = conjat;
+
+	// If w must be conjugated, we compute the result indirectly by first
+	// toggling the effective conjugation of At and then conjugating the
+	// resulting dot products.
+	if ( bli_is_conj( conjw ) )
+		bli_toggle_conj( conjat_use );
+
+
+	// Iterate over the columns of A and elements of w and z to compute:
+	//   y = beta * y + alpha * conjat( A^T ) * conjw( w );
+    //   z =        z + alpha * conja( A )    * conjx( x );
+	// where A is m x b_n.
+	if ( bli_is_noconj( conja ) && bli_is_noconj( conjat_use ) )
+	{
+		// Compute front edge cases if A, w, and z were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, At_w should be kept in registers, and the b_n loop should
+		// be fully unrolled. The addresses in ap[], wp, and zp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			wp += n_elem_per_iter; zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_noconj( conja ) && bli_is_conj( conjat_use ) )
+	{
+		// Compute front edge cases if A, w, and z were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, At_w should be kept in registers, and the b_n loop should
+		// be fully unrolled. The addresses in ap[], wp, and zp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			wp += n_elem_per_iter; zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdots( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_conj( conja ) && bli_is_noconj( conjat_use ) )
+	{
+		// Compute front edge cases if A, w, and z were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, At_w should be kept in registers, and the b_n loop should
+		// be fully unrolled. The addresses in ap[], wp, and zp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			wp += n_elem_per_iter; zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdots( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+	}
+	else if ( bli_is_conj( conja ) && bli_is_conj( conjat_use ) )
+	{
+		// Compute front edge cases if A, w, and z were unaligned.
+		for ( i = 0; i < m_pre; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of alpha_x should be loaded once prior to the m_iter
+		// loop, At_w should be kept in registers, and the b_n loop should
+		// be fully unrolled. The addresses in ap[], wp, and zp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( i = 0; i < m_iter; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += n_elem_per_iter;
+			}
+			wp += n_elem_per_iter; zp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( i = 0; i < m_left; ++i )
+		{
+			for ( j = 0; j < b_n; ++j )
+			{
+				bli_zzzdotjs( *ap[ j ], *wp, At_w[ j ] );
+				bli_zzzdotjs( *ap[ j ], alpha_x[ j ], *zp );
+
+				ap[ j ] += 1;
+			}
+			wp += 1; zp += 1;
+		}
+	}
+
+
+	// If conjugation on w was requested, we induce it by conjugating
+	// the contents of At_w.
+	if ( bli_is_conj( conjw ) )
+	{
+		for ( j = 0; j < b_n; ++j )
+		{
+			bli_zconjs( At_w[ j ] );
+		}
+	}
+
+	// Scale the At_w product by alpha and accumulate into y after
+	// scaling by beta.
+	for ( j = 0; j < b_n; ++j )
+	{
+		bli_zzscals( *beta, *yp[ j ] );
+		bli_zzzaxpys( *alpha, At_w[ j ], *yp[ j ] );
+	}
+}
+
+
+
+// Define BLAS-like interfaces with heterogeneous-typed operands. These
+// are not optimized; each one forwards to the kername variant generated
+// for the same (cha,chb,chc) combination of operand type characters.
+#undef  GENTFUNC3U12
+#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
+\
+void PASTEMAC3(cha,chb,chc,varname)( \
+                                     conj_t             conjat, \
+                                     conj_t             conja, \
+                                     conj_t             conjw, \
+                                     conj_t             conjx, \
+                                     dim_t              m, \
+                                     dim_t              b_n, \
+                                     ctype_ab* restrict alpha, \
+                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
+                                     ctype_b*  restrict w, inc_t incw, \
+                                     ctype_b*  restrict x, inc_t incx, \
+                                     ctype_c*  restrict beta, \
+                                     ctype_c*  restrict y, inc_t incy, \
+                                     ctype_c*  restrict z, inc_t incz \
+                                   ) \
+{ \
+	/* Forward to the reference variant; paste the macro's own type chars. */ \
+	PASTEMAC3(cha,chb,chc,kername)( conjat, \
+	                                conja, \
+	                                conjw, \
+	                                conjx, \
+	                                m, \
+	                                b_n, \
+	                                alpha, \
+	                                a, inca, lda, \
+	                                w, incw, \
+	                                x, incx, \
+	                                beta, \
+	                                y, incy, \
+	                                z, incz ); \
+}
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h
+++ b/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Prototype dotxaxpyf kernel interfaces.
+#undef  GENTPROT3U12
+#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
+\
+void PASTEMAC3(cha,chb,chc,varname)( \
+       /* Conjugation flags for A^T, A, w, and x. */ \
+       conj_t conjat, conj_t conja, conj_t conjw, conj_t conjx, \
+       /* Row count (m) and column count (b_n) of A. */ \
+       dim_t m, dim_t b_n, \
+       ctype_ab* restrict alpha, \
+       ctype_a*  restrict a, inc_t inca, inc_t lda, \
+       ctype_b*  restrict w, inc_t incw, \
+       ctype_b*  restrict x, inc_t incx, \
+       ctype_c*  restrict beta, \
+       ctype_c*  restrict y, inc_t incy, \
+       ctype_c*  restrict z, inc_t incz \
+     );
+
+// Emit the prototypes: the basic set unconditionally, ...
+INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )
+
+// ... and the mixed-domain / mixed-precision sets only when enabled.
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotxf_opt_var1.c
+++ b/config/template/kernels/1f/bli_dotxf_opt_var1.c
@@ -0,0 +1,456 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sssdotxf_opt_var1(
+                            conj_t             conjat,
+                            conj_t             conjx,
+                            dim_t              m,
+                            dim_t              b_n,
+                            float*    restrict alpha,
+                            float*    restrict a, inc_t inca, inc_t lda,
+                            float*    restrict x, inc_t incx,
+                            float*    restrict beta,
+                            float*    restrict y, inc_t incy
+                          )
+{
+	// The template carries no optimized code for the float instance of
+	// dotxf, so every argument is handed unchanged to the reference
+	// implementation, bli_sssdotxf_unb_var1.
+	bli_sssdotxf_unb_var1( conjat, conjx,
+	                       m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       beta,
+	                       y, incy );
+}
+
+
+
+void bli_ddddotxf_opt_var1(
+                            conj_t             conjat,
+                            conj_t             conjx,
+                            dim_t              m,
+                            dim_t              b_n,
+                            double*   restrict alpha,
+                            double*   restrict a, inc_t inca, inc_t lda,
+                            double*   restrict x, inc_t incx,
+                            double*   restrict beta,
+                            double*   restrict y, inc_t incy
+                          )
+{
+	// The template carries no optimized code for the double instance of
+	// dotxf, so every argument is handed unchanged to the reference
+	// implementation, bli_ddddotxf_unb_var1.
+	bli_ddddotxf_unb_var1( conjat, conjx,
+	                       m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       beta,
+	                       y, incy );
+}
+
+
+
+void bli_cccdotxf_opt_var1(
+                            conj_t             conjat,
+                            conj_t             conjx,
+                            dim_t              m,
+                            dim_t              b_n,
+                            scomplex* restrict alpha,
+                            scomplex* restrict a, inc_t inca, inc_t lda,
+                            scomplex* restrict x, inc_t incx,
+                            scomplex* restrict beta,
+                            scomplex* restrict y, inc_t incy
+                          )
+{
+	// The template carries no optimized code for the scomplex instance
+	// of dotxf, so every argument is handed unchanged to the reference
+	// implementation, bli_cccdotxf_unb_var1.
+	bli_cccdotxf_unb_var1( conjat, conjx,
+	                       m, b_n,
+	                       alpha,
+	                       a, inca, lda,
+	                       x, incx,
+	                       beta,
+	                       y, incy );
+}
+
+
+
+void bli_zzzdotxf_opt_var1(
+                            conj_t             conjat,
+                            conj_t             conjx,
+                            dim_t              m,
+                            dim_t              b_n,
+                            dcomplex* restrict alpha,
+                            dcomplex* restrict a, inc_t inca, inc_t lda,
+                            dcomplex* restrict x, inc_t incx,
+                            dcomplex* restrict beta,
+                            dcomplex* restrict y, inc_t incy
+                          )
+{
+/*
+  Template dotxf kernel implementation
+
+  This function contains a template implementation for a double-precision
+  complex kernel, coded in C, which can serve as the starting point for one
+  to write an optimized kernel on an arbitrary architecture. (We show a
+  template implementation for only double-precision complex because the
+  templates for the other three floating-point types would be similar, with
+  the real instantiations being noticeably simpler due to the disappearance
+  of conjugation in the real domain.)
+
+  This kernel performs the following gemv-like operation:
+
+    y := beta * y + alpha * conjat( A^T ) * conjx( x )
+
+  where A is an m x b_n matrix, x is a vector of length m, y is a vector
+  of length b_n, and alpha and beta are scalars. The operation is performed
+  as a series of fused dotxv operations, and therefore A should be column-
+  stored.
+
+  Parameters:
+
+  - conjat: Compute with conjugated values of A^T?
+  - conjx:  Compute with conjugated values of x?
+  - m:      The number of rows in matrix A.
+  - b_n:    The number of columns in matrix A. Must be equal to or less than
+            the fusing factor.
+  - alpha:  The address of the scalar to be applied to A^T*x.
+  - a:      The address of matrix A.
+  - inca:   The row stride of A. inca should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - lda:    The column stride of A.
+  - x:      The address of vector x.
+  - incx:   The vector increment of x. incx should be unit unless the
+            implementation makes special accommodation for non-unit values.
+  - beta:   The address of the scalar to be applied to y.
+  - y:      The address of vector y.
+  - incy:   The vector increment of y.
+
+  This template code calls the reference implementation if any of the
+  following conditions are true:
+
+  - Either of the strides inca or incx is non-unit.
+  - A, the second column of A, and x are unaligned with different offsets,
+    or with a common offset that is not a multiple of the element size.
+
+  If the first/second columns of A and address of x are aligned, or unaligned
+  by the same offset, then optimized code can be used for the bulk of the
+  computation. This template shows how the front-edge case can be handled so
+  that the remaining computation is aligned. (This template guarantees
+  alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE, which is defined
+  in bli_config.h.)
+
+  Additional things to consider:
+
+  - When optimizing, you should fully unroll the loops over b_n. This is the
+    dimension across which we are fusing dotxv operations.
+  - This template code chooses to call the reference implementation whenever
+    b_n is less than the fusing factor, so as to avoid having to handle edge
+    cases. One may choose to optimize this edge case, if desired.
+  - Because conjugation disappears in the real domain, real instances of
+    this kernel can safely ignore the values of any conjugation parameters,
+    thereby simplifying the implementation.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t n_elem_per_reg  = 1;
+	const dim_t n_iter_unroll   = 1;
+
+	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
+	const siz_t type_size       = sizeof( *x );
+
+	dcomplex*   ap[ bli_zdotxf_fusefac ];
+	dcomplex*   xp;
+	dcomplex*   yp[ bli_zdotxf_fusefac ];
+
+	dcomplex    Atx[ bli_zdotxf_fusefac ];
+
+	bool_t      use_ref         = FALSE;
+
+	dim_t       m_pre           = 0;
+	dim_t       m_iter;
+	dim_t       m_left;
+
+	dim_t       off_a, off_a2, off_x;
+	dim_t       i, j;
+
+	conj_t      conjat_use;
+
+
+	// Return early if possible.
+	if ( bli_zero_dim1( b_n ) ) return;
+
+	// If the vector lengths are zero, scale y by beta and return.
+	if ( bli_zero_dim1( m ) )
+	{
+		bli_zzscalv( BLIS_NO_CONJUGATE,
+		             b_n,
+		             beta,
+		             y, incy );
+		return;
+	}
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( b_n < bli_zdotxf_fusefac )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_has_nonunit_inc2( inca, incx ) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
+	          bli_is_unaligned_to( x,     BLIS_SIMD_ALIGN_SIZE ) )
+	{
+		use_ref = TRUE;
+
+		// If a, the second column of a, and x share one offset that is a
+		// whole number of elements, peel off the elements that precede
+		// the next aligned address (m_pre) and run the rest aligned.
+		off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
+		off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
+		off_x  = bli_offset_from_alignment( x,     BLIS_SIMD_ALIGN_SIZE );
+
+		if ( off_a == off_a2 && off_a == off_x && off_x % type_size == 0 )
+		{
+			use_ref = FALSE;
+			m_pre   = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / type_size;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_zzzdotxf_unb_var1( conjat,
+		                       conjx,
+		                       m,
+		                       b_n,
+		                       alpha,
+		                       a, inca, lda,
+		                       x, incx,
+		                       beta,
+		                       y, incy );
+		return;
+	}
+
+	// Clamp the front edge to m, then count unrolled and leftover iterations.
+	if ( m_pre > m ) m_pre = m;
+	m_iter = ( m - m_pre ) / n_elem_per_iter;
+	m_left = ( m - m_pre ) % n_elem_per_iter;
+
+
+	// Initialize pointers into the columns of A and elements of y.
+	for ( i = 0; i < b_n; ++i )
+	{
+		ap[ i ] = a + (i  )*lda;
+		yp[ i ] = y + (i  )*incy;
+	}
+	xp = x;
+
+
+	// Initialize our accumulators to zero.
+	for ( i = 0; i < b_n; ++i )
+	{
+		bli_zset0s( Atx[ i ] );
+	}
+
+
+	conjat_use = conjat;
+
+	// If x must be conjugated, we compute the result indirectly by first
+	// toggling the effective conjugation of A and then conjugating the
+	// resulting product A^T*x.
+	if ( bli_is_conj( conjx ) )
+		bli_toggle_conj( conjat_use );
+
+
+	// Iterate over the columns of A and elements of x to compute:
+	//   Atx = conjat_use( A^T ) * x;
+	if ( bli_is_noconj( conjat_use ) )
+	{
+		// Compute front edge cases if A and x were unaligned.
+		for ( j = 0; j < m_pre; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += 1;
+			}
+			xp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of Atx should be kept in registers, and the b_n loop
+		// should be fully unrolled. The addresses in ap[] and xp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( j = 0; j < m_iter; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += n_elem_per_iter;
+			}
+			xp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( j = 0; j < m_left; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdots( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += 1;
+			}
+			xp += 1;
+		}
+	}
+	else // if ( bli_is_conj( conjat_use ) )
+	{
+		// Compute front edge cases if A and x were unaligned.
+		for ( j = 0; j < m_pre; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += 1;
+			}
+			xp += 1;
+		}
+
+		// The bulk of the operation is executed here. For best performance,
+		// the elements of Atx should be kept in registers, and the b_n loop
+		// should be fully unrolled. The addresses in ap[] and xp are
+		// guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
+		for ( j = 0; j < m_iter; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += n_elem_per_iter;
+			}
+			xp += n_elem_per_iter;
+		}
+
+		// Compute tail edge cases, if applicable.
+		for ( j = 0; j < m_left; ++j )
+		{
+			for ( i = 0; i < b_n; ++i )
+			{
+				bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] );
+
+				ap[ i ] += 1;
+			}
+			xp += 1;
+		}
+	}
+
+
+	// If conjugation on x was requested, we induce it by conjugating
+	// the contents of Atx.
+	if ( bli_is_conj( conjx ) )
+	{
+		for ( i = 0; i < b_n; ++i )
+		{
+			bli_zconjs( Atx[ i ] );
+		}
+	}
+
+
+	// Scale the Atx product by alpha and accumulate into y after
+	// scaling by beta.
+	for ( i = 0; i < b_n; ++i )
+	{
+		bli_zzscals( *beta, *yp[ i ] );
+		bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] );
+	}
+}
+
+
+
+// Define BLAS-like interfaces with heterogeneous-typed operands. These
+// are not optimized; each one forwards to the kername variant generated
+// for the same (cha,chx,chy) combination of operand type characters.
+#undef  GENTFUNC3U12
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
+\
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t             conjat, \
+                                     conj_t             conjx, \
+                                     dim_t              m, \
+                                     dim_t              b_n, \
+                                     ctype_ax* restrict alpha, \
+                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
+                                     ctype_x*  restrict x, inc_t incx, \
+                                     ctype_y*  restrict beta, \
+                                     ctype_y*  restrict y, inc_t incy \
+                                   ) \
+{ \
+	/* Forward to the reference variant; paste the macro's own type chars. */ \
+	PASTEMAC3(cha,chx,chy,kername)( conjat, \
+	                                conjx, \
+	                                m, \
+	                                b_n, \
+	                                alpha, \
+	                                a, inca, lda, \
+	                                x, incx, \
+	                                beta, \
+	                                y, incy ); \
+}
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
+#endif
+
--- a/config/template/kernels/1f/bli_dotxf_opt_var1.h
+++ b/config/template/kernels/1f/bli_dotxf_opt_var1.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the dotxf kernel for each (a, x, y) datatype combination. The
+// BASIC set is always declared; the mixed-domain and mixed-precision sets
+// are declared only when the matching BLIS_ENABLE_* macro is defined.
+#undef  GENTPROT3U12
+#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
+\
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t             conjat, \
+                                     conj_t             conjx, \
+                                     dim_t              m, \
+                                     dim_t              b_n, \
+                                     ctype_ax* restrict alpha, \
+                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
+                                     ctype_x*  restrict x, inc_t incx, \
+                                     ctype_y*  restrict beta, \
+                                     ctype_y*  restrict y, inc_t incy \
+                                   );
+
+INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
+
+#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 )
+#endif
+
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 )
+#endif
+
--- a/config/template/kernels/3/bli_gemm_opt_mxn.c
+++ b/config/template/kernels/3/bli_gemm_opt_mxn.c
@@ -0,0 +1,290 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sgemm_opt_mxn(
+                        dim_t              k,
+                        float*    restrict alpha,
+                        float*    restrict a,
+                        float*    restrict b,
+                        float*    restrict beta,
+                        float*    restrict c, inc_t rs_c, inc_t cs_c,
+                        float*    restrict a_next,
+                        float*    restrict b_next
+                      )
+{
+	/* Template stub: no optimized code is written out here for the
+	   single-precision real gemm micro-kernel, so every argument is
+	   forwarded, unchanged and in order, to bli_sgemm_ref_mxn. See
+	   bli_dgemm_opt_mxn in this file for a documented template. */
+	bli_sgemm_ref_mxn( k, alpha,
+	                   a, b,
+	                   beta,
+	                   c, rs_c, cs_c,
+	                   a_next, b_next );
+}
+
+
+
+void bli_dgemm_opt_mxn(
+                        dim_t              k,
+                        double*   restrict alpha,
+                        double*   restrict a,
+                        double*   restrict b,
+                        double*   restrict beta,
+                        double*   restrict c, inc_t rs_c, inc_t cs_c,
+                        double*   restrict a_next,
+                        double*   restrict b_next
+                      )
+{
+/*
+  Template gemm micro-kernel implementation (double-precision real)
+
+  This C implementation is meant to serve as the starting point for
+  writing an optimized gemm micro-kernel on an arbitrary architecture.
+  Only the double-precision real case is written out as a template
+  because the templates for the other three floating-point types would
+  be nearly identical.
+
+  Operation performed:
+
+    C := beta * C + alpha * A * B
+
+  with A of dimension MR x k, B of dimension k x NR, C of dimension
+  MR x NR, and scalars alpha and beta.
+
+  Outline of this implementation:
+
+    1. ab := 0
+    2. ab := ab + a(:,p) * b(p,:)   for each of the k columns of a and
+                                    the matching rows of b
+    3. ab := alpha * ab
+    4. c  := ab if beta is zero; otherwise c := beta * c + ab
+
+  Parameters:
+
+  - k:      Number of columns of A, which is also the number of rows of B.
+  - alpha:  Address of the scalar applied to the A*B product.
+  - a:      Address of an MR x k micro-panel of matrix A, stored by
+            columns.
+  - b:      Address of a k x NR micro-panel of matrix B, stored by rows.
+  - beta:   Address of the scalar applied to the input value of C.
+  - c:      Address of an MR x NR block of matrix C, stored according to
+            rs_c and cs_c.
+  - rs_c:   Row stride of C, ie: the distance to the next row, measured
+            in matrix elements.
+  - cs_c:   Column stride of C, ie: the distance to the next column,
+            measured in matrix elements.
+  - a_next: Address of the micro-panel of A that the next call to the
+            gemm micro-kernel will use.
+  - b_next: Address of the micro-panel of B that the next call to the
+            gemm micro-kernel will use.
+
+  Storage of the packed operands, illustrated for MR == NR == 4. The hex
+  digits give the order of the elements in memory. The layout of C is
+  not drawn because it is dictated by rs_c and cs_c.
+
+         c:             a:                         b:
+         _______        ______________________     _______
+        |       |      |0 4 8 C               |   |0 1 2 3|
+    MR  |       |      |1 5 9 D . . .         |   |4 5 6 7|
+        |       |  +=  |2 6 A E               |   |8 9 A B|
+        |_______|      |3_7_B_F_______________|   |C D E F|
+                                                  |   .   |
+            NR                    k               |   .   |
+                                                  |   .   |
+                                                  |       |
+                                                  |       |
+                                                  |_______|
+
+                                                      NR
+
+  Notes for implementors:
+
+  - MR and NR: bli_?mr and bli_?nr are the register blocksizes for the
+    datatype named by the '?' character.
+  - Packing dimensions: bli_?packmr and bli_?packnr usually equal bli_?mr
+    and bli_?nr, respectively. They differ only when the register
+    blocksize extensions are non-zero; see bli_config.h for details.
+  - Alignment: a and b may be assumed to be aligned according to
+    BLIS_CONTIG_STRIDE_ALIGN_SIZE, which is defined in bli_config.h.
+  - Accumulator: the local array ab holds the running a*b product. An
+    optimized micro-kernel keeps these values in registers rather than
+    in memory.
+  - Strides: in column-major storage (column storage) the "leading
+    dimension" of a matrix is its column stride and the row stride is
+    unit. In row-major storage (row storage) the "leading dimension" is
+    the row stride and the column stride is unit.
+  - Loops: all three loops are spelled out here. In optimized code the
+    loops over MR and NR typically vanish because they are fully
+    unrolled, which leaves only the loop over k.
+  - Unrolling: some optimized micro-kernels also need the loop over k to
+    be unrolled a few times (4x seems to be a common factor).
+  - Prefetching: a_next and b_next exist so that the next micro-panels
+    can be prefetched on architectures that support it. An
+    implementation may safely ignore them; this template does not use them.
+  - Zero beta: when beta == 0.0 (or 0.0 + 0.0i for complex) the
+    micro-kernel must NOT use beta explicitly, because C may contain
+    uninitialized memory (including NaNs). Detect this case and handle
+    it separately, preferably by overwriting C with the alpha*A*B
+    product. This template shows one way to do so.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	/* Register blocksizes, which fix the shape of ab and of the c block. */
+	const dim_t        m_reg   = bli_dmr;
+	const dim_t        n_reg   = bli_dnr;
+	const dim_t        n_acc   = m_reg * n_reg;
+
+	/* Step between columns of packed a, and between rows of packed b. */
+	const inc_t        a_step  = bli_dpackmr;
+	const inc_t        b_step  = bli_dpacknr;
+
+	/* ab is held in column storage: unit row stride, column stride MR. */
+	const inc_t        rs_acc  = 1;
+	const inc_t        cs_acc  = bli_dmr;
+
+	double             ab[ bli_dmr * bli_dnr ];
+	double*            ab_col;
+	double             a_val, b_val;
+	dim_t              p, col, row, e;
+
+
+	/* Start from a zeroed accumulator. */
+	for ( e = 0; e < n_acc; ++e )
+	{
+		bli_dset0s( ab[ e ] );
+	}
+
+	/* Accumulate k rank-1 updates: column p of a times row p of b. */
+	for ( p = 0; p < k; ++p )
+	{
+		/* These loops over NR and MR are the ones that an optimized
+		   implementation would typically unroll completely. */
+		for ( col = 0; col < n_reg; ++col )
+		{
+			b_val  = b[ col ];
+			ab_col = ab + col * cs_acc;
+
+			for ( row = 0; row < m_reg; ++row )
+			{
+				a_val = a[ row ];
+
+				bli_ddots( a_val, b_val, ab_col[ row * rs_acc ] );
+			}
+		}
+
+		/* Advance to the next column of a and the next row of b. */
+		a += a_step;
+		b += b_step;
+	}
+
+	/* ab := alpha * ab */
+	for ( e = 0; e < n_acc; ++e )
+	{
+		bli_dscals( *alpha, ab[ e ] );
+	}
+
+	/* A zero beta means c must be overwritten without ever being read,
+	   since it may hold uninitialized values (including NaNs). */
+	if ( bli_deq0( *beta ) )
+	{
+		/* c := ab */
+		bli_dcopys_mxn( m_reg, n_reg,
+		                ab, rs_acc, cs_acc,
+		                c,  rs_c,   cs_c );
+		return;
+	}
+
+	/* c := beta * c + ab */
+	bli_dxpbys_mxn( m_reg, n_reg,
+	                ab, rs_acc, cs_acc,
+	                beta,
+	                c,  rs_c,   cs_c );
+}
+
+
+
+void bli_cgemm_opt_mxn(
+                        dim_t              k,
+                        scomplex* restrict alpha,
+                        scomplex* restrict a,
+                        scomplex* restrict b,
+                        scomplex* restrict beta,
+                        scomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                        scomplex* restrict a_next,
+                        scomplex* restrict b_next
+                      )
+{
+	/* Template stub: no optimized code is written out here for the
+	   single-precision complex gemm micro-kernel, so every argument is
+	   forwarded, unchanged and in order, to bli_cgemm_ref_mxn. See
+	   bli_dgemm_opt_mxn in this file for a documented template. */
+	bli_cgemm_ref_mxn( k, alpha,
+	                   a, b,
+	                   beta,
+	                   c, rs_c, cs_c,
+	                   a_next, b_next );
+}
+
+
+
+void bli_zgemm_opt_mxn(
+                        dim_t              k,
+                        dcomplex* restrict alpha,
+                        dcomplex* restrict a,
+                        dcomplex* restrict b,
+                        dcomplex* restrict beta,
+                        dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                        dcomplex* restrict a_next,
+                        dcomplex* restrict b_next
+                      )
+{
+	/* Template stub: no optimized code is written out here for the
+	   double-precision complex gemm micro-kernel, so every argument is
+	   forwarded, unchanged and in order, to bli_zgemm_ref_mxn. See
+	   bli_dgemm_opt_mxn in this file for a documented template. */
+	bli_zgemm_ref_mxn( k, alpha,
+	                   a, b,
+	                   beta,
+	                   c, rs_c, cs_c,
+	                   a_next, b_next );
+}
+
--- a/config/template/kernels/3/bli_gemm_opt_mxn.h
+++ b/config/template/kernels/3/bli_gemm_opt_mxn.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the gemm micro-kernel interface. The parameters are described
+// in the comments of bli_dgemm_opt_mxn() in bli_gemm_opt_mxn.c. (Presumably
+// INSERT_GENTPROT_BASIC emits one prototype per s/d/c/z datatype; confirm.)
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t           k, \
+                           ctype* restrict alpha, \
+                           ctype* restrict a, \
+                           ctype* restrict b, \
+                           ctype* restrict beta, \
+                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a_next, \
+                           ctype* restrict b_next  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemm_opt_mxn )
+
--- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
@@ -0,0 +1,303 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sgemmtrsm_l_opt_mxn(
+                              dim_t              k,
+                              float*    restrict alpha,
+                              float*    restrict a10,
+                              float*    restrict a11,
+                              float*    restrict bd01,
+                              float*    restrict bd11,
+                              float*    restrict b11,
+                              float*    restrict c11, inc_t rs_c, inc_t cs_c,
+                              float*    restrict a_next,
+                              float*    restrict b_next
+                            )
+{
+	/* Strides handed to the gemm micro-kernel for b11: its rows are
+	   bli_spacknr elements apart and its columns are unit stride. */
+	const inc_t        b11_rs  = bli_spacknr;
+	const inc_t        b11_cs  = 1;
+	float*    restrict neg_one = bli_sm1;
+
+	/* gemm step, as documented in bli_dgemmtrsm_l_opt_mxn in this file. */
+	bli_sgemm_opt_mxn( k,
+	                   neg_one,
+	                   a10, bd01,
+	                   alpha,
+	                   b11, b11_rs, b11_cs,
+	                   a_next, b_next );
+
+	/* trsm step on the b11 block just updated by the gemm step; bd11
+	   and c11 are handed to the trsm micro-kernel alongside it. */
+	bli_strsm_l_opt_mxn( a11,
+	                     b11, bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_dgemmtrsm_l_opt_mxn(
+                              dim_t              k,
+                              double*   restrict alpha,
+                              double*   restrict a10,
+                              double*   restrict a11,
+                              double*   restrict bd01,
+                              double*   restrict bd11,
+                              double*   restrict b11,
+                              double*   restrict c11, inc_t rs_c, inc_t cs_c,
+                              double*   restrict a_next,
+                              double*   restrict b_next
+                            )
+{
+/*
+  Template gemmtrsm_l micro-kernel implementation (double-precision real)
+
+  A template implementation of the micro-kernel that fuses a gemm
+  subproblem with a lower-triangular trsm subproblem.
+
+  Operations performed, in this order:
+
+    B11 := alpha * B11 - A10 * B01    (gemm)
+    B11 := inv(A11) * B11             (trsm)
+
+  Here B11 is MR x NR, A10 is MR x k, B01 is k x NR, A11 is MR x MR and
+  lower triangular, alpha is a scalar, and inv() denotes the matrix
+  inverse.
+
+  NOTE: This gemmtrsm micro-kernel supports element "duplication", a
+  feature that is switched on or off in bli_kernel.h, which is also
+  where the duplication factors are defined. Duplication is NOT commonly
+  used, and most developers may assume that it is disabled.
+
+  Parameters:
+
+  - k:      Number of columns of A10, which is also the number of rows of
+            B01.
+  - alpha:  Address of the scalar applied to B11.
+  - a10:    Address of A10, the MR x k subpartition of the packed
+            (column-stored) micro-panel of A that lies to the left of the
+            MR x MR lower triangular block.
+  - a11:    Address of A11, the MR x MR lower triangular block within the
+            packed micro-panel of A that lies to the right of A10. When
+            this gemmtrsm kernel is called, the diagonal of A11 has
+            already been inverted and the strictly upper triangle
+            contains zeros.
+  - bd01:   Address of B01, the k x NR subpartition that lies above the
+            current MR x NR block B11. bd01 is row-stored. With
+            duplication enabled, each element occurs d times, which
+            effectively increases the dimension to k x d*NR. With
+            duplication disabled, bd01 is simply the address of the top
+            part of the current packed (row-stored) micro-panel of B
+            (labeled b01 in the diagram below).
+  - bd11:   Address of B11, the MR x NR subpartition that lies below B01.
+            With duplication enabled, each element occurs d times, which
+            effectively increases the dimension to MR x d*NR. With
+            duplication disabled, bd11 is simply the address of the
+            current MR x NR block within the packed (row-stored)
+            micro-panel of B.
+  - b11:    Address of the current MR x NR block within the packed
+            micro-panel of B. The same block exists in duplicated form as
+            bd11. With duplication disabled, b11 and bd11 refer to the
+            same MR x NR block within the packed (row-stored) micro-panel
+            of B.
+  - c11:    Address of C11, the MR x NR block of the output matrix (ie:
+            the matrix that the user passed to the highest-level trsm API
+            call). C11 corresponds to the elements held in packed form in
+            B11, and is stored according to rs_c and cs_c.
+  - rs_c:   Row stride of C11, ie: the distance to the next row of C11,
+            measured in matrix elements.
+  - cs_c:   Column stride of C11, ie: the distance to the next column of
+            C11, measured in matrix elements.
+  - a_next: Address of the packed micro-panel of A that the next call to
+            the gemmtrsm micro-kernel will use.
+  - b_next: Address of the packed micro-panel of B that the next call to
+            the gemmtrsm micro-kernel will use.
+
+  Storage of the packed operands, illustrated for MR == NR == 4. The hex
+  digits give the order of the elements in memory. Also drawn is a B
+  duplication buffer (bd) that holds a copy of the packed micro-panel of
+  B with a duplication factor of 2. With duplication disabled (as is
+  commonly the case), bd01 == b01 and bd11 == b11.
+
+                                             NR                 2*NR
+    NOTE: If duplication is disabled       _______         _______________
+    then bd01 and bd11 simply refer   b01:|0 1 2 3|  bd01:|0 0 1 1 2 2 3 3|
+    to b01 and b11, respectively.         |4 5 6 7|       |4 4 5 5 6 6 7 7|
+                                          |8 9 A B|       |8 8 9 9 A A B B|
+                                          |C D E F|       |C C D D E E F F|
+                                        k |   .   |       |       .       |
+                                          |   .   |       |       .       |
+       a10:                a11:           |   .   |       |       .       |
+       ___________________  _______       |_______|       |_______________|
+      |0 4 8 C            |`.      |  b11:|       |  bd11:|               |
+  MR  |1 5 9 D . . .      |  `.    |      |       |       |               |
+      |2 6 A E            |    `.  |   MR |       |       |               |
+      |3_7_B_F____________|______`.|      |_______|       |_______________|
+
+                k             MR
+
+  With duplication enabled, the operation takes the form of:
+
+    b11  = alpha * b11 - a10 * bd01;
+    b11  = inv(a11) * b11;
+    bd11 = b11;  (skipped if duplication is disabled)
+    c11  = b11;
+
+  With duplication disabled, the operation reduces to:
+
+    b11 = alpha * b11 - a10 * b01;  (Note: Here, b01 == bd01.)
+    b11 = inv(a11) * b11;
+    c11 = b11;
+
+  A note on optimization:
+  - This implementation calls the gemm micro-kernel followed by the trsm
+    micro-kernel. Assuming that the gemm micro-kernel has already been
+    optimized, there are two ways to optimize the fused gemmtrsm kernel:
+    (1) Optimize only the trsm kernel, and keep calling the gemm and trsm
+        micro-kernels in sequence as this template implementation does.
+    (2) Fuse the gemm micro-kernel with the trsm micro-kernel by inlining
+        both implementations into this gemmtrsm function.
+    Option (2) takes more work, but it is also more likely to yield
+    higher performance because it lets you eliminate redundant memory
+    operations on the packed MR x NR block B11.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	/* b11 is addressed inside the packed, row-stored micro-panel of B:
+	   its rows are bli_dpacknr elements apart and its columns are unit
+	   stride. */
+	const inc_t        b11_rs  = bli_dpacknr;
+	const inc_t        b11_cs  = 1;
+
+	double*   restrict neg_one = bli_dm1;
+
+	/* Reminder: if duplication is disabled, then bd01 == b01 and
+	   bd11 == b11. */
+
+	/* gemm: b11 = alpha * b11 - a10 * bd01; */
+	bli_dgemm_opt_mxn( k,
+	                   neg_one,
+	                   a10, bd01,
+	                   alpha,
+	                   b11, b11_rs, b11_cs,
+	                   a_next, b_next );
+
+	/* trsm: b11  = inv(a11) * b11;
+	         bd11 = b11; (skipped if duplication is disabled)
+	         c11  = b11; */
+	bli_dtrsm_l_opt_mxn( a11,
+	                     b11, bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_cgemmtrsm_l_opt_mxn(
+                              dim_t              k,
+                              scomplex* restrict alpha,
+                              scomplex* restrict a10,
+                              scomplex* restrict a11,
+                              scomplex* restrict bd01,
+                              scomplex* restrict bd11,
+                              scomplex* restrict b11,
+                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
+                              scomplex* restrict a_next,
+                              scomplex* restrict b_next
+                            )
+{
+	/* Strides handed to the gemm micro-kernel for b11: its rows are
+	   bli_cpacknr elements apart and its columns are unit stride. */
+	const inc_t        b11_rs  = bli_cpacknr;
+	const inc_t        b11_cs  = 1;
+	scomplex* restrict neg_one = bli_cm1;
+
+	/* gemm step, as documented in bli_dgemmtrsm_l_opt_mxn in this file. */
+	bli_cgemm_opt_mxn( k,
+	                   neg_one,
+	                   a10, bd01,
+	                   alpha,
+	                   b11, b11_rs, b11_cs,
+	                   a_next, b_next );
+
+	/* trsm step on the b11 block just updated by the gemm step; bd11
+	   and c11 are handed to the trsm micro-kernel alongside it. */
+	bli_ctrsm_l_opt_mxn( a11,
+	                     b11, bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_zgemmtrsm_l_opt_mxn(
+                              dim_t              k,
+                              dcomplex* restrict alpha,
+                              dcomplex* restrict a10,
+                              dcomplex* restrict a11,
+                              dcomplex* restrict bd01,
+                              dcomplex* restrict bd11,
+                              dcomplex* restrict b11,
+                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
+                              dcomplex* restrict a_next,
+                              dcomplex* restrict b_next
+                            )
+{
+	/* Strides handed to the gemm micro-kernel for b11: its rows are
+	   bli_zpacknr elements apart and its columns are unit stride. */
+	const inc_t        b11_rs  = bli_zpacknr;
+	const inc_t        b11_cs  = 1;
+	dcomplex* restrict neg_one = bli_zm1;
+
+	/* gemm step, as documented in bli_dgemmtrsm_l_opt_mxn in this file. */
+	bli_zgemm_opt_mxn( k,
+	                   neg_one,
+	                   a10, bd01,
+	                   alpha,
+	                   b11, b11_rs, b11_cs,
+	                   a_next, b_next );
+
+	/* trsm step on the b11 block just updated by the gemm step; bd11
+	   and c11 are handed to the trsm micro-kernel alongside it. */
+	bli_ztrsm_l_opt_mxn( a11,
+	                     b11, bd11,
+	                     c11, rs_c, cs_c );
+}
+
--- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
+++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the gemmtrsm_l micro-kernel interface. The parameters are
+// described in the comments of bli_dgemmtrsm_l_opt_mxn() in the matching .c
+// file. (Presumably one prototype is emitted per s/d/c/z datatype; confirm.)
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t           k, \
+                           ctype* restrict alpha, \
+                           ctype* restrict a10, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd01, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a_next, \
+                           ctype* restrict b_next  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn )
+
--- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
@@ -0,0 +1,302 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_sgemmtrsm_u_opt_mxn(
+                              dim_t              k,
+                              float*    restrict alpha,
+                              float*    restrict a12,
+                              float*    restrict a11,
+                              float*    restrict bd21,
+                              float*    restrict bd11,
+                              float*    restrict b11,
+                              float*    restrict c11, inc_t rs_c, inc_t cs_c,
+                              float*    restrict a_next,
+                              float*    restrict b_next 
+                            )
+{
+	const inc_t        rs_b      = bli_spacknr;
+	const inc_t        cs_b      = 1;
+
+	float*    restrict minus_one = bli_sm1;
+
+	/* Single-precision real variant; the parameters are documented at length
+	   in bli_dgemmtrsm_u_opt_mxn(). Step 1: b11 = alpha * b11 - a12 * bd21. */
+	bli_sgemm_opt_mxn( k,
+	                   minus_one,
+	                   a12,
+	                   bd21,
+	                   alpha,
+	                   b11, rs_b, cs_b,
+	                   a_next,
+	                   b_next );
+	/* Step 2: b11 = inv(a11) * b11; the trsm kernel also receives bd11, c11. */
+	bli_strsm_u_opt_mxn( a11,
+	                     b11,
+	                     bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_dgemmtrsm_u_opt_mxn(
+                              dim_t              k,
+                              double*   restrict alpha,
+                              double*   restrict a12,
+                              double*   restrict a11,
+                              double*   restrict bd21,
+                              double*   restrict bd11,
+                              double*   restrict b11,
+                              double*   restrict c11, inc_t rs_c, inc_t cs_c,
+                              double*   restrict a_next,
+                              double*   restrict b_next 
+                            )
+{
+/*
+  Template gemmtrsm_u micro-kernel implementation
+
+  This function contains a template implementation for a double-precision
+  real micro-kernel that fuses a gemm with a trsm_u subproblem.
+
+  This micro-kernel implements the following sequence of operations:
+
+    B11 := alpha * B11 - A12 * B21    (gemm)
+    B11 := inv(A11) * B11             (trsm)
+
+  where B11 is MR x NR, A12 is MR x k, B21 is k x NR, A11 is MR x MR and
+  upper triangular, and alpha is a scalar. Here, inv() denotes matrix
+  inverse.
+
+  NOTE: Here, this gemmtrsm micro-kernel supports element "duplication", a
+  feature that is enabled or disabled in bli_kernel.h. Duplication factors
+  are also defined in the aforementioned header. Duplication is NOT
+  commonly used and most developers may assume it is disabled.
+
+  Parameters:
+
+  - k:      The number of columns of A12 and rows of B21.
+  - alpha:  The address of a scalar to be applied to B11.
+  - a12:    The address of A12, which is the MR x k subpartition of the
+            packed (column-stored) micro-panel of A that is situated to the
+            right of the MR x MR upper triangular block.
+  - a11:    The address of A11, which is the MR x MR upper triangular block
+            within the packed micro-panel of A that is situated to the
+            left of A12. By the time this gemmtrsm kernel is called, the
+            diagonal of A11 has already been inverted and the strictly lower
+            triangle contains zeros.
+  - bd21:   The address of B21, which is the k x NR subpartition situated
+            below the current MR x NR block B11. bd21 is row-stored. If
+            duplication is enabled, then each element occurs d times,
+            effectively increasing the dimension to k x d*NR. If duplication
+            is disabled, then bd21 is simply the address of the bottom part of
+            the current packed (row-stored) micro-panel of B (labeled b21
+            in the diagram below).
+  - bd11:   The address of B11, which is the MR x NR subpartition situated
+            above B21. If duplication is enabled, then each element occurs
+            d times, effectively increasing the dimension to MR x d*NR. If
+            duplication is disabled, then bd11 is simply the address of the
+            current MR x NR block within the packed (row-stored) micro-panel
+            of B.
+  - b11:    The address of the current MR x NR block within the packed
+            micro-panel of B. It exists in duplicated form as bd11. If
+            duplication is disabled, then b11 and bd11 refer to the same
+            MR x NR block within the packed (row-stored) micro-panel of B.
+  - c11:    The address of C11, which is the MR x NR block of the output
+            matrix (ie: the matrix provided by the user to the highest-level
+            trsm API call). C11 corresponds to the elements that exist in
+            packed form in B11, and is stored according to rs_c and cs_c.
+  - rs_c:   The row stride of C11 (ie: the distance to the next row of C11,
+            in units of matrix elements).
+  - cs_c:   The column stride of C11 (ie: the distance to the next column of
+            C11, in units of matrix elements).
+  - a_next: The address of the packed micro-panel of A that will be used the
+            next time the gemmtrsm micro-kernel will be called.
+  - b_next: The address of the packed micro-panel of B that will be used the
+            next time the gemmtrsm micro-kernel will be called.
+
+  The diagram below shows the packed micro-panel operands and how elements
+  of each would be stored when MR == NR == 4. (The hex digits indicate the
+  order of the elements in memory.) We also show a B duplication buffer (bd)
+  that contains a copy of the packed micro-panel of B with a duplication
+  factor of 2. If duplication is disabled (as is commonly the case), then
+  bd21 == b21 and bd11 == b11.
+
+       a11:     a12:                         NR                 2*NR        
+       ________ ___________________        _______         _______________  
+      |`.      |0 4 8              |  b11:|0 1 2 3|  bd11:|0 0 1 1 2 2 3 3| 
+  MR  |  `.    |1 5 9 . . .        |      |4 5 6 7|       |4 4 5 5 6 6 7 7| 
+      |    `.  |2 6 A              |   MR |8 9 A B|       |8 8 9 9 A A B B| 
+      |______`.|3_7_B______________|      |___.___|       |_______._______| 
+                                      b21:|   .   |  bd21:|       .       | 
+          MR             k                |   .   |       |       .       | 
+                                          |       |       |               | 
+    NOTE: If duplication is disabled      |       |       |               | 
+    then bd21 and bd11 simply refer     k |       |       |               | 
+    to b21 and b11, respectively.         |       |       |               | 
+    ALSO: Storage digits are shown        |       |       |               | 
+    starting with a12 to avoid            |_______|       |_______________| 
+    obscuring triangular structure of                                       
+    a11.                                                                    
+                                                                            
+  Thus, with duplication enabled, the operation takes the form of:
+
+    b11  = alpha * b11 - a12 * bd21;
+    b11  = inv(a11) * b11;
+    bd11 = b11;  (skipped if duplication is disabled)
+    c11  = b11;
+
+  And if duplication is disabled, the operation reduces to:
+
+    b11 = alpha * b11 - a12 * b21;  (Note: Here, b21 == bd21.)
+    b11 = inv(a11) * b11;
+    c11 = b11;
+
+  A note on optimization:
+  - This implementation simply calls the gemm micro-kernel and then the
+    trsm micro-kernel. Let's assume that the gemm micro-kernel has already
+    been optimized. You have two options with regards to optimizing the
+    fused gemmtrsm kernel.
+    (1) Optimize only the trsm kernel and continue to call the gemm and
+        trsm micro-kernels in sequence, as is done in this template
+        implementation.
+    (2) Fuse the implementation of the gemm micro-kernel with that of the
+        trsm micro-kernel by inlining both into this gemmtrsm function.
+    The latter option is more labor-intensive, but also more likely to
+    yield higher performance because it allows you to eliminate redundant
+    memory operations on the packed MR x NR block B11.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+*/
+	const inc_t        rs_b      = bli_dpacknr;
+	const inc_t        cs_b      = 1;
+
+	double*   restrict minus_one = bli_dm1;
+
+	/* Reminder: if duplication is disabled, then bd21 == b21, bd11 == b11. */
+
+	/* b11 = alpha * b11 - a12 * bd21; */
+	bli_dgemm_opt_mxn( k,
+	                   minus_one,
+	                   a12,
+	                   bd21,
+	                   alpha,
+	                   b11, rs_b, cs_b,
+	                   a_next,
+	                   b_next );
+
+	/* b11  = inv(a11) * b11;
+	   bd11 = b11; (skipped if duplication is disabled)
+	   c11  = b11; */
+	bli_dtrsm_u_opt_mxn( a11,
+	                     b11,
+	                     bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_cgemmtrsm_u_opt_mxn(
+                              dim_t              k,
+                              scomplex* restrict alpha,
+                              scomplex* restrict a12,
+                              scomplex* restrict a11,
+                              scomplex* restrict bd21,
+                              scomplex* restrict bd11,
+                              scomplex* restrict b11,
+                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
+                              scomplex* restrict a_next,
+                              scomplex* restrict b_next 
+                            )
+{
+	const inc_t        rs_b      = bli_cpacknr;
+	const inc_t        cs_b      = 1;
+
+	scomplex* restrict minus_one = bli_cm1;
+
+	/* Single-precision complex variant; the parameters are documented at
+	   length in bli_dgemmtrsm_u_opt_mxn(). Step 1: b11 = alpha*b11 - a12*bd21. */
+	bli_cgemm_opt_mxn( k,
+	                   minus_one,
+	                   a12,
+	                   bd21,
+	                   alpha,
+	                   b11, rs_b, cs_b,
+	                   a_next,
+	                   b_next );
+	/* Step 2: b11 = inv(a11) * b11; the trsm kernel also receives bd11, c11. */
+	bli_ctrsm_u_opt_mxn( a11,
+	                     b11,
+	                     bd11,
+	                     c11, rs_c, cs_c );
+}
+
+
+
+void bli_zgemmtrsm_u_opt_mxn(
+                              dim_t              k,
+                              dcomplex* restrict alpha,
+                              dcomplex* restrict a12,
+                              dcomplex* restrict a11,
+                              dcomplex* restrict bd21,
+                              dcomplex* restrict bd11,
+                              dcomplex* restrict b11,
+                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
+                              dcomplex* restrict a_next,
+                              dcomplex* restrict b_next 
+                            )
+{
+	const inc_t        rs_b      = bli_zpacknr;
+	const inc_t        cs_b      = 1;
+
+	dcomplex* restrict minus_one = bli_zm1;
+
+	/* Double-precision complex variant; the parameters are documented at
+	   length in bli_dgemmtrsm_u_opt_mxn(). Step 1: b11 = alpha*b11 - a12*bd21. */
+	bli_zgemm_opt_mxn( k,
+	                   minus_one,
+	                   a12,
+	                   bd21,
+	                   alpha,
+	                   b11, rs_b, cs_b,
+	                   a_next,
+	                   b_next );
+	/* Step 2: b11 = inv(a11) * b11; the trsm kernel also receives bd11, c11. */
+	bli_ztrsm_u_opt_mxn( a11,
+	                     b11,
+	                     bd11,
+	                     c11, rs_c, cs_c );
+}
+
--- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
+++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the gemmtrsm_u micro-kernel interface. GENTPROT is redefined here
+// to emit one declaration per (ctype, ch) pair; INSERT_GENTPROT_BASIC then
+// instantiates it (presumably for s, d, c, z -- confirm in the macro headers).
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t           k, \
+                           ctype* restrict alpha, \
+                           ctype* restrict a12, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd21, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a_next, \
+                           ctype* restrict b_next  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_mxn )
+
--- a/config/template/kernels/3/bli_trsm_l_opt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.c
@@ -0,0 +1,218 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_strsm_l_opt_mxn(
+                          float*    restrict a,
+                          float*    restrict b,
+                          float*    restrict bd,
+                          float*    restrict c, inc_t rs_c, inc_t cs_c 
+                        )
+{
+	/* Template stub: pass every argument straight to the reference kernel. */
+	bli_strsm_l_ref_mxn( a,
+	                     b,
+	                     bd,
+	                     c, rs_c, cs_c );
+}
+
+
+
+void bli_dtrsm_l_opt_mxn(
+                          double*   restrict a,
+                          double*   restrict b,
+                          double*   restrict bd,
+                          double*   restrict c, inc_t rs_c, inc_t cs_c 
+                        )
+{
+/*
+  Template trsm_l micro-kernel implementation
+
+  This function contains a template implementation for a double-precision
+  real trsm micro-kernel, coded in C, which can serve as the starting point
+  for one to write an optimized micro-kernel on an arbitrary architecture.
+  (We show a template implementation for only double-precision real because
+  the templates for the other three floating-point types would be nearly
+  identical.)
+
+  This micro-kernel performs a triangular solve with NR right-hand sides:
+
+    C := inv(A) * B
+
+  where A is MR x MR and lower triangular, B is MR x NR, and C is MR x NR.
+
+  NOTE: Here, this trsm micro-kernel supports element "duplication", a
+  feature that is enabled or disabled in bli_kernel.h. Duplication factors
+  are also defined in the aforementioned header. Duplication is NOT
+  commonly used and most developers may assume it is disabled.
+
+  Parameters:
+
+  - a:      The address of A, which is the MR x MR lower triangular block
+            within the packed (column-stored) micro-panel of A. By the time
+            this trsm micro-kernel is called, the diagonal of A has already
+            been inverted and the strictly upper triangle contains zeros.
+  - b:      The address of B, which is the MR x NR subpartition of the
+            current packed (row-stored) micro-panel of B.
+  - bd:     The address of the duplicated copy of B. If duplication is
+            disabled, then bd == b. This template never reads or writes bd.
+  - c:      The address of C, which is the MR x NR block of the output
+            matrix (ie: the matrix provided by the user to the highest-level
+            trsm API call). C corresponds to the elements that exist in
+            packed form in B, and is stored according to rs_c and cs_c.
+  - rs_c:   The row stride of C (ie: the distance to the next row of C,
+            in units of matrix elements).
+  - cs_c:   The column stride of C (ie: the distance to the next column of
+            C, in units of matrix elements).
+
+  Please see the comments in bli_gemmtrsm_l_opt_mxn.c for a diagram of the
+  trsm operation and where it fits in with the preceding gemm subproblem.
+
+  Here are a few things to consider:
+  - While all three loops are exposed in this template micro-kernel, all
+    three loops typically disappear in an optimized code because they are
+    fully unrolled.
+  - Note that the diagonal of the triangular matrix A contains the INVERSE
+    of those elements. This is done during packing so that we can avoid
+    expensive division instructions within this micro-kernel.
+  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
+    then the result must be written to three places: the sub-block within the
+    duplicated copy of B, the sub-block of the original packed micro-panel of
+    B, and the sub-block of the output matrix C. When duplication is not
+    used, the micro-kernel should update only the latter two locations.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t        m     = bli_dmr;
+	const dim_t        n     = bli_dnr;
+
+	const inc_t        rs_a  = 1;
+	const inc_t        cs_a  = bli_dpackmr;
+
+	const inc_t        rs_b  = bli_dpacknr;
+	const inc_t        cs_b  = 1;
+
+	dim_t              iter, i, j, l;
+	dim_t              n_behind;
+
+	double*   restrict alpha11;
+	double*   restrict a10t;
+	double*   restrict alpha10;
+	double*   restrict X0;
+	double*   restrict x1;
+	double*   restrict x01;
+	double*   restrict chi01;
+	double*   restrict chi11;
+	double*   restrict gamma11;
+	double             rho11;
+	/* Forward substitution: row i uses the i rows of B solved before it. */
+	for ( iter = 0; iter < m; ++iter )
+	{
+		i        = iter;
+		n_behind = i;
+		alpha11  = a + (i  )*rs_a + (i  )*cs_a;
+		a10t     = a + (i  )*rs_a + (0  )*cs_a;
+		X0       = b + (0  )*rs_b + (0  )*cs_b;
+		x1       = b + (i  )*rs_b + (0  )*cs_b;
+
+		/* x1 = x1 - a10t * X0; */
+		/* x1 = x1 / alpha11; */
+		for ( j = 0; j < n; ++j )
+		{
+			x01     = X0 + (0  )*rs_b + (j  )*cs_b;
+			chi11   = x1 + (0  )*rs_b + (j  )*cs_b;
+			gamma11 = c  + (i  )*rs_c + (j  )*cs_c;
+
+			/* chi11 = chi11 - a10t * x01; */
+			bli_dset0s( rho11 );
+			for ( l = 0; l < n_behind; ++l )
+			{
+				alpha10 = a10t + (l  )*cs_a;
+				chi01   = x01  + (l  )*rs_b;
+
+				bli_daxpys( *alpha10, *chi01, rho11 );
+			}
+			bli_dsubs( rho11, *chi11 );
+
+			/* chi11 = chi11 / alpha11; */
+			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
+			   of alpha11, so we can multiply rather than divide. We store 
+			   the inverse of alpha11 intentionally to avoid expensive
+			   division instructions within the micro-kernel. */
+			bli_dscals( *alpha11, *chi11 );
+
+			/* Output final result to matrix C. */
+			bli_dcopys( *chi11, *gamma11 );
+		}
+	}
+}
+
+
+
+void bli_ctrsm_l_opt_mxn(
+                          scomplex* restrict a,
+                          scomplex* restrict b,
+                          scomplex* restrict bd,
+                          scomplex* restrict c, inc_t rs_c, inc_t cs_c 
+                        )
+{
+	/* Template stub: pass every argument straight to the reference kernel. */
+	bli_ctrsm_l_ref_mxn( a,
+	                     b,
+	                     bd,
+	                     c, rs_c, cs_c );
+}
+
+
+
+void bli_ztrsm_l_opt_mxn(
+                          dcomplex* restrict a,
+                          dcomplex* restrict b,
+                          dcomplex* restrict bd,
+                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c 
+                        )
+{
+	/* Template stub: pass every argument straight to the reference kernel. */
+	bli_ztrsm_l_ref_mxn( a,
+	                     b,
+	                     bd,
+	                     c, rs_c, cs_c );
+}
+
--- a/config/template/kernels/3/bli_trsm_l_opt_mxn.h
+++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.h
@@ -0,0 +1,50 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+// Prototype the trsm_l micro-kernel interface. GENTPROT is redefined here to
+// emit one declaration per (ctype, ch) pair; INSERT_GENTPROT_BASIC then
+// instantiates it (presumably for s, d, c, z -- confirm in the macro headers).
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           ctype* restrict a, \
+                           ctype* restrict b, \
+                           ctype* restrict bd, \
+                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
+                         );
+
+INSERT_GENTPROT_BASIC( trsm_l_opt_mxn )
+
--- a/config/template/kernels/3/bli_trsm_u_opt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.c
@@ -0,0 +1,218 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+
+void bli_strsm_u_opt_mxn(
+                          float*    restrict a,
+                          float*    restrict b,
+                          float*    restrict bd,
+                          float*    restrict c, inc_t rs_c, inc_t cs_c
+                        )
+{
+    /* Template stub: pass every argument straight to the reference kernel. */
+    bli_strsm_u_ref_mxn( a,
+                         b,
+                         bd,
+                         c, rs_c, cs_c );
+}
+
+
+
+void bli_dtrsm_u_opt_mxn(
+                          double*   restrict a,
+                          double*   restrict b,
+                          double*   restrict bd,
+                          double*   restrict c, inc_t rs_c, inc_t cs_c
+                        )
+{
+/*
+  Template trsm_u micro-kernel implementation
+
+  This function contains a template implementation for a double-precision
+  real trsm micro-kernel, coded in C, which can serve as the starting point
+  for one to write an optimized micro-kernel on an arbitrary architecture.
+  (We show a template implementation for only double-precision real because
+  the templates for the other three floating-point types would be nearly
+  identical.)
+
+  This micro-kernel performs a triangular solve with NR right-hand sides:
+
+    C := inv(A) * B
+
+  where A is MR x MR and upper triangular, B is MR x NR, and C is MR x NR.
+
+  NOTE: Here, this trsm micro-kernel supports element "duplication", a
+  feature that is enabled or disabled in bli_kernel.h. Duplication factors
+  are also defined in the aforementioned header. Duplication is NOT
+  commonly used and most developers may assume it is disabled.
+
+  Parameters:
+
+  - a:      The address of A, which is the MR x MR upper triangular block
+            within the packed (column-stored) micro-panel of A. By the time
+            this trsm micro-kernel is called, the diagonal of A has already
+            been inverted and the strictly lower triangle contains zeros.
+  - b:      The address of B, which is the MR x NR subpartition of the
+            current packed (row-stored) micro-panel of B.
+  - bd:     The address of the duplicated copy of B. If duplication is
+            disabled, then bd == b. This template never reads or writes bd.
+  - c:      The address of C, which is the MR x NR block of the output
+            matrix (ie: the matrix provided by the user to the highest-level
+            trsm API call). C corresponds to the elements that exist in
+            packed form in B, and is stored according to rs_c and cs_c.
+  - rs_c:   The row stride of C (ie: the distance to the next row of C,
+            in units of matrix elements).
+  - cs_c:   The column stride of C (ie: the distance to the next column of
+            C, in units of matrix elements).
+
+  Please see the comments in bli_gemmtrsm_u_opt_mxn.c for a diagram of the
+  trsm operation and where it fits in with the preceding gemm subproblem.
+
+  Here are a few things to consider:
+  - While all three loops are exposed in this template micro-kernel, all
+    three loops typically disappear in an optimized code because they are
+    fully unrolled.
+  - Note that the diagonal of the triangular matrix A contains the INVERSE
+    of those elements. This is done during packing so that we can avoid
+    expensive division instructions within this micro-kernel.
+  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
+    then the result must be written to three places: the sub-block within the
+    duplicated copy of B, the sub-block of the original packed micro-panel of
+    B, and the sub-block of the output matrix C. When duplication is not
+    used, the micro-kernel should update only the latter two locations.
+
+  For more info, please refer to the BLIS website and/or contact the
+  blis-devel mailing list.
+
+  -FGVZ
+*/
+	const dim_t        m     = bli_dmr;
+	const dim_t        n     = bli_dnr;
+
+	const inc_t        rs_a  = 1;
+	const inc_t        cs_a  = bli_dpackmr;
+
+	const inc_t        rs_b  = bli_dpacknr;
+	const inc_t        cs_b  = 1;
+
+	dim_t              iter, i, j, l;
+	dim_t              n_behind;
+
+	double*   restrict alpha11;
+	double*   restrict a12t;
+	double*   restrict alpha12;
+	double*   restrict X2;
+	double*   restrict x1;
+	double*   restrict x21;
+	double*   restrict chi21;
+	double*   restrict chi11;
+	double*   restrict gamma11;
+	double             rho11;
+	/* Back substitution: rows are visited bottom-up, i = m-1 down to 0. */
+	for ( iter = 0; iter < m; ++iter )
+	{
+		i        = m - iter - 1;
+		n_behind = iter;
+		alpha11  = a + (i  )*rs_a + (i  )*cs_a;
+		a12t     = a + (i  )*rs_a + (i+1)*cs_a;
+		x1       = b + (i  )*rs_b + (0  )*cs_b;
+		X2       = b + (i+1)*rs_b + (0  )*cs_b;
+		/* On the first pass n_behind == 0, so a12t and X2 are never read. */
+		/* x1 = x1 - a12t * X2; */
+		/* x1 = x1 / alpha11; */
+		for ( j = 0; j < n; ++j )
+		{
+			chi11   = x1 + (0  )*rs_b + (j  )*cs_b;
+			x21     = X2 + (0  )*rs_b + (j  )*cs_b;
+			gamma11 = c  + (i  )*rs_c + (j  )*cs_c;
+
+			/* chi11 = chi11 - a12t * x21; */
+			bli_dset0s( rho11 );
+			for ( l = 0; l < n_behind; ++l )
+			{
+				alpha12 = a12t + (l  )*cs_a;
+				chi21   = x21  + (l  )*rs_b;
+
+				bli_daxpys( *alpha12, *chi21, rho11 );
+			}
+			bli_dsubs( rho11, *chi11 );
+
+			/* chi11 = chi11 / alpha11; */
+			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
+			   of alpha11, so we can multiply rather than divide. We store 
+			   the inverse of alpha11 intentionally to avoid expensive
+			   division instructions within the micro-kernel. */
+			bli_dscals( *alpha11, *chi11 );
+
+			/* Output final result to matrix C. */
+			bli_dcopys( *chi11, *gamma11 );
+		}
+	}
+}
+
+
+
+void bli_ctrsm_u_opt_mxn(
+                          scomplex* restrict a,
+                          scomplex* restrict b,
+                          scomplex* restrict bd,
+                          scomplex* restrict c, inc_t rs_c, inc_t cs_c
+                        )
+{
+    /* Template stub: pass every argument straight to the reference kernel. */
+    bli_ctrsm_u_ref_mxn( a,
+                         b,
+                         bd,
+                         c, rs_c, cs_c );
+}
+
+
+
+void bli_ztrsm_u_opt_mxn(
+                          dcomplex* restrict a,
+                          dcomplex* restrict b,
+                          dcomplex* restrict bd,
+                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c
+                        )
+{
+    /* Template stub: pass every argument straight to the reference kernel. */
+    bli_ztrsm_u_ref_mxn( a,
+                         b,
+                         bd,
+                         c, rs_c, cs_c );
+}
+
--- a/config/template/kernels/3/bli_trsm_u_opt_mxn.h
+++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.h
@@ -0,0 +1,50 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2013, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype the trsm_u_opt_mxn micro-kernel (a, b, bd, and c with strides).
+//
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           ctype* restrict a, \
+                           ctype* restrict b, \
+                           ctype* restrict bd, \
+                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
+                         );
+
+INSERT_GENTPROT_BASIC( trsm_u_opt_mxn )
+
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -0,0 +1,107 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2013, The University of Texas
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
+BLIS_ENABLE_STATIC_BUILD        := yes
+BLIS_ENABLE_DYNAMIC_BUILD       := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH         := /bin/sh
+MV         := mv
+MKDIR      := mkdir -p
+RM_F       := rm -f
+RM_RF      := rm -rf
+SYMLINK    := ln -sf
+FIND       := find
+XARGS      := xargs
+RANLIB     := ranlib
+INSTALL    := install -c
+
+# Used to refresh CHANGELOG.
+GIT        := git
+GIT_LOG    := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC             := gcc
+# Request POSIX.1-2001 (IEEE Std 1003.1-2001) definitions from the C library.
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CDBGFLAGS      := -g
+CWARNFLAGS     := -Wall
+COPTFLAGS      := -O2
+CKOPTFLAGS     := $(COPTFLAGS)
+CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR             := ar
+ARFLAGS        := cru
+
+# --- Determine the linker and related flags ---
+LINKER         := $(CC)
+LDFLAGS        := 
+
+
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif
--- a/frame/1f/axpy2v/bli_axpy2v_unb_var1.c
+++ b/frame/1f/axpy2v/bli_axpy2v_unb_var1.c
@@ -34,75 +34,9 @@

 #include "blis.h"

-/*
-#define FUNCPTR_T axpy2v_fp
-
-typedef void (*FUNCPTR_T)(
-                           conj_t conjx,
-                           conj_t conjy,
-                           dim_t  n,
-                           void*  alpha,
-                           void*  x, inc_t incx,
-                           void*  y, inc_t incy
-                         );
-
-// If some mixed datatype functions will not be compiled, we initialize
-// the corresponding elements of the function array to NULL.
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_unb_var1);
-#else
-#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_unb_var1);
-#else
-static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_unb_var1);
-#endif
-#endif
-
-
-void bli_axpy2v_unb_var1( obj_t*  alpha,
-                         obj_t*  x,
-                         obj_t*  y )
-{
-	num_t     dt_x      = bli_obj_datatype( *x );
-	num_t     dt_y      = bli_obj_datatype( *y );
-
-	conj_t    conjx     = bli_obj_conj_status( *x );
-	conj_t    conjy     = bli_obj_conj_status( *y );
-	dim_t     n         = bli_obj_vector_dim( *x );
-
-	inc_t     inc_x     = bli_obj_vector_inc( *x );
-	void*     buf_x     = bli_obj_buffer_at_off( *x );
-
-	inc_t     inc_y     = bli_obj_vector_inc( *y );
-	void*     buf_y     = bli_obj_buffer_at_off( *y );
-
-	num_t     dt_alpha;
-	void*     buf_alpha;
-
-	FUNCPTR_T f;
-
-	// If alpha is a scalar constant, use dt_x to extract the address of the
-	// corresponding constant value; otherwise, use the datatype encoded
-	// within the alpha object and extract the buffer at the alpha offset.
-	bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_alpha][dt_x][dt_y];
-
-	// Invoke the function.
-	f( conjx,
-	   conjy,
-	   n,
-	   buf_alpha,
-	   buf_x, inc_x,
-	   buf_y, inc_y );
-}
-*/
-

 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
+#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
 \
 void PASTEMAC3(chx,chy,chz,varname)( \
                                     conj_t conjx, \
@@ -121,27 +55,27 @@ void PASTEMAC3(chx,chy,chz,varname)( \
 	ctype_y*  y_cast      = y; \
 	ctype_z*  z_cast      = z; \
 \
-	PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
-	                               n, \
-	                               alpha1_cast, \
-	                               x_cast, incx, \
-	                               z_cast, incz ); \
-	PASTEMAC3(chxy,chy,chz,axpyv)( conjy, \
-	                               n, \
-	                               alpha2_cast, \
-	                               y_cast, incy, \
-	                               z_cast, incz ); \
+	PASTEMAC3(chxy,chx,chz,kername)( conjx, \
+	                                 n, \
+	                                 alpha1_cast, \
+	                                 x_cast, incx, \
+	                                 z_cast, incz ); \
+	PASTEMAC3(chxy,chy,chz,kername)( conjy, \
+	                                 n, \
+	                                 alpha2_cast, \
+	                                 y_cast, incy, \
+	                                 z_cast, incz ); \
 }

 // Define the basic set of functions unconditionally, and then also some
 // mixed datatype functions if requested.
-INSERT_GENTFUNC3U12_BASIC( axpy2v, axpy2v_unb_var1 )
+INSERT_GENTFUNC3U12_BASIC( axpy2v_unb_var1, AXPYV_KERNEL )

 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-INSERT_GENTFUNC3U12_MIX_D( axpy2v, axpy2v_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_D( axpy2v_unb_var1, AXPYV_KERNEL )
 #endif

 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-INSERT_GENTFUNC3U12_MIX_P( axpy2v, axpy2v_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_P( axpy2v_unb_var1, AXPYV_KERNEL )
 #endif

--- a/frame/1f/axpy2v/bli_axpy2v_unb_var1.h
+++ b/frame/1f/axpy2v/bli_axpy2v_unb_var1.h
@@ -32,12 +32,6 @@

 */

-/*
-void bli_axpy2v_unb_var1( obj_t* alpha,
-                         obj_t* x,
-                         obj_t* y );
-*/
-

 #undef  GENTPROT3
 #define GENTPROT3( ctype_x, ctype_y, ctype_z, chx, chy, chz, varname ) \
--- a/frame/1f/axpyf/bli_axpyf.c
+++ b/frame/1f/axpyf/bli_axpyf.c
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
                          conj_t conja, \
                          conj_t conjx, \
                          dim_t  m, \
-                          dim_t  n, \
+                          dim_t  b_n, \
                          ctype* alpha, \
                          ctype* a, inc_t inca, inc_t lda, \
                          ctype* x, inc_t incx, \
@@ -55,7 +55,7 @@ void PASTEMAC(ch,opname)( \
 	PASTEMAC3(ch,ch,ch,varname)( conja, \
 	                             conjx, \
 	                             m, \
-	                             n, \
+	                             b_n, \
 	                             alpha, \
 	                             a, inca, lda, \
 	                             x, incx, \
@@ -75,7 +75,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
                                    conj_t    conja, \
                                    conj_t    conjx, \
                                    dim_t     m, \
-                                    dim_t     n, \
+                                    dim_t     b_n, \
                                    ctype_ax* alpha, \
                                    ctype_a*  a, inc_t inca, inc_t lda, \
                                    ctype_x*  x, inc_t incx, \
@@ -85,7 +85,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
 	PASTEMAC3(cha,chx,chy,varname)( conja, \
 	                                conjx, \
 	                                m, \
-	                                n, \
+	                                b_n, \
 	                                alpha, \
 	                                a, inca, lda, \
 	                                x, incx, \
--- a/frame/1f/axpyf/bli_axpyf.h
+++ b/frame/1f/axpyf/bli_axpyf.h
@@ -35,24 +35,6 @@
 #include "bli_axpyf_unb_var1.h"


-//
-// Define fusing factors (if they are not already defined by the user
-// in bli_kernel.h).
-//
-#ifndef bli_saxpyf_fuse_fac
-#define bli_saxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
-#endif
-#ifndef bli_daxpyf_fuse_fac
-#define bli_daxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
-#endif
-#ifndef bli_caxpyf_fuse_fac
-#define bli_caxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
-#endif
-#ifndef bli_zaxpyf_fuse_fac
-#define bli_zaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
-#endif
-
-
 //
 // Prototype BLAS-like interfaces with homogeneous-typed operands.
 //
@@ -63,7 +45,7 @@ void PASTEMAC(ch,opname)( \
                          conj_t conja, \
                          conj_t conjx, \
                          dim_t  m, \
-                          dim_t  n, \
+                          dim_t  b_n, \
                          ctype* alpha, \
                          ctype* a, inc_t inca, inc_t lda, \
                          ctype* x, inc_t incx, \
@@ -83,7 +65,7 @@ void PASTEMAC3(cha,chx,chy,opname)( \
                                    conj_t    conja, \
                                    conj_t    conjx, \
                                    dim_t     m, \
-                                    dim_t     n, \
+                                    dim_t     b_n, \
                                    ctype_ax* alpha, \
                                    ctype_a*  a, inc_t inca, inc_t lda, \
                                    ctype_x*  x, inc_t incx, \
--- a/frame/1f/axpyf/bli_axpyf_unb_var1.c
+++ b/frame/1f/axpyf/bli_axpyf_unb_var1.c
@@ -34,71 +34,9 @@

 #include "blis.h"

-/*
-#define FUNCPTR_T axpyf_fp
-
-typedef void (*FUNCPTR_T)(
-                           conj_t conjx,
-                           dim_t  n,
-                           void*  alpha,
-                           void*  x, inc_t incx,
-                           void*  y, inc_t incy
-                         );
-
-// If some mixed datatype functions will not be compiled, we initialize
-// the corresponding elements of the function array to NULL.
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_unb_var1);
-#else
-#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_unb_var1);
-#else
-static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_unb_var1);
-#endif
-#endif
-
-
-void bli_axpyf_unb_var1( obj_t*  alpha,
-                         obj_t*  x,
-                         obj_t*  y )
-{
-	num_t     dt_x      = bli_obj_datatype( *x );
-	num_t     dt_y      = bli_obj_datatype( *y );
-
-	conj_t    conjx     = bli_obj_conj_status( *x );
-	dim_t     n         = bli_obj_vector_dim( *x );
-
-	inc_t     inc_x     = bli_obj_vector_inc( *x );
-	void*     buf_x     = bli_obj_buffer_at_off( *x );
-
-	inc_t     inc_y     = bli_obj_vector_inc( *y );
-	void*     buf_y     = bli_obj_buffer_at_off( *y );
-
-	num_t     dt_alpha;
-	void*     buf_alpha;
-
-	FUNCPTR_T f;
-
-	// If alpha is a scalar constant, use dt_x to extract the address of the
-	// corresponding constant value; otherwise, use the datatype encoded
-	// within the alpha object and extract the buffer at the alpha offset.
-	bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_alpha][dt_x][dt_y];
-
-	// Invoke the function.
-	f( conjx,
-	   n,
-	   buf_alpha,
-	   buf_x, inc_x,
-	   buf_y, inc_y );
-}
-*/

 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
 \
 void PASTEMAC3(cha,chx,chy,varname)( \
                                     conj_t conja, \
@@ -130,23 +68,23 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 		PASTEMAC2(chx,chax,copycjs)( conjx, *chi1, alpha_chi1 ); \
 		PASTEMAC2(chax,chax,scals)( *alpha_cast, alpha_chi1 ); \
 \
-		PASTEMAC3(chax,cha,chy,axpyv)( conja, \
-		                               m, \
-		                               &alpha_chi1, \
-		                               a1, inca, \
-		                               y1, incy ); \
+		PASTEMAC3(chax,cha,chy,kername)( conja, \
+		                                 m, \
+		                                 &alpha_chi1, \
+		                                 a1, inca, \
+		                                 y1, incy ); \
 	} \
 }

 // Define the basic set of functions unconditionally, and then also some
 // mixed datatype functions if requested.
-INSERT_GENTFUNC3U12_BASIC( axpyf, axpyf_unb_var1 )
+INSERT_GENTFUNC3U12_BASIC( axpyf_unb_var1, AXPYV_KERNEL )

 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-INSERT_GENTFUNC3U12_MIX_D( axpyf, axpyf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_D( axpyf_unb_var1, AXPYV_KERNEL )
 #endif

 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-INSERT_GENTFUNC3U12_MIX_P( axpyf, axpyf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_P( axpyf_unb_var1, AXPYV_KERNEL )
 #endif

--- a/frame/1f/axpyf/bli_axpyf_unb_var1.h
+++ b/frame/1f/axpyf/bli_axpyf_unb_var1.h
@@ -32,12 +32,6 @@

 */

-/*
-void bli_axpyf_unb_var1( obj_t* alpha,
-                         obj_t* x,
-                         obj_t* y );
-*/
-

 #undef  GENTPROT3U12
 #define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
@@ -46,7 +40,7 @@ void PASTEMAC3(cha,chx,chy,varname)( \
                                     conj_t conja, \
                                     conj_t conjx, \
                                     dim_t  m, \
-                                     dim_t  n, \
+                                     dim_t  b_n, \
                                     void*  alpha, \
                                     void*  a, inc_t inca, inc_t lda, \
                                     void*  x, inc_t incx, \
--- a/frame/1f/dotaxpyv/bli_dotaxpyv.c
+++ b/frame/1f/dotaxpyv/bli_dotaxpyv.c
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
                          conj_t conjxt, \
                          conj_t conjx, \
                          conj_t conjy, \
-                          dim_t  n, \
+                          dim_t  m, \
                          ctype* alpha, \
                          ctype* x, inc_t incx, \
                          ctype* y, inc_t incy, \
@@ -56,7 +56,7 @@ void PASTEMAC(ch,opname)( \
 	PASTEMAC3(ch,ch,ch,varname)( conjxt, \
 	                             conjx, \
 	                             conjy, \
-	                             n, \
+	                             m, \
 	                             alpha, \
 	                             x, incx, \
 	                             y, incy, \
@@ -77,8 +77,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
                                    conj_t    conjxt, \
                                    conj_t    conjx, \
                                    conj_t    conjy, \
-                                    dim_t     n, \
-                                    ctype_xy* alpha, \
+                                    dim_t     m, \
+                                    ctype_x*  alpha, \
                                    ctype_x*  x, inc_t incx, \
                                    ctype_y*  y, inc_t incy, \
                                    ctype_xy* rho, \
@@ -88,7 +88,7 @@ void PASTEMAC3(chx,chy,chz,opname)( \
 	PASTEMAC3(chx,chy,chz,varname)( conjxt, \
 	                                conjx, \
 	                                conjy, \
-	                                n, \
+	                                m, \
 	                                alpha, \
 	                                x, incx, \
 	                                y, incy, \
--- a/frame/1f/dotaxpyv/bli_dotaxpyv.h
+++ b/frame/1f/dotaxpyv/bli_dotaxpyv.h
@@ -45,7 +45,7 @@ void PASTEMAC(ch,opname)( \
                          conj_t conjxt, \
                          conj_t conjx, \
                          conj_t conjy, \
-                          dim_t  n, \
+                          dim_t  m, \
                          ctype* alpha, \
                          ctype* x, inc_t incx, \
                          ctype* y, inc_t incy, \
@@ -66,8 +66,8 @@ void PASTEMAC3(chx,chy,chz,opname)( \
                                    conj_t    conjxt, \
                                    conj_t    conjx, \
                                    conj_t    conjy, \
-                                    dim_t     n, \
-                                    ctype_xy* alpha, \
+                                    dim_t     m, \
+                                    ctype_x*  alpha, \
                                    ctype_x*  x, inc_t incx, \
                                    ctype_y*  y, inc_t incy, \
                                    ctype_xy* rho, \
--- a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.c
+++ b/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.c
@@ -36,13 +36,13 @@


 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \
+#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, dotxvker, axpyvker ) \
 \
 void PASTEMAC3(chx,chy,chz,varname)( \
                                     conj_t conjxt, \
                                     conj_t conjx, \
                                     conj_t conjy, \
-                                     dim_t  n, \
+                                     dim_t  m, \
                                     void*  alpha, \
                                     void*  x, inc_t incx, \
                                     void*  y, inc_t incy, \
@@ -52,36 +52,36 @@ void PASTEMAC3(chx,chy,chz,varname)( \
 { \
 	ctype_xy* one        = PASTEMAC(chxy,1); \
 	ctype_xy* zero       = PASTEMAC(chxy,0); \
-	ctype_xy* alpha_cast = alpha; \
+	ctype_x*  alpha_cast = alpha; \
 	ctype_x*  x_cast     = x; \
 	ctype_y*  y_cast     = y; \
 	ctype_xy* rho_cast   = rho; \
 	ctype_z*  z_cast     = z; \
 \
-	PASTEMAC3(chx,chy,chxy,dotxv)( conjxt, \
-	                               conjy, \
-	                               n, \
-	                               one, \
-	                               x_cast, incx, \
-	                               y_cast, incy, \
-	                               zero, \
-	                               rho_cast ); \
-	PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \
-	                               n, \
-	                               alpha_cast, \
-	                               x_cast, incx, \
-	                               z_cast, incz ); \
+	PASTEMAC3(chx,chy,chxy,dotxvker)( conjxt, \
+	                                  conjy, \
+	                                  m, \
+	                                  one, \
+	                                  x_cast, incx, \
+	                                  y_cast, incy, \
+	                                  zero, \
+	                                  rho_cast ); \
+	PASTEMAC3(chx,chx,chz,axpyvker)( conjx, \
+	                                 m, \
+	                                 alpha_cast, \
+	                                 x_cast, incx, \
+	                                 z_cast, incz ); \
 }

 // Define the basic set of functions unconditionally, and then also some
 // mixed datatype functions if requested.
-INSERT_GENTFUNC3U12_BASIC( dotaxpyv, dotaxpyv_unb_var1 )
+INSERT_GENTFUNC3U12_BASIC2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )

 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-INSERT_GENTFUNC3U12_MIX_D( dotaxpyv, dotaxpyv_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_D2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
 #endif

 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-INSERT_GENTFUNC3U12_MIX_P( dotaxpyv, dotaxpyv_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_P2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
 #endif

--- a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.h
+++ b/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.h
@@ -40,7 +40,7 @@ void PASTEMAC3(chx,chy,chz,varname)( \
                                     conj_t conjxt, \
                                     conj_t conjx, \
                                     conj_t conjy, \
-                                     dim_t  n, \
+                                     dim_t  m, \
                                     void*  alpha, \
                                     void*  x, inc_t incx, \
                                     void*  y, inc_t incy, \
--- a/frame/1f/dotxaxpyf/bli_dotxaxpyf.h
+++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf.h
@@ -35,24 +35,6 @@
 #include "bli_dotxaxpyf_unb_var1.h"


-//
-// Define fusing factors (if they are not already defined by the user
-// in bli_kernel.h).
-//
-#ifndef bli_sdotxaxpyf_fuse_fac
-#define bli_sdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
-#endif
-#ifndef bli_ddotxaxpyf_fuse_fac
-#define bli_ddotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
-#endif
-#ifndef bli_cdotxaxpyf_fuse_fac
-#define bli_cdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
-#endif
-#ifndef bli_zdotxaxpyf_fuse_fac
-#define bli_zdotxaxpyf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
-#endif
-
-
 //
 // Prototype BLAS-like interfaces with homogeneous-typed operands.
 //
--- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c
+++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c
@@ -36,7 +36,7 @@


 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, opname, varname ) \
+#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, dotxvker, axpyvker ) \
 \
 void PASTEMAC3(cha,chb,chc,varname)( \
                                     conj_t conjat, \
@@ -107,13 +107,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \

 // Define the basic set of functions unconditionally, and then also some
 // mixed datatype functions if requested.
-INSERT_GENTFUNC3U12_BASIC( dotxaxpyf, dotxaxpyf_unb_var1 )
+INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )

 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf, dotxaxpyf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
 #endif

 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf, dotxaxpyf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL )
 #endif

--- a/frame/1f/dotxf/bli_dotxf.c
+++ b/frame/1f/dotxf/bli_dotxf.c
@@ -42,26 +42,26 @@
 #define GENTFUNC( ctype, ch, opname, varname ) \
 \
 void PASTEMAC(ch,opname)( \
+                          conj_t conjat, \
                          conj_t conjx, \
-                          conj_t conjy, \
                          dim_t  m, \
-                          dim_t  n, \
+                          dim_t  b_n, \
                          ctype* alpha, \
-                          ctype* x, inc_t incx, inc_t ldx, \
-                          ctype* y, inc_t incy, \
+                          ctype* a, inc_t inca, inc_t lda, \
+                          ctype* x, inc_t incx, \
                          ctype* beta, \
-                          ctype* r, inc_t incr \
+                          ctype* y, inc_t incy \
                        ) \
 { \
-	PASTEMAC3(ch,ch,ch,varname)( conjx, \
-	                             conjy, \
+	PASTEMAC3(ch,ch,ch,varname)( conjat, \
+	                             conjx, \
 	                             m, \
-	                             n, \
+	                             b_n, \
 	                             alpha, \
-	                             x, incx, ldx, \
-	                             y, incy, \
+	                             a, inca, lda, \
+	                             x, incx, \
 	                             beta, \
-	                             r, incr ); \
+	                             y, incy ); \
 }

 INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
@@ -71,29 +71,29 @@ INSERT_GENTFUNC_BASIC( dotxf, DOTXF_KERNEL )
 // Define BLAS-like interfaces with heterogeneous-typed operands.
 //
 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
 \
-void PASTEMAC3(chx,chy,chr,opname)( \
+void PASTEMAC3(cha,chx,chy,opname)( \
+                                    conj_t    conjat, \
                                    conj_t    conjx, \
-                                    conj_t    conjy, \
                                    dim_t     m, \
-                                    dim_t     n, \
-                                    ctype_xy* alpha, \
-                                    ctype_x*  x, inc_t incx, inc_t ldx, \
-                                    ctype_y*  y, inc_t incy, \
-                                    ctype_r*  beta, \
-                                    ctype_r*  r, inc_t incr \
+                                    dim_t     b_n, \
+                                    ctype_ax* alpha, \
+                                    ctype_a*  a, inc_t inca, inc_t lda, \
+                                    ctype_x*  x, inc_t incx, \
+                                    ctype_y*  beta, \
+                                    ctype_y*  y, inc_t incy \
                                  ) \
 { \
-	PASTEMAC3(chx,chy,chr,varname)( conjx, \
-	                                conjy, \
+	PASTEMAC3(cha,chx,chy,varname)( conjat, \
+	                                conjx, \
 	                                m, \
-	                                n, \
+	                                b_n, \
 	                                alpha, \
-	                                x, incx, ldx, \
-	                                y, incy, \
+	                                a, inca, lda, \
+	                                x, incx, \
 	                                beta, \
-	                                r, incr ); \
+	                                y, incy ); \
 }

 // Define the basic set of functions unconditionally, and then also some
--- a/frame/1f/dotxf/bli_dotxf.h
+++ b/frame/1f/dotxf/bli_dotxf.h
@@ -35,24 +35,6 @@
 #include "bli_dotxf_unb_var1.h"


-//
-// Define fusing factors (if they are not already defined by the user
-// in bli_kernel.h).
-//
-#ifndef bli_sdotxf_fuse_fac
-#define bli_sdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_S
-#endif
-#ifndef bli_ddotxf_fuse_fac
-#define bli_ddotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_D
-#endif
-#ifndef bli_cdotxf_fuse_fac
-#define bli_cdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_C
-#endif
-#ifndef bli_zdotxf_fuse_fac
-#define bli_zdotxf_fuse_fac BLIS_DEFAULT_FUSING_FACTOR_Z
-#endif
-
-
 //
 // Prototype BLAS-like interfaces with homogeneous-typed operands.
 //
@@ -60,15 +42,15 @@
 #define GENTPROT( ctype, ch, opname ) \
 \
 void PASTEMAC(ch,opname)( \
+                          conj_t conjat, \
                          conj_t conjx, \
-                          conj_t conjy, \
                          dim_t  m, \
-                          dim_t  n, \
+                          dim_t  b_n, \
                          ctype* alpha, \
-                          ctype* x, inc_t incx, inc_t ldx, \
-                          ctype* y, inc_t incy, \
+                          ctype* a, inc_t inca, inc_t lda, \
+                          ctype* x, inc_t incx, \
                          ctype* beta, \
-                          ctype* r, inc_t incr \
+                          ctype* y, inc_t incy \
                        );

 INSERT_GENTPROT_BASIC( dotxf )
@@ -78,18 +60,18 @@ INSERT_GENTPROT_BASIC( dotxf )
 // Prototype BLAS-like interfaces with heterogeneous-typed operands.
 //
 #undef  GENTPROT3U12
-#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname ) \
+#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname ) \
 \
-void PASTEMAC3(chx,chy,chr,opname)( \
+void PASTEMAC3(cha,chx,chy,opname)( \
+                                    conj_t    conjat, \
                                    conj_t    conjx, \
-                                    conj_t    conjy, \
                                    dim_t     m, \
-                                    dim_t     n, \
-                                    ctype_xy* alpha, \
-                                    ctype_x*  x, inc_t incx, inc_t ldx, \
-                                    ctype_y*  y, inc_t incy, \
-                                    ctype_r*  beta, \
-                                    ctype_r*  r, inc_t incr \
+                                    dim_t     b_n, \
+                                    ctype_ax* alpha, \
+                                    ctype_a*  a, inc_t inca, inc_t lda, \
+                                    ctype_x*  x, inc_t incx, \
+                                    ctype_y*  beta, \
+                                    ctype_y*  y, inc_t incy \
                                  );


--- a/frame/1f/dotxf/bli_dotxf_unb_var1.c
+++ b/frame/1f/dotxf/bli_dotxf_unb_var1.c
@@ -34,139 +34,58 @@

 #include "blis.h"

-/*
-#define FUNCPTR_T dotxf_fp
-
-typedef void (*FUNCPTR_T)(
-                           conj_t conjx,
-                           conj_t conjy,
-                           dim_t  n,
-                           void*  alpha,
-                           void*  x, inc_t incx,
-                           void*  y, inc_t incy,
-                           void*  beta,
-                           void*  rho
-                         );
-
-// If some mixed datatype functions will not be compiled, we initialize
-// the corresponding elements of the function array to NULL.
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_unb_var1);
-#else
-#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_unb_var1);
-#else
-static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_unb_var1);
-#endif
-#endif
-
-
-void bli_dotxf_unb_var1( obj_t*  alpha,
-                         obj_t*  x,
-                         obj_t*  y,
-                         obj_t*  beta,
-                         obj_t*  rho )
-{
-	num_t     dt_x      = bli_obj_datatype( *x );
-	num_t     dt_y      = bli_obj_datatype( *y );
-	num_t     dt_rho    = bli_obj_datatype( *rho );
-
-	conj_t    conjx     = bli_obj_conj_status( *x );
-	conj_t    conjy     = bli_obj_conj_status( *y );
-	dim_t     n         = bli_obj_vector_dim( *x );
-
-	inc_t     inc_x     = bli_obj_vector_inc( *x );
-	void*     buf_x     = bli_obj_buffer_at_off( *x );
-
-	inc_t     inc_y     = bli_obj_vector_inc( *y );
-	void*     buf_y     = bli_obj_buffer_at_off( *y );
-
-	void*     buf_rho   = bli_obj_buffer_at_off( *rho );
-
-	num_t     dt_alpha;
-	void*     buf_alpha;
-
-	num_t     dt_beta;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// The datatype of alpha MUST be the type union of x and y. This is to
-	// prevent any unnecessary loss of information during computation.
-	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
-
-	// The datatype of beta MUST be the same as the datatype of rho.
-	dt_beta   = dt_rho;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_x][dt_y][dt_rho];
-
-	// Invoke the function.
-	f( conjx,
-	   conjy,
-	   n,
-	   buf_alpha, 
-	   buf_x, inc_x, 
-	   buf_y, inc_y,
-	   buf_beta, 
-	   buf_rho );
-}
-*/

 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
 \
-void PASTEMAC3(chx,chy,chr,varname)( \
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t conjat, \
                                     conj_t conjx, \
-                                     conj_t conjy, \
-                                     dim_t  b_m, \
-                                     dim_t  n, \
+                                     dim_t  m, \
+                                     dim_t  b_n, \
                                     void*  alpha, \
-                                     void*  x, inc_t incx, inc_t ldx, \
-                                     void*  y, inc_t incy, \
+                                     void*  a, inc_t inca, inc_t lda, \
+                                     void*  x, inc_t incx, \
                                     void*  beta, \
-                                     void*  r, inc_t incr \
+                                     void*  y, inc_t incy \
                                   ) \
 { \
-	ctype_xy* alpha_cast = alpha; \
+	ctype_ax* alpha_cast = alpha; \
+	ctype_a*  a_cast     = a; \
 	ctype_x*  x_cast     = x; \
+	ctype_y*  beta_cast  = beta; \
 	ctype_y*  y_cast     = y; \
-	ctype_r*  beta_cast  = beta; \
-	ctype_r*  r_cast     = r; \
+	ctype_a*  a1; \
 	ctype_x*  x1; \
-	ctype_y*  y1; \
-	ctype_r*  rho1; \
+	ctype_y*  psi1; \
 	dim_t     i; \
 \
-	for ( i = 0; i < b_m; ++i ) \
+	for ( i = 0; i < b_n; ++i ) \
 	{ \
-		x1   = x_cast + (0  )*incx + (i  )*ldx; \
-		y1   = y_cast + (0  )*incy; \
-		rho1 = r_cast + (i  )*incr; \
+		a1   = a_cast + (0  )*inca + (i  )*lda; \
+		x1   = x_cast + (0  )*incx; \
+		psi1 = y_cast + (i  )*incy; \
 \
-		PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
-		                              conjy, \
-		                              n, \
-		                              alpha_cast, \
-		                              x1,   incx, \
-		                              y1,   incy, \
-		                              beta_cast, \
-		                              rho1 ); \
+		PASTEMAC3(cha,chx,chy,kername)( conjat, \
+		                                conjx, \
+		                                m, \
+		                                alpha_cast, \
+		                                a1,   inca, \
+		                                x1,   incx, \
+		                                beta_cast, \
+		                                psi1 ); \
 	} \
 }

 // Define the basic set of functions unconditionally, and then also some
 // mixed datatype functions if requested.
-INSERT_GENTFUNC3U12_BASIC( dotxf, dotxf_unb_var1 )
+INSERT_GENTFUNC3U12_BASIC( dotxf_unb_var1, DOTXV_KERNEL )

 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-INSERT_GENTFUNC3U12_MIX_D( dotxf, dotxf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_D( dotxf_unb_var1, DOTXV_KERNEL )
 #endif

 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-INSERT_GENTFUNC3U12_MIX_P( dotxf, dotxf_unb_var1 )
+INSERT_GENTFUNC3U12_MIX_P( dotxf_unb_var1, DOTXV_KERNEL )
 #endif

--- a/frame/1f/dotxf/bli_dotxf_unb_var1.h
+++ b/frame/1f/dotxf/bli_dotxf_unb_var1.h
@@ -32,26 +32,20 @@

 */

-void bli_dotxf_unb_var1( obj_t* alpha,
-                         obj_t* x,
-                         obj_t* y,
-                         obj_t* beta,
-                         obj_t* rho );
-

 #undef  GENTPROT3U12
-#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
+#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
 \
-void PASTEMAC3(chx,chy,chr,varname)( \
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t conjat, \
                                     conj_t conjx, \
-                                     conj_t conjy, \
                                     dim_t  m, \
-                                     dim_t  n, \
+                                     dim_t  b_n, \
                                     void*  alpha, \
-                                     void*  x, inc_t incx, inc_t ldx, \
-                                     void*  y, inc_t incy, \
+                                     void*  a, inc_t inca, inc_t lda, \
+                                     void*  x, inc_t incx, \
                                     void*  beta, \
-                                     void*  r, inc_t incr \
+                                     void*  y, inc_t incy \
                                   );

 INSERT_GENTPROT3U12_BASIC( dotxf_unb_var1 )
--- a/frame/2/gemv/bli_gemv_unf_var1.c
+++ b/frame/2/gemv/bli_gemv_unf_var1.c
@@ -159,8 +159,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 \
 	conja = bli_extract_conj( transa ); \
 \
-	/* Query the fusing factor from the dotxf implementation. */ \
-	b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
+	/* Query the fusing factor for the dotxf implementation. */ \
+	b_fuse = PASTEMAC(chax,dotxf_fusefac); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
 	{ \
@@ -173,8 +173,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 		/* y1 = beta * y1 + alpha * A1 * x; */ \
 		PASTEMAC3(cha,chx,chy,kername)( conja, \
 		                                conjx, \
-		                                f, \
 		                                n_elem, \
+		                                f, \
 		                                alpha_cast, \
 		                                A1,   cs_at, rs_at, \
 		                                x1,   incx, \
--- a/frame/2/gemv/bli_gemv_unf_var2.c
+++ b/frame/2/gemv/bli_gemv_unf_var2.c
@@ -177,8 +177,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 		                          y_cast, incy ); \
 	} \
 \
-	/* Query the fusing factor from the axpyf implementation. */ \
-	b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
+	/* Query the fusing factor for the axpyf implementation. */ \
+	b_fuse = PASTEMAC(chax,axpyf_fusefac); \
 \
 	for ( i = 0; i < n_iter; i += f ) \
 	{ \
--- a/frame/2/hemv/bli_hemv_unf_var1.c
+++ b/frame/2/hemv/bli_hemv_unf_var1.c
@@ -210,8 +210,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 		                          y_cast, incy ); \
 	} \
 \
-	/* Query the fusing factor from the dotxaxpyf implementation. */ \
-	b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
+	/* Query the fusing factor for the dotxaxpyf implementation. */ \
+	b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
 \
 	for ( i = 0; i < m; i += f ) \
 	{ \
--- a/frame/2/hemv/bli_hemv_unf_var3.c
+++ b/frame/2/hemv/bli_hemv_unf_var3.c
@@ -228,8 +228,8 @@ void PASTEMAC3(cha,chx,chy,varname)( \
 		                          y_cast, incy ); \
 	} \
 \
-	/* Query the fusing factor from the dotxaxpyf implementation. */ \
-	b_fuse = PASTEMAC(chax,dotxaxpyf_fuse_fac); \
+	/* Query the fusing factor for the dotxaxpyf implementation. */ \
+	b_fuse = PASTEMAC(chax,dotxaxpyf_fusefac); \
 \
 	for ( i = 0; i < m; i += f ) \
 	{ \
--- a/frame/2/trmv/bli_trmv_unf_var1.c
+++ b/frame/2/trmv/bli_trmv_unf_var1.c
@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
 \
 	conja = bli_extract_conj( trans ); \
 \
-	/* Query the fusing factor from the dotxf implementation. */ \
-	b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
+	/* Query the fusing factor for the dotxf implementation. */ \
+	b_fuse = PASTEMAC(chax,dotxf_fusefac); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uplo_trans ) ) \
@@ -208,15 +208,15 @@ void PASTEMAC2(cha,chx,varname)( \
 			} \
 \
 			/* x1 = x1 + alpha * A12 * x2; */ \
-			PASTEMAC3(cha,chx,chx,dotxf)( conja, \
-			                              BLIS_NO_CONJUGATE, \
-			                              f, \
-			                              n_ahead, \
-			                              alpha_cast, \
-			                              A12, cs_at, rs_at, \
-			                              x2,  incx, \
-			                              one, \
-			                              x1,  incx ); \
+			PASTEMAC3(cha,chx,chx,kername)( conja, \
+			                                BLIS_NO_CONJUGATE, \
+			                                n_ahead, \
+			                                f, \
+			                                alpha_cast, \
+			                                A12, cs_at, rs_at, \
+			                                x2,  incx, \
+			                                one, \
+			                                x1,  incx ); \
 		} \
 	} \
 	else /* if ( bli_is_lower( uplo_trans ) ) */ \
@@ -265,8 +265,8 @@ void PASTEMAC2(cha,chx,varname)( \
 			/* x1 = x1 + alpha * A10 * x0; */ \
 			PASTEMAC3(cha,chx,chx,kername)( conja, \
 			                                BLIS_NO_CONJUGATE, \
-			                                f, \
 			                                n_ahead, \
+			                                f, \
 			                                alpha_cast, \
 			                                A10, cs_at, rs_at, \
 			                                x0,  incx, \
--- a/frame/2/trmv/bli_trmv_unf_var2.c
+++ b/frame/2/trmv/bli_trmv_unf_var2.c
@@ -159,8 +159,8 @@ void PASTEMAC2(cha,chx,varname)( \
 \
 	conja = bli_extract_conj( trans ); \
 \
-	/* Query the fusing factor from the axpyf implementation. */ \
-	b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
+	/* Query the fusing factor for the axpyf implementation. */ \
+	b_fuse = PASTEMAC(chax,axpyf_fusefac); \
 \
 	/* We reduce all of the possible cases down to just lower/upper. */ \
 	if      ( bli_is_upper( uplo_trans ) ) \
@@ -176,14 +176,14 @@ void PASTEMAC2(cha,chx,varname)( \
 			x0       = x_cast + (0  )*incx; \
 \
 			/* x0 = x0 + alpha * A01 * x1; */ \
-			PASTEMAC3(cha,chx,chx,axpyf)( conja, \
-			                              BLIS_NO_CONJUGATE, \
-			                              n_behind, \
-			                              f, \
-			                              alpha_cast, \
-			                              A01, rs_at, cs_at, \
-			                              x1,  incx, \
-			                              x0,  incx ); \
+			PASTEMAC3(cha,chx,chx,kername)( conja, \
+			                                BLIS_NO_CONJUGATE, \
+			                                n_behind, \
+			                                f, \
+			                                alpha_cast, \
+			                                A01, rs_at, cs_at, \
+			                                x1,  incx, \
+			                                x0,  incx ); \
 \
 			/* x1 = alpha * A11 * x1; */ \
 			for ( k = 0; k < f; ++k ) \
--- a/frame/2/trsv/bli_trsv_unf_var1.c
+++ b/frame/2/trsv/bli_trsv_unf_var1.c
@@ -161,8 +161,8 @@ void PASTEMAC2(cha,chx,varname)( \
 \
 	conja = bli_extract_conj( trans ); \
 \
-	/* Query the fusing factor from the dotxf implementation. */ \
-	b_fuse = PASTEMAC(chax,dotxf_fuse_fac); \
+	/* Query the fusing factor for the dotxf implementation. */ \
+	b_fuse = PASTEMAC(chax,dotxf_fusefac); \
 \
 	/* x = alpha * x; */ \
 	PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \
@@ -186,8 +186,8 @@ void PASTEMAC2(cha,chx,varname)( \
 			/* x1 = x1 - A12 * x2; */ \
 			PASTEMAC3(cha,chx,chx,kername)( conja, \
 			                                BLIS_NO_CONJUGATE, \
-			                                f, \
 			                                n_behind, \
+			                                f, \
 			                                minus_one, \
 			                                A12, cs_at, rs_at, \
 			                                x2,  incx, \
@@ -242,8 +242,8 @@ void PASTEMAC2(cha,chx,varname)( \
 			/* x1 = x1 - A10 * x0; */ \
 			PASTEMAC3(cha,chx,chx,kername)( conja, \
 			                                BLIS_NO_CONJUGATE, \
-			                                f, \
 			                                n_behind, \
+			                                f, \
 			                                minus_one, \
 			                                A10, cs_at, rs_at, \
 			                                x0,  incx, \
--- a/frame/2/trsv/bli_trsv_unf_var2.c
+++ b/frame/2/trsv/bli_trsv_unf_var2.c
@@ -160,8 +160,8 @@ void PASTEMAC2(cha,chx,varname)( \
 \
 	conja = bli_extract_conj( trans ); \
 \
-	/* Query the fusing factor from the axpyf implementation. */ \
-	b_fuse = PASTEMAC(chax,axpyf_fuse_fac); \
+	/* Query the fusing factor for the axpyf implementation. */ \
+	b_fuse = PASTEMAC(chax,axpyf_fusefac); \
 \
 	/* x = alpha * x; */ \
 	PASTEMAC2(chax,chx,scalv)( BLIS_NO_CONJUGATE, \
--- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c
+++ b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c
@@ -59,38 +59,39 @@ void PASTEMAC(ch,varname)( \
 	const inc_t     rs_ab = 1; \
 	const inc_t     cs_ab = PASTEMAC(ch,mr); \
 \
-	dim_t           k0, j0, i0; \
+	dim_t           l, j, i; \
 \
 	ctype           ab[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ]; \
-	ctype* restrict ab00; \
-	ctype           a0; \
-	ctype           b0; \
+	ctype* restrict abij; \
+	ctype           ai; \
+	ctype           bj; \
 \
 \
 	/* Initialize the accumulator elements in ab to zero. */ \
-	for ( i0 = 0; i0 < m * n; ++i0 ) \
+	for ( i = 0; i < m * n; ++i ) \
 	{ \
-		PASTEMAC(ch,set0s)( *(ab + i0) ); \
+		PASTEMAC(ch,set0s)( *(ab + i) ); \
 	} \
 \
 	/* Perform a series of k rank-1 updates into ab. */ \
-	for ( k0 = 0; k0 < k; ++k0 ) \
+	for ( l = 0; l < k; ++l ) \
 	{ \
-		ab00 = ab; \
+		abij = ab; \
 \
-		for ( j0 = 0; j0 < n; ++j0 ) \
+		/* In an optimized implementation, these two loops over MR and NR
+		   are typically fully unrolled. */ \
+		for ( j = 0; j < n; ++j ) \
 		{ \
-			b0 = *(b + j0); \
+			bj = *(b + j); \
 \
-			for ( i0 = 0; i0 < m; ++i0 ) \
+			for ( i = 0; i < m; ++i ) \
 			{ \
-				a0 = *(a + i0); \
+				ai = *(a + i); \
 \
-				PASTEMAC(ch,dots)( a0, \
-				                   b0, \
-				                   *ab00 ); \
-				ab00 += rs_ab; \
+				PASTEMAC(ch,dots)( ai, bj, *abij ); \
+\
+				abij += rs_ab; \
 			} \
 		} \
 \
@@ -99,9 +100,9 @@ void PASTEMAC(ch,varname)( \
 	} \
 \
 	/* Scale the result in ab by alpha. */ \
-	for ( i0 = 0; i0 < m * n; ++i0 ) \
+	for ( i = 0; i < m * n; ++i ) \
 	{ \
-		PASTEMAC(ch,scals)( *alpha, *(ab + i0) ); \
+		PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \
 	} \
 \
 	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c
@@ -41,12 +41,12 @@
 void PASTEMAC(ch,varname)( \
                           dim_t           k, \
                           ctype* restrict alpha, \
-                           ctype* restrict aL, \
-                           ctype* restrict a, \
-                           ctype* restrict bdT, \
-                           ctype* restrict bd, \
-                           ctype* restrict b, \
-                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a10, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd01, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
                           ctype* restrict b_next  \
                         ) \
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
 \
 	ctype* restrict minus_one = PASTEMAC(ch,m1); \
 \
-	/* b = alpha * b - aL * bdT; */ \
+	/* b11 = alpha * b11 - a10 * bd01; */ \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
-	                      aL, \
-	                      bdT, \
+	                      a10, \
+	                      bd01, \
 	                      alpha, \
-	                      b, rs_b, cs_b, \
+	                      b11, rs_b, cs_b, \
 	                      a_next, \
 	                      b_next ); \
 \
-	/* b = inv(a) * b;
-	   bd = b; (if gemm ukernel needs duplicated B)
-	   c = b;                       */ \
-	PASTEMAC(ch,trsmukr)( a, \
-	                      b, \
-	                      bd, \
-	                      c, rs_c, cs_c ); \
+	/* b11  = inv(a11) * b11;
+	   bd11 = b11; (skipped if duplication is disabled)
+	   c11  = b11; */ \
+	PASTEMAC(ch,trsmukr)( a11, \
+	                      b11, \
+	                      bd11, \
+	                      c11, rs_c, cs_c ); \
 }

 INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ref_mxn, GEMM_UKERNEL, TRSM_L_UKERNEL )
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h
@@ -42,12 +42,12 @@
 void PASTEMAC(ch,varname)( \
                           dim_t           k, \
                           ctype* restrict alpha, \
-                           ctype* restrict aL, \
-                           ctype* restrict a, \
-                           ctype* restrict bdT, \
-                           ctype* restrict bd, \
-                           ctype* restrict b, \
-                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a10, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd01, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
                           ctype* restrict b_next  \
                         );
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c
@@ -41,12 +41,12 @@
 void PASTEMAC(ch,varname)( \
                           dim_t           k, \
                           ctype* restrict alpha, \
-                           ctype* restrict aR, \
-                           ctype* restrict a, \
-                           ctype* restrict bdB, \
-                           ctype* restrict bd, \
-                           ctype* restrict b, \
-                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a12, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd21, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
                           ctype* restrict b_next  \
                         ) \
@@ -56,23 +56,23 @@ void PASTEMAC(ch,varname)( \
 \
 	ctype* restrict minus_one = PASTEMAC(ch,m1); \
 \
-	/* b = alpha * b - aR * bdB; */ \
+	/* b11 = alpha * b11 - a12 * bd21; */ \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
-	                      aR, \
-	                      bdB, \
+	                      a12, \
+	                      bd21, \
 	                      alpha, \
-	                      b, rs_b, cs_b, \
+	                      b11, rs_b, cs_b, \
 	                      a_next, \
 	                      b_next ); \
 \
-	/* b = inv(a) * b;
-	   bd = b; (if gemm ukernel needs duplicated B)
-	   c = b;                       */ \
-	PASTEMAC(ch,trsmukr)( a, \
-	                      b, \
-	                      bd, \
-	                      c, rs_c, cs_c ); \
+	/* b11  = inv(a11) * b11;
+	   bd11 = b11; (skipped if duplication is disabled)
+	   c11  = b11; */ \
+	PASTEMAC(ch,trsmukr)( a11, \
+	                      b11, \
+	                      bd11, \
+	                      c11, rs_c, cs_c ); \
 }

 INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ref_mxn, GEMM_UKERNEL, TRSM_U_UKERNEL )
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h
@@ -42,12 +42,12 @@
 void PASTEMAC(ch,varname)( \
                           dim_t           k, \
                           ctype* restrict alpha, \
-                           ctype* restrict aR, \
-                           ctype* restrict a, \
-                           ctype* restrict bdB, \
-                           ctype* restrict bd, \
-                           ctype* restrict b, \
-                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           ctype* restrict a12, \
+                           ctype* restrict a11, \
+                           ctype* restrict bd21, \
+                           ctype* restrict bd11, \
+                           ctype* restrict b11, \
+                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
                           ctype* restrict b_next  \
                         );
--- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
 	const inc_t     rs_b  = PASTEMAC(ch,packnr); \
 	const inc_t     cs_b  = 1; \
 \
-	dim_t           iter, i, j, k; \
+	dim_t           iter, i, j, l; \
 	dim_t           n_behind; \
 \
 	ctype* restrict alpha11; \
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
 \
 			/* chi11 = chi11 - a10t * x01; */ \
 			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( k = 0; k < n_behind; ++k ) \
+			for ( l = 0; l < n_behind; ++l ) \
 			{ \
-				alpha10 = a10t + (k  )*cs_a; \
-				chi01   = x01  + (k  )*rs_b; \
+				alpha10 = a10t + (l  )*cs_a; \
+				chi01   = x01  + (l  )*rs_b; \
 \
 				PASTEMAC(ch,axpys)( *alpha10, *chi01, rho11 ); \
 			} \
 			PASTEMAC(ch,subs)( rho11, *chi11 ); \
 \
 			/* chi11 = chi11 / alpha11; */ \
-			/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
-			   need to multiply rather than divide. */ \
+			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
+			   of alpha11, so we can multiply rather than divide. We store
+			   the inverse of alpha11 intentionally to avoid expensive
+			   division instructions within the micro-kernel. */ \
 			PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
 \
 			/* Output final result to matrix C. */ \
--- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c
@@ -54,7 +54,7 @@ void PASTEMAC(ch,varname)( \
 	const inc_t     rs_b  = PASTEMAC(ch,packnr); \
 	const inc_t     cs_b  = 1; \
 \
-	dim_t           iter, i, j, k; \
+	dim_t           iter, i, j, l; \
 	dim_t           n_behind; \
 \
 	ctype* restrict alpha11; \
@@ -87,18 +87,20 @@ void PASTEMAC(ch,varname)( \
 \
 			/* chi11 = chi11 - a12t * x21; */ \
 			PASTEMAC(ch,set0s)( rho11 ); \
-			for ( k = 0; k < n_behind; ++k ) \
+			for ( l = 0; l < n_behind; ++l ) \
 			{ \
-				alpha12 = a12t + (k  )*cs_a; \
-				chi21   = x21  + (k  )*rs_b; \
+				alpha12 = a12t + (l  )*cs_a; \
+				chi21   = x21  + (l  )*rs_b; \
 \
 				PASTEMAC(ch,axpys)( *alpha12, *chi21, rho11 ); \
 			} \
 			PASTEMAC(ch,subs)( rho11, *chi11 ); \
 \
 			/* chi11 = chi11 / alpha11; */ \
-			/* NOTE: 1.0/alpha11 is stored instead of alpha11, so we
-			   need to multiply rather than divide. */ \
+			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
+			   of alpha11, so we can multiply rather than divide. We store
+			   the inverse of alpha11 intentionally to avoid expensive
+			   division instructions within the micro-kernel. */ \
 			PASTEMAC(ch,scals)( *alpha11, *chi11 ); \
 \
 			/* Output final result to matrix C. */ \
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -248,5 +248,21 @@
 #define bli_cnifac   BLIS_DEFAULT_NI_FAC
 #define bli_znifac   BLIS_DEFAULT_NI_FAC

+// Default Level-1f fusing factors
+
+#define bli_sdotxf_fusefac       BLIS_DOTXF_FUSE_FAC_S
+#define bli_ddotxf_fusefac       BLIS_DOTXF_FUSE_FAC_D
+#define bli_cdotxf_fusefac       BLIS_DOTXF_FUSE_FAC_C
+#define bli_zdotxf_fusefac       BLIS_DOTXF_FUSE_FAC_Z
+
+#define bli_saxpyf_fusefac       BLIS_AXPYF_FUSE_FAC_S
+#define bli_daxpyf_fusefac       BLIS_AXPYF_FUSE_FAC_D
+#define bli_caxpyf_fusefac       BLIS_AXPYF_FUSE_FAC_C
+#define bli_zaxpyf_fusefac       BLIS_AXPYF_FUSE_FAC_Z
+
+#define bli_sdotxaxpyf_fusefac   BLIS_DOTXAXPYF_FUSE_FAC_S
+#define bli_ddotxaxpyf_fusefac   BLIS_DOTXAXPYF_FUSE_FAC_D
+#define bli_cdotxaxpyf_fusefac   BLIS_DOTXAXPYF_FUSE_FAC_C
+#define bli_zdotxaxpyf_fusefac   BLIS_DOTXAXPYF_FUSE_FAC_Z

 #endif 
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -370,6 +370,18 @@
 \
 	( rs < cs )

+#define bli_has_nonunit_inc1( inc1 ) \
+\
+	( (inc1) != 1 )
+
+#define bli_has_nonunit_inc2( inc1, inc2 ) \
+\
+	( (inc1) != 1 || (inc2) != 1 )
+
+#define bli_has_nonunit_inc3( inc1, inc2, inc3 ) \
+\
+	( (inc1) != 1 || (inc2) != 1 || (inc3) != 1 )
+

 // diag offset-related

--- a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c
+++ b/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c
@@ -34,127 +34,46 @@

 #include "blis.h"

-/*
-#define FUNCPTR_T dotxf_fp
-
-typedef void (*FUNCPTR_T)(
-                           conj_t conjx,
-                           conj_t conjy,
-                           dim_t  n,
-                           void*  alpha,
-                           void*  x, inc_t incx,
-                           void*  y, inc_t incy,
-                           void*  beta,
-                           void*  rho
-                         );
-
-// If some mixed datatype functions will not be compiled, we initialize
-// the corresponding elements of the function array to NULL.
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
-static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_opt_var1);
-#else
-#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
-static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_opt_var1);
-#else
-static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_opt_var1);
-#endif
-#endif
-
-
-void bli_dotxf_opt_var1( obj_t*  alpha,
-                         obj_t*  x,
-                         obj_t*  y,
-                         obj_t*  beta,
-                         obj_t*  rho )
-{
-	num_t     dt_x      = bli_obj_datatype( *x );
-	num_t     dt_y      = bli_obj_datatype( *y );
-	num_t     dt_rho    = bli_obj_datatype( *rho );
-
-	conj_t    conjx     = bli_obj_conj_status( *x );
-	conj_t    conjy     = bli_obj_conj_status( *y );
-	dim_t     n         = bli_obj_vector_dim( *x );
-
-	inc_t     inc_x     = bli_obj_vector_inc( *x );
-	void*     buf_x     = bli_obj_buffer_at_off( *x );
-
-	inc_t     inc_y     = bli_obj_vector_inc( *y );
-	void*     buf_y     = bli_obj_buffer_at_off( *y );
-
-	void*     buf_rho   = bli_obj_buffer_at_off( *rho );
-
-	num_t     dt_alpha;
-	void*     buf_alpha;
-
-	num_t     dt_beta;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// The datatype of alpha MUST be the type union of x and y. This is to
-	// prevent any unnecessary loss of information during computation.
-	dt_alpha  = bli_datatype_union( dt_x, dt_y );
-	buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );
-
-	// The datatype of beta MUST be the same as the datatype of rho.
-	dt_beta   = dt_rho;
-	buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_x][dt_y][dt_rho];
-
-	// Invoke the function.
-	f( conjx,
-	   conjy,
-	   n,
-	   buf_alpha, 
-	   buf_x, inc_x, 
-	   buf_y, inc_y,
-	   buf_beta, 
-	   buf_rho );
-}
-*/

 #undef  GENTFUNC3U12
-#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
+#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, opname, varname ) \
 \
-void PASTEMAC3(chx,chy,chr,varname)( \
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t conjat, \
                                      conj_t conjx, \
-                                     conj_t conjy, \
-                                     dim_t  b_m, \
-                                     dim_t  n, \
+                                     dim_t  m, \
+                                     dim_t  b_n, \
                                      void*  alpha, \
-                                     void*  x, inc_t incx, inc_t ldx, \
-                                     void*  y, inc_t incy, \
+                                     void*  a, inc_t inca, inc_t lda, \
+                                     void*  x, inc_t incx, \
                                      void*  beta, \
-                                     void*  r, inc_t incr \
+                                     void*  y, inc_t incy \
                                    ) \
 { \
-	ctype_xy* alpha_cast = alpha; \
-	ctype_x*  x_cast     = x; \
-	ctype_y*  y_cast     = y; \
-	ctype_r*  beta_cast  = beta; \
-	ctype_r*  r_cast     = r; \
-	ctype_x*  x1; \
-	ctype_y*  y1; \
-	ctype_r*  rho1; \
+	ctype_ax* alpha_cast = alpha; \
+	ctype_a*  a_cast     = a; \
+	ctype_x*  x_cast     = x; \
+	ctype_y*  beta_cast  = beta; \
+	ctype_y*  y_cast     = y; \
+	ctype_a*  a1; \
+	ctype_x*  x1; \
+	ctype_y*  psi1; \
 	dim_t     i; \
 \
-	for ( i = 0; i < b_m; ++i ) \
+	for ( i = 0; i < b_n; ++i ) /* one dotxv call per lda-strided vector of a */ \
 	{ \
-		x1   = x_cast + (0  )*incx + (i  )*ldx; \
-		y1   = y_cast + (0  )*incy; \
-		rho1 = r_cast + (i  )*incr; \
+		a1   = a_cast + (0  )*inca + (i  )*lda; \
+		x1   = x_cast + (0  )*incx; \
+		psi1 = y_cast + (i  )*incy; \
 \
-		PASTEMAC3(chx,chy,chr,dotxv)( conjx, \
-		                              conjy, \
-		                              n, \
+		PASTEMAC3(cha,chx,chy,dotxv)( conjat, \
+		                              conjx, \
+		                              m, \
 		                              alpha_cast, \
+		                              a1,   inca, \
 		                              x1,   incx, \
-		                              y1,   incy, \
 		                              beta_cast, \
-		                              rho1 ); \
+		                              psi1 ); \
 	} \
 }

@@ -184,30 +103,30 @@ typedef union


 void bli_ddddotxf_opt_var1(
+                            conj_t conjat,
                            conj_t conjx,
-                            conj_t conjy,
-                            dim_t  b_m,
-                            dim_t  n,
+                            dim_t  m,
+                            dim_t  b_n,
                            void*  alpha,
-                            void*  x, inc_t incx, inc_t ldx,
-                            void*  y, inc_t incy,
+                            void*  a, inc_t inca, inc_t lda,
+                            void*  x, inc_t incx,
                            void*  beta,
-                            void*  r, inc_t incr
+                            void*  y, inc_t incy
                          ) 
 { 
 	double*  restrict alpha_cast = alpha; 
 	double*  restrict beta_cast = beta; 
+	double*  restrict a_cast = a; 
 	double*  restrict x_cast = x; 
 	double*  restrict y_cast = y; 
-	double*  restrict r_cast = r; 
 	dim_t             i; 

 	const dim_t       n_elem_per_reg = 2;
 	const dim_t       n_iter_unroll  = 4;

-	dim_t             n_pre;
-	dim_t             n_run;
-	dim_t             n_left;
+	dim_t             m_pre;
+	dim_t             m_run;
+	dim_t             m_left;

 	double*  restrict x0;
 	double*  restrict x1;
@@ -223,76 +142,76 @@ void bli_ddddotxf_opt_var1(
 	bool_t            use_ref = FALSE;


-	if ( bli_zero_dim1( b_m ) ) return;
+	if ( bli_zero_dim1( b_n ) ) return;

-	// If the vector lengths are zero, scale r by beta and return.
+	// If the vector lengths are zero, scale y by beta and return.
-	if ( bli_zero_dim1( n ) ) 
+	if ( bli_zero_dim1( m ) ) 
 	{ 
 		PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
-		                      b_m,
+		                      b_n,
 		                      beta_cast,
-		                      r_cast, incr );
+		                      y_cast, incy );
 		return; 
 	} 

-    n_pre = 0;
+    m_pre = 0;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
-	if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
+	if ( b_n < PASTEMAC(d,dotxf_fuse_fac) )
 	{
 		use_ref = TRUE;
 	}
-    else if ( incx != 1 || incy != 1 || incr != 1 )
+    else if ( inca != 1 || incx != 1 || incy != 1 )
    {
        use_ref = TRUE;
    }
-	else if ( bli_is_unaligned_to( x, 16 ) ||
-	          bli_is_unaligned_to( y, 16 ) ||
-	          bli_is_unaligned_to( r, 16 ) )
+	else if ( bli_is_unaligned_to( a, 16 ) ||
+	          bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) )
 	{
 		use_ref = TRUE;

-		if ( bli_is_unaligned_to( x, 16 ) &&
-		     bli_is_unaligned_to( y, 16 ) &&
-		     bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned. 
+		if ( bli_is_unaligned_to( a, 16 ) &&
+		     bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_aligned_to( y, 16 ) ) // Note: y is not affected by a and x being unaligned.
 		{
 			use_ref = FALSE;
-			n_pre   = 1;
+			m_pre   = 1;
 		}
 	}

 	// Call the reference implementation if needed.
 	if ( use_ref == TRUE )
 	{
-		PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
-		                                 conjy,
-		                                 b_m,
-		                                 n,
+		PASTEMAC3(d,d,d,dotxf_unb_var1)( conjat,
+		                                 conjx,
+		                                 m,
+		                                 b_n,
 		                                 alpha_cast,
-		                                 x_cast, incx, ldx,
-		                                 y_cast, incy,
+		                                 a_cast, inca, lda,
+		                                 x_cast, incx,
 		                                 beta_cast,
-		                                 r_cast, incr );
+		                                 y_cast, incy );
 		return;
 	}


-	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
-	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
+	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
+	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

-	x0 = x_cast;
-	x1 = x_cast +   ldx;
-	x2 = x_cast + 2*ldx;
-	x3 = x_cast + 3*ldx;
-	y0 = y_cast;
+	x0 = a_cast;
+	x1 = a_cast +   lda;
+	x2 = a_cast + 2*lda;
+	x3 = a_cast + 3*lda;
+	y0 = x_cast;

 	PASTEMAC(d,set0s)( rho0 ); 
 	PASTEMAC(d,set0s)( rho1 ); 
 	PASTEMAC(d,set0s)( rho2 ); 
 	PASTEMAC(d,set0s)( rho3 ); 

-	if ( n_pre == 1 )
+	if ( m_pre == 1 )
 	{
 		x0c = *x0;
 		x1c = *x1;
@@ -305,11 +224,11 @@ void bli_ddddotxf_opt_var1(
 		rho2 += x2c * y0c;
 		rho3 += x3c * y0c;

-		x0 += incx;
-		x1 += incx;
-		x2 += incx;
-		x3 += incx;
-		y0 += incy;
+		x0 += inca;
+		x1 += inca;
+		x2 += inca;
+		x3 += inca;
+		y0 += incx;
 	}

 	rho0v.v = _mm_setzero_pd();
@@ -317,7 +236,7 @@ void bli_ddddotxf_opt_var1(
 	rho2v.v = _mm_setzero_pd();
 	rho3v.v = _mm_setzero_pd();

-	for ( i = 0; i < n_run; ++i )
+	for ( i = 0; i < m_run; ++i )
 	{
 		x0v.v = _mm_load_pd( ( double* )(x0 + 0*n_elem_per_reg) );
 		x1v.v = _mm_load_pd( ( double* )(x1 + 0*n_elem_per_reg) );
@@ -376,9 +295,9 @@ void bli_ddddotxf_opt_var1(
 	rho2 += rho2v.d[0] + rho2v.d[1];
 	rho3 += rho3v.d[0] + rho3v.d[1];

-	if ( n_left > 0 )
+	if ( m_left > 0 )
 	{
-		for ( i = 0; i < n_left; ++i )
+		for ( i = 0; i < m_left; ++i )
 		{
 			x0c = *x0;
 			x1c = *x1;
@@ -391,23 +310,23 @@ void bli_ddddotxf_opt_var1(
 			rho2 += x2c * y0c;
 			rho3 += x3c * y0c;

-			x0 += incx;
-			x1 += incx;
-			x2 += incx;
-			x3 += incx;
-			y0 += incy;
+			x0 += inca;
+			x1 += inca;
+			x2 += inca;
+			x3 += inca;
+			y0 += incx;
 		}
 	}
 /*
-	PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast  ) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+1) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+2) ); \
-	PASTEMAC2(d,d,scals)( *beta_cast, *(r_cast+3) ); \
+	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast  ) ); \
+	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+1) ); \
+	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+2) ); \
+	PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+3) ); \

-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(r_cast  ) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(r_cast+1) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(r_cast+2) ); \
-	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(r_cast+3) ); \
+	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho0, *(y_cast  ) ); \
+	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast+1) ); \
+	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+2) ); \
+	PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+3) ); \
 */

 	rho1v.d[0] = rho0;
@@ -418,8 +337,8 @@ void bli_ddddotxf_opt_var1(
 	betav.v  = _mm_loaddup_pd( ( double* ) beta_cast );
 	alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast );

-	rho0v.v = _mm_load_pd( ( double* )(r_cast + 0*n_elem_per_reg) );
-	rho2v.v = _mm_load_pd( ( double* )(r_cast + 1*n_elem_per_reg) );
+	rho0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg) );
+	rho2v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg) );

 	rho0v.v *= betav.v;
 	rho2v.v *= betav.v;
@@ -427,7 +346,7 @@ void bli_ddddotxf_opt_var1(
 	rho0v.v += alphav.v * rho1v.v;
 	rho2v.v += alphav.v * rho3v.v;

-	_mm_store_pd( ( double* )(r_cast + 0*n_elem_per_reg), rho0v.v );
-	_mm_store_pd( ( double* )(r_cast + 1*n_elem_per_reg), rho2v.v );
+	_mm_store_pd( ( double* )(y_cast + 0*n_elem_per_reg), rho0v.v );
+	_mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg), rho2v.v );

 }
--- a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.h
+++ b/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.h
@@ -32,12 +32,6 @@

 */

-void bli_dotxf_opt_var1( obj_t* alpha,
-                         obj_t* x,
-                         obj_t* y,
-                         obj_t* beta,
-                         obj_t* rho );
-

 //
 // Define fusing factors for dotxf operation.
@@ -49,18 +43,21 @@ void bli_dotxf_opt_var1( obj_t* alpha,


+// Prototype generator for the dotxf_opt_var1 kernels. Every type character
+// passed to PASTEMAC3 must be one of this macro's parameters (cha, chx, chy);
+// a name that is not a parameter is not substituted by the preprocessor.
 #undef  GENTPROT3U12
-#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
+#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
 \
-void PASTEMAC3(chx,chy,chr,varname)( \
+void PASTEMAC3(cha,chx,chy,varname)( \
+                                     conj_t conjat, \
                                      conj_t conjx, \
-                                     conj_t conjy, \
                                      dim_t  m, \
-                                     dim_t  n, \
+                                     dim_t  b_n, \
                                      void*  alpha, \
-                                     void*  x, inc_t incx, inc_t ldx, \
-                                     void*  y, inc_t incy, \
+                                     void*  a, inc_t inca, inc_t lda, \
+                                     void*  x, inc_t incx, \
                                      void*  beta, \
-                                     void*  r, inc_t incr \
+                                     void*  y, inc_t incy \
                                    );

 INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -3,9 +3,9 @@ c #rg     # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
 c #rji    # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
 0       # Test all combinations of storage schemes?
 32      # General stride spacing (for cases when testing general stride)
-d #sdcz    # Datatype(s) to test
+sdcz #sdcz    # Datatype(s) to test
 100     # Problem size: first to test
-500     # Problem size: maximum to test
+300     # Problem size: maximum to test
 100     # Problem size: increment between experiments
 1       # Error-checking level (0 = disable error checking; 1 = full error checking)
 i       # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)
--- a/testsuite/input.operations
+++ b/testsuite/input.operations
@@ -1,60 +1,60 @@
 # --- Utility ------------------------------------------------------------------

-0         randv                                         (0 = disable all; 1 = specify)
+1         randv                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)

-0         randm                                         (0 = disable all; 1 = specify)
+1         randm                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -1       dimensions: m n                             (-1 = bind to problem size)


 # --- Level-1v -----------------------------------------------------------------

-0         addv                                          (0 = disable all; 1 = specify)
+1         addv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjx                           (? = test all values)

-0         axpyv                                         (0 = disable all; 1 = specify)
+1         axpyv                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjx                           (? = test all values)

-0         copyv                                         (0 = disable all; 1 = specify)
+1         copyv                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjx                           (? = test all values)

-0         dotv                                          (0 = disable all; 1 = specify)
+1         dotv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ??          parameters: conjx conjy                     (? = test all values)

-0         dotxv                                         (0 = disable all; 1 = specify)
+1         dotxv                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ??          parameters: conjx conjy                     (? = test all values)

-0         fnormv                                        (0 = disable all; 1 = specify)
+1         fnormv                                        (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)

-0         scalv                                         (0 = disable all; 1 = specify)
+1         scalv                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjbeta                        (? = test all values)

-0         scal2v                                        (0 = disable all; 1 = specify)
+1         scal2v                                        (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjx                           (? = test all values)

-0         setv                                          (0 = disable all; 1 = specify)
+1         setv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)

-0         subv                                          (0 = disable all; 1 = specify)
+1         subv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ?           parameters: conjx                           (? = test all values)
@@ -62,40 +62,40 @@

 # --- Level-1m -----------------------------------------------------------------

-0         addm                                          (0 = disable all; 1 = specify)
+1         addm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: transa                          (? = test all values)

-0         axpym                                         (0 = disable all; 1 = specify)
+1         axpym                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -1       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: transa                          (? = test all values)

-0         copym                                         (0 = disable all; 1 = specify)
+1         copym                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: transa                          (? = test all values)

-0         fnormm                                        (0 = disable all; 1 = specify)
+1         fnormm                                        (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)

-0         scalm                                         (0 = disable all; 1 = specify)
+1         scalm                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: conjbeta                        (? = test all values)

-0         scal2m                                        (0 = disable all; 1 = specify)
+1         scal2m                                        (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: transa                          (? = test all values)

-0         setm                                          (0 = disable all; 1 = specify)
+1         setm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)

-0         subm                                          (0 = disable all; 1 = specify)
+1         subm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ?           parameters: transa                          (? = test all values)
@@ -103,52 +103,52 @@

 # --- Level-2 ------------------------------------------------------------------

-0         gemv                                          (0 = disable all; 1 = specify)
+1         gemv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ??          parameters: transa conjx                    (? = test all values)

-0         ger                                           (0 = disable all; 1 = specify)
+1         ger                                           (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m n                             (-1 = bind to problem size)
 ??          parameters: conjx conjy                     (? = test all values)

-0         hemv                                          (0 = disable all; 1 = specify)
+1         hemv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploa conja conjx               (? = test all values)

-0         her                                           (0 = disable all; 1 = specify)
+1         her                                           (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ??          parameters: uploc conjx                     (? = test all values)

-0         her2                                          (0 = disable all; 1 = specify)
+1         her2                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploc conjx conjy               (? = test all values)

-0         symv                                          (0 = disable all; 1 = specify)
+1         symv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploa conja conjx               (? = test all values)

-0         syr                                           (0 = disable all; 1 = specify)
+1         syr                                           (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ??          parameters: uploc conjx                     (? = test all values)

-0         syr2                                          (0 = disable all; 1 = specify)
+1         syr2                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploc conjx conjy               (? = test all values)

-0         trmv                                          (0 = disable all; 1 = specify)
+1         trmv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploa transa diaga              (? = test all values)

-0         trsv                                          (0 = disable all; 1 = specify)
+1         trsv                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1          dimensions: m                               (-1 = bind to problem size)
 ???         parameters: uploa transa diaga              (? = test all values)
@@ -158,37 +158,37 @@

 1         gemm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1 -1    dimensions: m n k                           (-1 = bind to problem size)
+-1 -1 -2    dimensions: m n k                           (-1 = bind to problem size)
 ??          parameters: transa transb                   (? = test all values)

 1         hemm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m n                             (-1 = bind to problem size)
+-1 -2       dimensions: m n                             (-1 = bind to problem size)
 ????        parameters: side uploa conja transb         (? = test all values)

 1         herk                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m k                             (-1 = bind to problem size)
+-1 -2       dimensions: m k                             (-1 = bind to problem size)
 ??          parameters: uploc transa                    (? = test all values)

 1         her2k                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m k                             (-1 = bind to problem size)
+-1 -2       dimensions: m k                             (-1 = bind to problem size)
 ???         parameters: uploc transa transb             (? = test all values)

 1         symm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m n                             (-1 = bind to problem size)
+-1 -2       dimensions: m n                             (-1 = bind to problem size)
 ????        parameters: side uploa conja transb         (? = test all values)

 1         syrk                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m k                             (-1 = bind to problem size)
+-1 -2       dimensions: m k                             (-1 = bind to problem size)
 ??          parameters: uploc transa                    (? = test all values)

 1         syr2k                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
-1 -1       dimensions: m k                             (-1 = bind to problem size)
+-1 -2       dimensions: m k                             (-1 = bind to problem size)
 ???         parameters: uploc transa transb             (? = test all values)

 1         trmm                                          (0 = disable all; 1 = specify)
--- a/testsuite/input.operations.1
+++ b/testsuite/input.operations.1
@@ -159,7 +159,7 @@
 1         gemm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -1 -2    dimensions: m n k                           (-1 = bind to problem size)
-nn          parameters: transa transb                   (? = test all values)
+??          parameters: transa transb                   (? = test all values)

 1         hemm                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
@@ -169,7 +169,7 @@ nn          parameters: transa transb                   (? = test all values)
 1         herk                                          (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
 -1 -2       dimensions: m k                             (-1 = bind to problem size)
-ln          parameters: uploc transa                    (? = test all values)
+??          parameters: uploc transa                    (? = test all values)

 1         her2k                                         (0 = disable all; 1 = specify)
 1           test sequential front-end                   (0 = disable; 1 = enable)
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -533,7 +533,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        BLIS_EXTEND_NC_C,
 	                        BLIS_EXTEND_NC_Z );
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "level-3 register blocksizes                     \n" );
+	libblis_test_fprintf_c( os, "level-3 register blocksizes  s     d     c     z \n" );
 	libblis_test_fprintf_c( os, "  m dimension            %5u %5u %5u %5u\n",
 	                        BLIS_DEFAULT_MR_S,
 	                        BLIS_DEFAULT_MR_D,
@@ -566,7 +566,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        BLIS_DEFAULT_NI_Z );
 */
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "level-3 packing duplication                     \n" );
+	libblis_test_fprintf_c( os, "level-3 packing duplication  s     d     c     z \n" );
 	libblis_test_fprintf_c( os, "  dupl. factors for B    %5u %5u %5u %5u\n",
 	                        BLIS_DEFAULT_NUM_DUPL_S,
 	                        BLIS_DEFAULT_NUM_DUPL_D,
@@ -578,7 +578,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        BLIS_NUM_ELEM_PER_REG_C,
 	                        BLIS_NUM_ELEM_PER_REG_Z );
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "level-2 cache blocksizes                        \n" );
+	libblis_test_fprintf_c( os, "level-2 cache blocksizes     s     d     c     z \n" );
 	libblis_test_fprintf_c( os, "  m dimension            %5u %5u %5u %5u\n",
 	                        BLIS_DEFAULT_L2_MC_S,
 	                        BLIS_DEFAULT_L2_MC_D,
@@ -590,11 +590,27 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        BLIS_DEFAULT_L2_NC_C,
 	                        BLIS_DEFAULT_L2_NC_Z );
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "level-1f fusing factors  %5u %5u %5u %5u\n",
-	                        BLIS_DEFAULT_FUSING_FACTOR_S,
-	                        BLIS_DEFAULT_FUSING_FACTOR_D,
-	                        BLIS_DEFAULT_FUSING_FACTOR_C,
-	                        BLIS_DEFAULT_FUSING_FACTOR_Z );
+	libblis_test_fprintf_c( os, "level-1f fusing factors      s     d     c     z \n" );
+	libblis_test_fprintf_c( os, "  default                %5u %5u %5u %5u\n",
+	                        BLIS_DEFAULT_FUSE_FAC_S,
+	                        BLIS_DEFAULT_FUSE_FAC_D,
+	                        BLIS_DEFAULT_FUSE_FAC_C,
+	                        BLIS_DEFAULT_FUSE_FAC_Z );
+	libblis_test_fprintf_c( os, "  axpyf                  %5u %5u %5u %5u\n",
+	                        BLIS_AXPYF_FUSE_FAC_S,
+	                        BLIS_AXPYF_FUSE_FAC_D,
+	                        BLIS_AXPYF_FUSE_FAC_C,
+	                        BLIS_AXPYF_FUSE_FAC_Z );
+	libblis_test_fprintf_c( os, "  dotxf                  %5u %5u %5u %5u\n",
+	                        BLIS_DOTXF_FUSE_FAC_S,
+	                        BLIS_DOTXF_FUSE_FAC_D,
+	                        BLIS_DOTXF_FUSE_FAC_C,
+	                        BLIS_DOTXF_FUSE_FAC_Z );
+	libblis_test_fprintf_c( os, "  dotxaxpyf              %5u %5u %5u %5u\n",
+	                        BLIS_DOTXAXPYF_FUSE_FAC_S,
+	                        BLIS_DOTXAXPYF_FUSE_FAC_D,
+	                        BLIS_DOTXAXPYF_FUSE_FAC_C,
+	                        BLIS_DOTXAXPYF_FUSE_FAC_Z );
 	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf( os, "\n" );