Removed support for duplication.

Details: - Removed support for duplication from the gemmtrsm/trsm micro-kernels and all framework code. - Updated test suite modules according to the above changes.
2026-04-20 07:38:53 +00:00 · 2013-11-08 11:17:34 -06:00
parent 68a5910974
commit 376bbb59c8
72 changed files with 206 additions and 1873 deletions
--- a/config/bgq/bli_kernel.h
+++ b/config/bgq/bli_kernel.h
@@ -152,34 +152,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -265,10 +237,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #include "bli_gemm_8x8.h"
--- a/config/dunnington/bli_kernel.h
+++ b/config/dunnington/bli_kernel.h
@@ -146,30 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -262,10 +238,6 @@
 //#include "bli_trsm_l_ref_4x4.h"
 //#include "bli_trsm_u_ref_4x4.h"

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_opt_d4x4
--- a/config/loongson3a/bli_kernel.h
+++ b/config/loongson3a/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        1
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -261,10 +233,6 @@

 #include "bli_gemm_opt_d4x4.h"

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_opt_d4x4
--- a/config/mic/bli_kernel.h
+++ b/config/mic/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        8
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #include "bli_gemm_opt_30x8.h"
--- a/config/piledriver/bli_kernel.h
+++ b/config/piledriver/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #include "bli_gemm_4x6.h"
--- a/config/power7/bli_kernel.h
+++ b/config/power7/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 //#define GEMM_UKERNEL         gemm_ref_mxn
--- a/config/reference/bli_kernel.h
+++ b/config/reference/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_ref_mxn
--- a/config/sandybridge/bli_kernel.h
+++ b/config/sandybridge/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -261,10 +233,6 @@

 #include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_opt_8x4_ref_u4_nodupl_avx1
--- a/config/template/bli_kernel.h
+++ b/config/template/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -265,10 +237,6 @@
 #include "bli_gemmtrsm_l_opt_mxn.h"
 #include "bli_gemmtrsm_u_opt_mxn.h"

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_opt_mxn
--- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c
@@ -41,8 +41,7 @@ void bli_sgemmtrsm_l_opt_mxn(
                              float*    restrict alpha,
                              float*    restrict a10,
                              float*    restrict a11,
-                              float*    restrict bd01,
-                              float*    restrict bd11,
+                              float*    restrict b01,
                              float*    restrict b11,
                              float*    restrict c11, inc_t rs_c, inc_t cs_c,
                              float*    restrict a_next,
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_l_opt_mxn(
 	bli_sgemm_opt_mxn( k,
 	                   minus_one,
 	                   a10,
-	                   bd01,
+	                   b01,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_l_opt_mxn(

 	bli_strsm_l_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -77,8 +75,7 @@ void bli_dgemmtrsm_l_opt_mxn(
                              double*   restrict alpha,
                              double*   restrict a10,
                              double*   restrict a11,
-                              double*   restrict bd01,
-                              double*   restrict bd11,
+                              double*   restrict b01,
                              double*   restrict b11,
                              double*   restrict c11, inc_t rs_c, inc_t cs_c,
                              double*   restrict a_next,
@@ -171,19 +168,6 @@ void bli_dgemmtrsm_l_opt_mxn(
                                                                           
                k             MR                                           

-  Thus, with duplication enabled, the operation takes the form of:
-
-    b11  = alpha * b11 - a10 * bd01;
-    b11  = inv(a11) * b11;
-    bd11 = b11;  (skipped if duplication is disabled)
-    c11  = b11;
-                                                                        
-  And if duplication is disabled, the operation reduces to:
-
-    b11 = alpha * b11 - a10 * b01;  (Note: Here, b01 == bd01.)
-    b11 = inv(a11) * b11;
-    c11 = b11;
-
  A note on optimization:
  - This implementation simply calls the gemm micro-kernel and then the
    trsm micro-kernel. Let's assume that the gemm micro-kernel has already
@@ -208,24 +192,20 @@ void bli_dgemmtrsm_l_opt_mxn(

 	double*   restrict minus_one = bli_dm1;

-	/* Reminder: if duplication is disabled, then bd01 == b01, bd11 == b11. */
-
-	/* b11 = alpha * b11 - a10 * bd01; */
+	/* b11 = alpha * b11 - a10 * b01; */
 	bli_dgemm_opt_mxn( k,
 	                   minus_one,
 	                   a10,
-	                   bd01,
+	                   b01,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
 	                   b_next );

-	/* b11  = inv(a11) * b11;
-	   bd11 = b11; (skipped if duplication is disabled)
-	   c11  = b11; */
+	/* b11 = inv(a11) * b11;
+	   c11 = b11; */
 	bli_dtrsm_l_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -236,8 +216,7 @@ void bli_cgemmtrsm_l_opt_mxn(
                              scomplex* restrict alpha,
                              scomplex* restrict a10,
                              scomplex* restrict a11,
-                              scomplex* restrict bd01,
-                              scomplex* restrict bd11,
+                              scomplex* restrict b01,
                              scomplex* restrict b11,
                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              scomplex* restrict a_next,
@@ -253,7 +232,7 @@ void bli_cgemmtrsm_l_opt_mxn(
 	bli_cgemm_opt_mxn( k,
 	                   minus_one,
 	                   a10,
-	                   bd01,
+	                   b01,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -261,7 +240,6 @@ void bli_cgemmtrsm_l_opt_mxn(

 	bli_ctrsm_l_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -272,8 +250,7 @@ void bli_zgemmtrsm_l_opt_mxn(
                              dcomplex* restrict alpha,
                              dcomplex* restrict a10,
                              dcomplex* restrict a11,
-                              dcomplex* restrict bd01,
-                              dcomplex* restrict bd11,
+                              dcomplex* restrict b01,
                              dcomplex* restrict b11,
                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              dcomplex* restrict a_next,
@@ -289,7 +266,7 @@ void bli_zgemmtrsm_l_opt_mxn(
 	bli_zgemm_opt_mxn( k,
 	                   minus_one,
 	                   a10,
-	                   bd01,
+	                   b01,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -297,7 +274,6 @@ void bli_zgemmtrsm_l_opt_mxn(

 	bli_ztrsm_l_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

--- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
+++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a10, \
                           ctype* restrict a11, \
-                           ctype* restrict bd01, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b01, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
+++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c
@@ -41,8 +41,7 @@ void bli_sgemmtrsm_u_opt_mxn(
                              float*    restrict alpha,
                              float*    restrict a12,
                              float*    restrict a11,
-                              float*    restrict bd21,
-                              float*    restrict bd11,
+                              float*    restrict b21,
                              float*    restrict b11,
                              float*    restrict c11, inc_t rs_c, inc_t cs_c,
                              float*    restrict a_next,
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_u_opt_mxn(
 	bli_sgemm_opt_mxn( k,
 	                   minus_one,
 	                   a12,
-	                   bd21,
+	                   b21,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_u_opt_mxn(

 	bli_strsm_u_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -77,8 +75,7 @@ void bli_dgemmtrsm_u_opt_mxn(
                              double*   restrict alpha,
                              double*   restrict a12,
                              double*   restrict a11,
-                              double*   restrict bd21,
-                              double*   restrict bd11,
+                              double*   restrict b21,
                              double*   restrict b11,
                              double*   restrict c11, inc_t rs_c, inc_t cs_c,
                              double*   restrict a_next,
@@ -207,24 +204,20 @@ void bli_dgemmtrsm_u_opt_mxn(

 	double*   restrict minus_one = bli_dm1;

-	/* Reminder: if duplication is disabled, then bd21 == b21, bd11 == b11. */
-
-	/* b11 = alpha * b11 - a12 * bd21; */
+	/* b11 = alpha * b11 - a12 * b21; */
 	bli_dgemm_opt_mxn( k,
 	                   minus_one,
 	                   a12,
-	                   bd21,
+	                   b21,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
 	                   b_next );

-	/* b11  = inv(a11) * b11;
-	   bd11 = b11; (skipped if duplication is disabled)
-	   c11  = b11; */
+	/* b11 = inv(a11) * b11;
+	   c11 = b11; */
 	bli_dtrsm_u_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -235,8 +228,7 @@ void bli_cgemmtrsm_u_opt_mxn(
                              scomplex* restrict alpha,
                              scomplex* restrict a12,
                              scomplex* restrict a11,
-                              scomplex* restrict bd21,
-                              scomplex* restrict bd11,
+                              scomplex* restrict b21,
                              scomplex* restrict b11,
                              scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              scomplex* restrict a_next,
@@ -252,7 +244,7 @@ void bli_cgemmtrsm_u_opt_mxn(
 	bli_cgemm_opt_mxn( k,
 	                   minus_one,
 	                   a12,
-	                   bd21,
+	                   b21,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -260,7 +252,6 @@ void bli_cgemmtrsm_u_opt_mxn(

 	bli_ctrsm_u_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

@@ -271,8 +262,7 @@ void bli_zgemmtrsm_u_opt_mxn(
                              dcomplex* restrict alpha,
                              dcomplex* restrict a12,
                              dcomplex* restrict a11,
-                              dcomplex* restrict bd21,
-                              dcomplex* restrict bd11,
+                              dcomplex* restrict b21,
                              dcomplex* restrict b11,
                              dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                              dcomplex* restrict a_next,
@@ -288,7 +278,7 @@ void bli_zgemmtrsm_u_opt_mxn(
 	bli_zgemm_opt_mxn( k,
 	                   minus_one,
 	                   a12,
-	                   bd21,
+	                   b21,
 	                   alpha,
 	                   b11, rs_b, cs_b,
 	                   a_next,
@@ -296,7 +286,6 @@ void bli_zgemmtrsm_u_opt_mxn(

 	bli_ztrsm_u_opt_mxn( a11,
 	                     b11,
-	                     bd11,
 	                     c11, rs_c, cs_c );
 }

--- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
+++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a12, \
                           ctype* restrict a11, \
-                           ctype* restrict bd21, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b21, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/config/template/kernels/3/bli_trsm_l_opt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.c
@@ -39,14 +39,12 @@
 void bli_strsm_l_opt_mxn(
                          float*    restrict a,
                          float*    restrict b,
-                          float*    restrict bd,
                          float*    restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_strsm_l_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

@@ -55,7 +53,6 @@ void bli_strsm_l_opt_mxn(
 void bli_dtrsm_l_opt_mxn(
                          double*   restrict a,
                          double*   restrict b,
-                          double*   restrict bd,
                          double*   restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
@@ -76,11 +73,6 @@ void bli_dtrsm_l_opt_mxn(
  where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is
  MR x NR.

-  NOTE: Here, this trsm micro-kernel supports element "duplication", a
-  feature that is enabled or disabled in bli_kernel.h. Duplication factors
-  are also defined in the aforementioned header. Duplication is NOT
-  commonly used and most developers may assume it is disabled.
-
  Parameters:

  - a11:    The address of A11, which is the MR x MR lower triangular block
@@ -89,8 +81,6 @@ void bli_dtrsm_l_opt_mxn(
            been inverted and the strictly upper triangle contains zeros.
  - b11:    The address of B11, which is the MR x NR subpartition of the
            current packed (row-stored) micro-panel of B.
-  - bd11:   The address of the duplicated copy of B11. If duplication is
-            disabled, then bd11 == b11.
  - c11:    The address of C11, which is the MR x NR block of the output
            matrix (ie: the matrix provided by the user to the highest-level
            trsm API call). C11 corresponds to the elements that exist in
@@ -110,12 +100,6 @@ void bli_dtrsm_l_opt_mxn(
  - Note that the diagonal of the triangular matrix A11 contains the INVERSE
    of those elements. This is done during packing so that we can avoid
    expensive division instructions within this micro-kernel.
-  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
-    then the result must be written to three places: the sub-block within the
-    duplicated copy of the current micro-panel of B, the sub-block within the
-    current packed micro-panel of B, and the sub-block of the output matrix C.
-    When duplication is not used, the micro-kernel should update only the
-    latter two locations.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.
@@ -191,14 +175,12 @@ void bli_dtrsm_l_opt_mxn(
 void bli_ctrsm_l_opt_mxn(
                          scomplex* restrict a,
                          scomplex* restrict b,
-                          scomplex* restrict bd,
                          scomplex* restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_ctrsm_l_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

@@ -207,14 +189,12 @@ void bli_ctrsm_l_opt_mxn(
 void bli_ztrsm_l_opt_mxn(
                          dcomplex* restrict a,
                          dcomplex* restrict b,
-                          dcomplex* restrict bd,
                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_ztrsm_l_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

--- a/config/template/kernels/3/bli_trsm_l_opt_mxn.h
+++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.h
@@ -42,7 +42,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         );

--- a/config/template/kernels/3/bli_trsm_u_opt_mxn.c
+++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.c
@@ -39,14 +39,12 @@
 void bli_strsm_u_opt_mxn(
                          float*    restrict a,
                          float*    restrict b,
-                          float*    restrict bd,
                          float*    restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_strsm_u_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

@@ -55,7 +53,6 @@ void bli_strsm_u_opt_mxn(
 void bli_dtrsm_u_opt_mxn(
                          double*   restrict a,
                          double*   restrict b,
-                          double*   restrict bd,
                          double*   restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
@@ -76,11 +73,6 @@ void bli_dtrsm_u_opt_mxn(
  where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is
  MR x NR.

-  NOTE: Here, this trsm micro-kernel supports element "duplication", a
-  feature that is enabled or disabled in bli_kernel.h. Duplication factors
-  are also defined in the aforementioned header. Duplication is NOT
-  commonly used and most developers may assume it is disabled.
-
  Parameters:

  - a11:    The address of A11, which is the MR x MR upper triangular block
@@ -89,8 +81,6 @@ void bli_dtrsm_u_opt_mxn(
            been inverted and the strictly lower triangle contains zeros.
  - b11:    The address of B11, which is the MR x NR subpartition of the
            current packed (row-stored) micro-panel of B.
-  - bd11:   The address of the duplicated copy of B11. If duplication is
-            disabled, then bd11 == b11.
  - c11:    The address of C11, which is the MR x NR block of the output
            matrix (ie: the matrix provided by the user to the highest-level
            trsm API call). C11 corresponds to the elements that exist in
@@ -110,12 +100,6 @@ void bli_dtrsm_u_opt_mxn(
  - Note that the diagonal of the triangular matrix A11 contains the INVERSE
    of those elements. This is done during packing so that we can avoid
    expensive division instructions within this micro-kernel.
-  - This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
-    then the result must be written to three places: the sub-block within the
-    duplicated copy of the current micro-panel of B, the sub-block within the
-    current packed micro-panel of B, and the sub-block of the output matrix C.
-    When duplication is not used, the micro-kernel should update only the
-    latter two locations.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.
@@ -191,14 +175,12 @@ void bli_dtrsm_u_opt_mxn(
 void bli_ctrsm_u_opt_mxn(
                          scomplex* restrict a,
                          scomplex* restrict b,
-                          scomplex* restrict bd,
                          scomplex* restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_ctrsm_u_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

@@ -207,14 +189,12 @@ void bli_ctrsm_u_opt_mxn(
 void bli_ztrsm_u_opt_mxn(
                          dcomplex* restrict a,
                          dcomplex* restrict b,
-                          dcomplex* restrict bd,
                          dcomplex* restrict c, inc_t rs_c, inc_t cs_c
                        )
 {
 	/* Just call the reference implementation. */
 	bli_ztrsm_u_ref_mxn( a,
 	                     b,
-	                     bd,
 	                     c, rs_c, cs_c );
 }

--- a/config/template/kernels/3/bli_trsm_u_opt_mxn.h
+++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.h
@@ -42,7 +42,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         );

--- a/frame/1d/axpyd/bli_axpyd.c
+++ b/frame/1d/axpyd/bli_axpyd.c
@@ -53,10 +53,10 @@ void bli_axpyd( obj_t* alpha,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-    bli_obj_init_scalar_copy_of( dt_x,
-                                 BLIS_NO_CONJUGATE,
-                                 alpha,
-                                 &alpha_local );
+	bli_obj_init_scalar_copy_of( dt_x,
+	                             BLIS_NO_CONJUGATE,
+	                             alpha,
+	                             &alpha_local );

 	bli_axpyd_unb_var1( &alpha_local,
 	                    x,
--- a/frame/1d/scal2d/bli_scal2d.c
+++ b/frame/1d/scal2d/bli_scal2d.c
@@ -53,10 +53,10 @@ void bli_scal2d( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-    bli_obj_init_scalar_copy_of( dt_x,
-                                 BLIS_NO_CONJUGATE,
-                                 beta,
-                                 &beta_local );
+	bli_obj_init_scalar_copy_of( dt_x,
+	                             BLIS_NO_CONJUGATE,
+	                             beta,
+	                             &beta_local );

 	bli_scal2d_unb_var1( &beta_local,
 	                     x,
--- a/frame/1d/scald/bli_scald.c
+++ b/frame/1d/scald/bli_scald.c
@@ -52,10 +52,10 @@ void bli_scald( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-    bli_obj_init_scalar_copy_of( dt_x,
-                                 BLIS_NO_CONJUGATE,
-                                 beta,
-                                 &beta_local );
+	bli_obj_init_scalar_copy_of( dt_x,
+	                             BLIS_NO_CONJUGATE,
+	                             beta,
+	                             &beta_local );

 	bli_scald_unb_var1( &beta_local,
 	                    x );
--- a/frame/1d/setd/bli_setd.c
+++ b/frame/1d/setd/bli_setd.c
@@ -52,10 +52,10 @@ void bli_setd( obj_t* beta,
 	dt_x = bli_obj_datatype( *x );

 	// Create an object to hold a copy-cast of alpha.
-    bli_obj_init_scalar_copy_of( dt_x,
-                                 BLIS_NO_CONJUGATE,
-                                 beta,
-                                 &beta_local );
+	bli_obj_init_scalar_copy_of( dt_x,
+	                             BLIS_NO_CONJUGATE,
+	                             beta,
+	                             &beta_local );

 	bli_setd_unb_var1( &beta_local,
 	                   x );
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -143,13 +143,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -160,8 +153,6 @@ void PASTEMAC(ch,varname)( \
 	/* Alias some constants to shorter names. */ \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
@@ -176,7 +167,6 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
-	dim_t           k_nr; \
 	dim_t           m_iter, m_left; \
 	dim_t           n_iter, n_left; \
 	dim_t           i, j; \
@@ -215,9 +205,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_nr = k * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = ps_a; \
@@ -229,12 +216,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -243,11 +224,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -274,7 +250,7 @@ void PASTEMAC(ch,varname)( \
 				PASTEMAC(ch,ukrname)( k, \
 				                      alpha_cast, \
 				                      a1, \
-				                      bp, \
+				                      b1, \
 				                      beta_cast, \
 				                      c11, rs_c, cs_c, \
 				                      a2, b2 ); \
@@ -285,7 +261,7 @@ void PASTEMAC(ch,varname)( \
 				PASTEMAC(ch,ukrname)( k, \
 				                      alpha_cast, \
 				                      a1, \
-				                      bp, \
+				                      b1, \
 				                      zero, \
 				                      ct, rs_ct, cs_ct, \
 				                      a2, b2 ); \
@@ -306,7 +282,7 @@ void PASTEMAC(ch,varname)( \
 	} \
 \
 /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );*/ \
 /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
 }

--- a/frame/3/gemm/bli_gemm_ker_var5.c
+++ b/frame/3/gemm/bli_gemm_ker_var5.c
@@ -143,12 +143,11 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
+	/* Temporary buffer for incremental packing of B. */ \
+	ctype           bp[ PASTEMAC(ch,maxkc) * \
 	                    PASTEMAC(ch,packnr) * \
 	                    PASTEMAC(ch,nifac) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
 \
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
@@ -226,12 +225,9 @@ void PASTEMAC(ch,varname)( \
 	b1 = b_cast; \
 	c1 = c_cast; \
 \
-	/* The current packed micro-panel of B will always be stored in bd. */ \
-	bp = bd; \
-\
-	/* Since we pack micro-panels of B incrementall, one at a time, the
+	/* Since we pack micro-panels of B incrementally, one at a time, the
 	   address of the next micro-panel of B remains constant. */ \
-	b2 = bd; \
+	b2 = bp; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
--- a/frame/3/gemm/other/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/other/bli_gemm_ker_var2.c
@@ -152,13 +152,6 @@ void PASTEMAC(ch,varname)( \
 \
 	guint_t         t_id      = omp_get_thread_num(); \
 	guint_t         n_threads = omp_get_num_threads(); \
-\
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,nr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
 \
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
@@ -170,8 +163,6 @@ void PASTEMAC(ch,varname)( \
 	/* Alias some constants to shorter names. */ \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
@@ -186,7 +177,6 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
-	dim_t           k_nr; \
 	dim_t           m_iter, m_left; \
 	dim_t           n_iter, n_left; \
 	dim_t           i, j; \
@@ -217,9 +207,6 @@ void PASTEMAC(ch,varname)( \
 \
 	m_iter = m / MR; \
 	m_left = m % MR; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_nr = k * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = ps_a; \
@@ -231,12 +218,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = t_id; j < n_iter; j += n_threads ) \
@@ -246,11 +227,6 @@ void PASTEMAC(ch,varname)( \
 \
 		a1  = a_cast; \
 		c11 = c1; \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -272,7 +248,7 @@ void PASTEMAC(ch,varname)( \
 			PASTEMAC(ch,ukrname)( k, \
 			                      alpha_cast, \
 			                      a1, \
-			                      bp, \
+			                      b1, \
 			                      beta_cast, \
 			                      c11, rs_c, cs_c, \
 			                      a2, b2 ); \
@@ -295,7 +271,7 @@ void PASTEMAC(ch,varname)( \
 			PASTEMAC(ch,ukrname)( k, \
 			                      alpha_cast, \
 			                      a1, \
-			                      bp, \
+			                      b1, \
 			                      zero, \
 			                      ct, rs_ct, cs_ct, \
 			                      a2, b2 ); \
@@ -318,11 +294,6 @@ void PASTEMAC(ch,varname)( \
 \
 		a1  = a_cast; \
 		c11 = c1; \
-\
-		/* If duplication is needed, copy the n_left (+ padding) columns
-		   of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -342,7 +313,7 @@ void PASTEMAC(ch,varname)( \
 			PASTEMAC(ch,ukrname)( k, \
 			                      alpha_cast, \
 			                      a1, \
-			                      bp, \
+			                      b1, \
 			                      zero, \
 			                      ct, rs_ct, cs_ct, \
 			                      a2, b2 ); \
@@ -368,7 +339,7 @@ void PASTEMAC(ch,varname)( \
 			PASTEMAC(ch,ukrname)( k, \
 			                      alpha_cast, \
 			                      a1, \
-			                      bp, \
+			                      b1, \
 			                      zero, \
 			                      ct, rs_ct, cs_ct, \
 			                      a2, b2 ); \
@@ -384,7 +355,7 @@ void PASTEMAC(ch,varname)( \
 	} /* end omp parallel */ \
 \
 /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );*/ \
 /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
 }

--- a/frame/3/gemm/other/bli_gemm_ker_var2.c.old
+++ b/frame/3/gemm/other/bli_gemm_ker_var2.c.old
@@ -1,377 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T gemm_fp
-
-typedef void (*FUNCPTR_T)(
-                           dim_t   m,
-                           dim_t   n,
-                           dim_t   k,
-                           void*   alpha,
-                           void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
-                           void*   b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
-                           void*   beta,
-                           void*   c, inc_t rs_c, inc_t cs_c
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
-
-
-void bli_gemm_ker_var2( obj_t*  alpha,
-                        obj_t*  a,
-                        obj_t*  b,
-                        obj_t*  beta,
-                        obj_t*  c,
-                        gemm_t* cntl )
-{
-	num_t     dt_exec   = bli_obj_execution_datatype( *c );
-
-	dim_t     m         = bli_obj_length( *c );
-	dim_t     n         = bli_obj_width( *c );
-	dim_t     k         = bli_obj_width( *a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( *a );
-	inc_t     rs_a      = bli_obj_row_stride( *a );
-	inc_t     cs_a      = bli_obj_col_stride( *a );
-	inc_t     ps_a      = bli_obj_panel_stride( *a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( *b );
-	inc_t     rs_b      = bli_obj_row_stride( *b );
-	inc_t     cs_b      = bli_obj_col_stride( *b );
-	inc_t     ps_b      = bli_obj_panel_stride( *b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( *c );
-	inc_t     rs_c      = bli_obj_row_stride( *c );
-	inc_t     cs_c      = bli_obj_col_stride( *c );
-
-	num_t     dt_alpha;
-	void*     buf_alpha;
-
-	num_t     dt_beta;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-/*
-	// Handle the special case where c and a are complex and b is real.
-	// Note that this is the ONLY case allowed by the inner kernel whereby
-	// the datatypes of a and b differ. In this situation, the execution
-	// datatype is real, so we need to inflate (by a factor of two):
-	//  - the m dimension,
-	//  - the column stride of c,
-	//  - the column stride (ie: the panel length) of a, and
-	//  - the panel stride of a.
-	if ( bli_obj_is_complex( *a ) && bli_obj_is_real( *b ) )
-	{
-		m    *= 2;
-		cs_c *= 2;
-		cs_a *= 2;
-		ps_a *= 2;
-	}
-*/
-
-	// If alpha is a scalar constant, use dt_exec to extract the address of the
-	// corresponding constant value; otherwise, use the datatype encoded
-	// within the alpha object and extract the buffer at the alpha offset.
-	bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha );
-
-	// If beta is a scalar constant, use dt_exec to extract the address of the
-	// corresponding constant value; otherwise, use the datatype encoded
-	// within the beta object and extract the buffer at the beta offset.
-	bli_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, rs_a, cs_a, ps_a,
-	   buf_b, rs_b, cs_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, ukrname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t   m, \
-                           dim_t   n, \
-                           dim_t   k, \
-                           void*   alpha, \
-                           void*   a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
-                           void*   b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
-                           void*   beta, \
-                           void*   c, inc_t rs_c, inc_t cs_c \
-                         ) \
-{ \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,nr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
-	/* Temporary C buffer for edge cases. */ \
-	ctype           ct[ PASTEMAC(ch,mr) * \
-	                    PASTEMAC(ch,nr) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const inc_t     rs_ct      = 1; \
-	const inc_t     cs_ct      = PASTEMAC(ch,mr); \
-\
-	/* Alias some constants to shorter names. */ \
-	const dim_t     MR         = PASTEMAC(ch,mr); \
-	const dim_t     NR         = PASTEMAC(ch,nr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict a1; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-	ctype* restrict c11; \
-	ctype* restrict a2; \
-	ctype* restrict b2; \
-\
-	dim_t           k_nr; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           i, j; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == GEMM_MR
-	     ps_a == stride to next row panel of A
-	     rs_b == GEMM_NR
-	     cs_b == 1
-	     ps_b == stride to next column panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_nr = k * NR; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	b1 = b_cast; \
-	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = 0; j < n_iter; ++j ) \
-	{ \
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = 0; i < m_iter; ++i ) \
-		{ \
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = a1 + rstep_a; \
-			if ( i == m_iter - 1 && m_left == 0 ) \
-			{ \
-				a2 = a_cast; \
-				b2 = b1 + cstep_b; \
-				if ( j == n_iter - 1 && n_left == 0 ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Invoke the gemm micro-kernel. */ \
-			PASTEMAC(ch,ukrname)( k, \
-			                      alpha_cast, \
-			                      a1, \
-			                      bp, \
-			                      beta_cast, \
-			                      c11, rs_c, cs_c, \
-			                      a2, b2 ); \
-\
-			a1  += rstep_a; \
-			c11 += rstep_c; \
-		} \
-\
-		/* Bottom edge handling. */ \
-		if ( m_left ) \
-		{ \
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = a_cast; \
-			b2 = b1 + cstep_b; \
-			if ( j == n_iter - 1 && n_left == 0 ) \
-				b2 = b_cast; \
-\
-\
-			/* Invoke the gemm micro-kernel. */ \
-			PASTEMAC(ch,ukrname)( k, \
-			                      alpha_cast, \
-			                      a1, \
-			                      bp, \
-			                      zero, \
-			                      ct, rs_ct, cs_ct, \
-			                      a2, b2 ); \
-\
-			/* Scale the bottom edge of C and add the result from above. */ \
-			PASTEMAC(ch,xpbys_mxn)( m_left, NR, \
-			                        ct,  rs_ct, cs_ct, \
-			                        beta_cast, \
-			                        c11, rs_c,  cs_c ); \
-		} \
-\
-		b1 += cstep_b; \
-		c1 += cstep_c; \
-	} \
-\
-	if ( n_left ) \
-	{ \
-		a1  = a_cast; \
-		c11 = c1; \
-\
-		/* If duplication is needed, copy the n_left (+ padding) columns
-		   of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Right edge loop over the m dimension (MR rows at a time). */ \
-		for ( i = 0; i < m_iter; ++i ) \
-		{ \
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = a1 + rstep_a; \
-			if ( i == m_iter - 1 && m_left == 0 ) \
-			{ \
-				a2 = a_cast; \
-				b2 = b_cast; \
-			} \
-\
-			/* Invoke the gemm micro-kernel. */ \
-			PASTEMAC(ch,ukrname)( k, \
-			                      alpha_cast, \
-			                      a1, \
-			                      bp, \
-			                      zero, \
-			                      ct, rs_ct, cs_ct, \
-			                      a2, b2 ); \
-\
-			/* Scale the right edge of C and add the result from above. */ \
-			PASTEMAC(ch,xpbys_mxn)( MR, n_left, \
-			                        ct,  rs_ct, cs_ct, \
-			                        beta_cast, \
-			                        c11, rs_c,  cs_c ); \
-\
-			a1  += rstep_a; \
-			c11 += rstep_c; \
-		} \
-\
-		/* Bottom-right corner handling. */ \
-		if ( m_left ) \
-		{ \
-			/* Compute the address of the next panel of A. */ \
-			a2 = a_cast; \
-			b2 = b_cast; \
-\
-			/* Invoke the gemm micro-kernel. */ \
-			PASTEMAC(ch,ukrname)( k, \
-			                      alpha_cast, \
-			                      a1, \
-			                      bp, \
-			                      zero, \
-			                      ct, rs_ct, cs_ct, \
-			                      a2, b2 ); \
-\
-			/* Scale the bottom-right corner of C and add the result from above. */ \
-			PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \
-			                        ct,  rs_ct, cs_ct, \
-			                        beta_cast, \
-			                        c11, rs_c,  cs_c ); \
-		} \
-	} \
-\
-/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
-}
-
-INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL )
-
--- a/frame/3/herk/bli_herk_l_ker_var2.c
+++ b/frame/3/herk/bli_herk_l_ker_var2.c
@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
 	/* Alias some constants to shorter names. */ \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
-	const bool_t    NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b2; \
 \
 	doff_t          diagoffc_ij; \
-	dim_t           k_nr; \
 	dim_t           m_iter, m_left; \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_nr = k * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = ps_a; \
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
 				PASTEMAC(ch,ukrname)( k, \
 				                      alpha_cast, \
 				                      a1, \
-				                      bp, \
+				                      b1, \
 				                      zero, \
 				                      ct, rs_ct, cs_ct, \
 				                      a2, b2 ); \
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
--- a/frame/3/herk/bli_herk_u_ker_var2.c
+++ b/frame/3/herk/bli_herk_u_ker_var2.c
@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
 	/* Alias some constants to shorter names. */ \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
-	const bool_t    NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict a_cast     = a; \
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b2; \
 \
 	doff_t          diagoffc_ij; \
-	dim_t           k_nr; \
 	dim_t           m_iter, m_left; \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_nr = k * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = ps_a; \
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
 				PASTEMAC(ch,ukrname)( k, \
 				                      alpha_cast, \
 				                      a1, \
-				                      bp, \
+				                      b1, \
 				                      zero, \
 				                      ct, rs_ct, cs_ct, \
 				                      a2, b2 ); \
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
--- a/frame/3/trmm/bli_trmm_ll_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b1; \
 	ctype* restrict c1; \
 	ctype* restrict c11; \
-	ctype* restrict bp_i; \
+	ctype* restrict b1_i; \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_a1011; \
 	dim_t           off_a1011; \
 	dim_t           i, j; \
@@ -232,10 +222,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_a1011 = bli_min( k, diagoffa + m ); \
-	k_nr    = k_a1011 * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = k * PACKMR; \
@@ -247,12 +233,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -261,11 +241,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -285,11 +260,11 @@ void PASTEMAC(ch,varname)( \
 			{ \
 				/* Determine the offset to and length of the panel that was
 				   packed so we can index into the corresponding location in
-				   bp. */ \
+				   b1. */ \
 				off_a1011 = 0; \
 				k_a1011   = bli_min( k, diagoffa_i + MR ); \
 \
-				bp_i = bp + off_a1011 * NR * NDUP; \
+				b1_i = b1 + off_a1011 * NR; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + k_a1011 * PACKMR; \
@@ -308,7 +283,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_a1011, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp_i, \
+					                      b1_i, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -324,7 +299,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_a1011, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp_i, \
+					                      b1_i, \
 					                      beta_cast, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -356,7 +331,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      one, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -367,7 +342,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
--- a/frame/3/trmm/bli_trmm_lu_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     MR         = PASTEMAC(ch,mr); \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b1; \
 	ctype* restrict c1; \
 	ctype* restrict c11; \
-	ctype* restrict bp_i; \
+	ctype* restrict b1_i; \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_a1112; \
 	dim_t           off_a1112; \
 	dim_t           i, j; \
@@ -240,10 +230,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_a1112 = k; \
-	k_nr    = k_a1112 * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = k * PACKMR; \
@@ -255,12 +241,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -269,11 +249,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -293,11 +268,11 @@ void PASTEMAC(ch,varname)( \
 			{ \
 				/* Determine the offset to and length of the panel that was
 				   packed so we can index into the corresponding location in
-				   bp. */ \
+				   b1. */ \
 				off_a1112 = bli_max( diagoffa_i, 0 ); \
 				k_a1112   = k - off_a1112; \
 \
-				bp_i = bp + off_a1112 * NR * NDUP; \
+				b1_i = b1 + off_a1112 * NR; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + k_a1112 * PACKMR; \
@@ -316,7 +291,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_a1112, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp_i, \
+					                      b1_i, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -332,7 +307,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_a1112, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp_i, \
+					                      b1_i, \
 					                      beta_cast, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -364,7 +339,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      one, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -375,7 +350,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -397,7 +372,7 @@ void PASTEMAC(ch,varname)( \
 	} \
 \
 /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

 INSERT_GENTFUNC_BASIC( trmm_lu_ker_var2, GEMM_UKERNEL )
--- a/frame/3/trmm/bli_trmm_rl_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
 	const dim_t     PACKNR     = PASTEMAC(ch,packnr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_b1121; \
 	dim_t           off_b1121; \
 	dim_t           i, j; \
@@ -252,12 +242,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
 		   in A. Then compute the length of that panel. */ \
 		off_b1121 = bli_max( -diagoffb_j, 0 ); \
 		k_b1121   = k - off_b1121; \
-		k_nr      = k_b1121 * NR; \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_b1121, \
 					                      alpha_cast, \
 					                      a1_i, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_b1121, \
 					                      alpha_cast, \
 					                      a1_i, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      one, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
 	} \
 \
 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

 INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, GEMM_UKERNEL )
--- a/frame/3/trmm/bli_trmm_ru_ker_var2.c
+++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
 	const dim_t     PACKNR     = PASTEMAC(ch,packnr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict one        = PASTEMAC(ch,1); \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_b0111; \
 	dim_t           off_b0111; \
 	dim_t           i, j; \
@@ -253,12 +243,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
 		   so we can index into the corresponding location in A. */ \
 		off_b0111 = 0; \
 		k_b0111   = bli_min( k, -diagoffb_j + NR ); \
-		k_nr      = k_b0111 * NR; \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_b0111, \
 					                      alpha_cast, \
 					                      a1_i, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k_b0111, \
 					                      alpha_cast, \
 					                      a1_i, \
-					                      bp, \
+					                      b1, \
 					                      beta_cast, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      one, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,ukrname)( k, \
 					                      alpha_cast, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
 	} \
 \
 /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
-/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
+/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 }

 INSERT_GENTFUNC_BASIC( trmm_ru_ker_var2, GEMM_UKERNEL )
--- a/frame/3/trsm/bli_trsm.c
+++ b/frame/3/trsm/bli_trsm.c
@@ -140,12 +140,6 @@ void bli_trsm( side_t  side,
 	                             alpha,
 	                             &alpha_local );

-	//
-	// NOTE: we need to disable the use of the right-hand side control tree
-	// if duplication is enabled since the trsm_r macrokernels do not support
-	// duplication.
-	//
-
 	// Choose the control tree.
 	if ( bli_is_left( side ) ) cntl = trsm_l_cntl;
 	else                       cntl = trsm_r_cntl;
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
 	const dim_t     PACKNR     = PASTEMAC(ch,packnr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict minus_one  = PASTEMAC(ch,m1); \
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b1; \
 	ctype* restrict c1; \
 	ctype* restrict c11; \
-	ctype* restrict b11; \
 	ctype* restrict a10; \
 	ctype* restrict a11; \
-	ctype* restrict bp01; \
-	ctype* restrict bp11; \
+	ctype* restrict b01; \
+	ctype* restrict b11; \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_a1011; \
 	dim_t           k_a10; \
 	dim_t           off_a10; \
@@ -237,10 +226,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_a1011 = bli_min( k, diagoffa + m ); \
-	k_nr    = k_a1011 * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = k * PACKMR; \
@@ -252,12 +237,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -266,11 +245,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1 + (0  )*rstep_c; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -298,16 +272,13 @@ void PASTEMAC(ch,varname)( \
 \
 				/* Compute the addresses of the panel A10 and the triangular
 				   block A11. */ \
-				a10  = a1; \
-				a11  = a1 + k_a10 * PACKMR; \
+				a10 = a1; \
+				a11 = a1 + k_a10 * PACKMR; \
 \
-				/* Now compute the corresponding addresses in Bd. */ \
-				bp01 = bp + off_a10 * NR * NDUP; \
-				bp11 = bp + off_a11 * NR * NDUP; \
-\
-				/* Index into b1 to locate the MR x NR block of b1 that will
-				   be updated by the trsm subproblem. */ \
-				b11  = b1 + off_a11 * PACKNR; \
+				/* Compute the addresses of the panel B01 and the block
+				   B11. */ \
+				b01 = b1 + off_a10 * PACKNR; \
+				b11 = b1 + off_a11 * PACKNR; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + k_a1011 * PACKMR; \
@@ -327,8 +298,7 @@ void PASTEMAC(ch,varname)( \
 					                          alpha_cast, \
 					                          a10, \
 					                          a11, \
-					                          bp01, \
-					                          bp11, \
+					                          b01, \
 					                          b11, \
 					                          c11, rs_c, cs_c, \
 					                          a2, b2 ); \
@@ -340,8 +310,7 @@ void PASTEMAC(ch,varname)( \
 					                          alpha_cast, \
 					                          a10, \
 					                          a11, \
-					                          bp01, \
-					                          bp11, \
+					                          b01, \
 					                          b11, \
 					                          ct, rs_ct, cs_ct, \
 					                          a2, b2 ); \
@@ -373,7 +342,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,gemmukr)( k, \
 					                      minus_one, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      alpha_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,gemmukr)( k, \
 					                      minus_one, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
-	/* Temporary buffer for duplicating elements of B. */ \
-	ctype           bd[ PASTEMAC(ch,maxkc) * \
-	                    PASTEMAC(ch,packnr) * \
-	                    PASTEMAC(ch,ndup) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	ctype* restrict bp; \
-\
 	/* Temporary C buffer for edge cases. */ \
 	ctype           ct[ PASTEMAC(ch,mr) * \
 	                    PASTEMAC(ch,nr) ] \
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
 	const dim_t     NR         = PASTEMAC(ch,nr); \
 	const dim_t     PACKMR     = PASTEMAC(ch,packmr); \
 	const dim_t     PACKNR     = PASTEMAC(ch,packnr); \
-	const dim_t     NDUP       = PASTEMAC(ch,ndup); \
-	const bool_t    DUPB       = NDUP != 1; \
 \
 	ctype* restrict zero       = PASTEMAC(ch,0); \
 	ctype* restrict minus_one  = PASTEMAC(ch,m1); \
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
 	ctype* restrict b1; \
 	ctype* restrict c1; \
 	ctype* restrict c11; \
-	ctype* restrict b11; \
 	ctype* restrict a12; \
 	ctype* restrict a11; \
-	ctype* restrict bp21; \
-	ctype* restrict bp11; \
+	ctype* restrict b21; \
+	ctype* restrict b11; \
 	ctype* restrict a2; \
 	ctype* restrict b2; \
 \
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
 	dim_t           n_iter, n_left; \
 	dim_t           m_cur; \
 	dim_t           n_cur; \
-	dim_t           k_nr; \
 	dim_t           k_a1112; \
 	dim_t           k_a11; \
 	dim_t           k_a12; \
@@ -246,10 +235,6 @@ void PASTEMAC(ch,varname)( \
 \
 	if ( n_left ) ++n_iter; \
 	if ( m_left ) ++m_iter; \
-\
-	/* Compute the number of elements in B to duplicate per iteration. */ \
-	k_a1112 = k; \
-	k_nr    = k_a1112 * NR; \
 \
 	/* Determine some increments used to step through A, B, and C. */ \
 	rstep_a = k * PACKMR; \
@@ -261,12 +246,6 @@ void PASTEMAC(ch,varname)( \
 \
 	b1 = b_cast; \
 	c1 = c_cast; \
-\
-	/* If the micro-kernel needs elements of B duplicated, set bp to
-	   point to the duplication buffer. If no duplication is called for,
-	   bp will be set to the current column panel of B for each iteration
-	   of the outer loop below. */ \
-	if ( DUPB ) bp = bd; \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = 0; j < n_iter; ++j ) \
@@ -275,11 +254,6 @@ void PASTEMAC(ch,varname)( \
 		c11 = c1 + (m_iter-1)*rstep_c; \
 \
 		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* If duplication is needed, copy the current iteration's NR
-		   columns of B to a local buffer with each value duplicated. */ \
-		if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
-		else        bp = b1; \
 \
 		/* Initialize our next panel of B to be the current panel of B. */ \
 		b2 = b1; \
@@ -309,16 +283,13 @@ void PASTEMAC(ch,varname)( \
 \
 				/* Compute the addresses of the triangular block A11 and the
 				   panel A12. */ \
-				a11  = a1; \
-				a12  = a1 + k_a11 * PACKMR; \
+				a11 = a1; \
+				a12 = a1 + k_a11 * PACKMR; \
 \
-				/* Now compute the corresponding addresses in Bd. */ \
-				bp11 = bp + off_a11 * NR * NDUP; \
-				bp21 = bp + off_a12 * NR * NDUP; \
-\
-				/* Index into b1 to locate the MR x NR block of b1 that will be
-				   updated by the trsm subproblem. */ \
-				b11  = b1 + off_a11 * PACKNR; \
+				/* Compute the addresses of the block B11 and the panel
+				   B21. */ \
+				b11 = b1 + off_a11 * PACKNR; \
+				b21 = b1 + off_a12 * PACKNR; \
 \
 				/* Compute the addresses of the next panels of A and B. */ \
 				a2 = a1 + k_a1112 * PACKMR; \
@@ -338,8 +309,7 @@ void PASTEMAC(ch,varname)( \
 					                          alpha_cast, \
 					                          a12, \
 					                          a11, \
-					                          bp21, \
-					                          bp11, \
+					                          b21, \
 					                          b11, \
 					                          c11, rs_c, cs_c, \
 					                          a2, b2 ); \
@@ -351,8 +321,7 @@ void PASTEMAC(ch,varname)( \
 					                          alpha_cast, \
 					                          a12, \
 					                          a11, \
-					                          bp21, \
-					                          bp11, \
+					                          b21, \
 					                          b11, \
 					                          ct, rs_ct, cs_ct, \
 					                          a2, b2 ); \
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,gemmukr)( k, \
 					                      minus_one, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      alpha_cast, \
 					                      c11, rs_c, cs_c, \
 					                      a2, b2 ); \
@@ -395,7 +364,7 @@ void PASTEMAC(ch,varname)( \
 					PASTEMAC(ch,gemmukr)( k, \
 					                      minus_one, \
 					                      a1, \
-					                      bp, \
+					                      b1, \
 					                      zero, \
 					                      ct, rs_ct, cs_ct, \
 					                      a2, b2 ); \
@@ -433,7 +402,7 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
 */ \
 \
 /*
-PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
 PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
 PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
 */ \
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -314,7 +314,6 @@ void PASTEMAC(ch,varname)( \
 					                          b11, \
 					                          a12, \
 					                          a11, \
-					                          a11, \
 					                          c11, cs_c, rs_c, \
 					                          b2, a2 ); \
 				} \
@@ -327,7 +326,6 @@ void PASTEMAC(ch,varname)( \
 					                          b11, \
 					                          a12, \
 					                          a11, \
-					                          a11, \
 					                          ct, cs_ct, rs_ct, \
 					                          b2, a2 ); \
 \
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -308,7 +308,6 @@ void PASTEMAC(ch,varname)( \
 					                          b11, \
 					                          a10, \
 					                          a11, \
-					                          a11, \
 					                          c11, cs_c, rs_c, \
 					                          b2, a2 ); \
 				} \
@@ -321,7 +320,6 @@ void PASTEMAC(ch,varname)( \
 					                          b11, \
 					                          a10, \
 					                          a11, \
-					                          a11, \
 					                          ct, cs_ct, rs_ct, \
 					                          b2, a2 ); \
 \
@@ -390,18 +388,6 @@ void PASTEMAC(ch,varname)( \
 		b1 += k_b0111 * PACKNR; \
 		c1 += cstep_c; \
 	} \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" );  \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" );  \
-*/ \
-\
-/*
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
-*/ \
 }

 INSERT_GENTFUNC_BASIC2( trsm_ru_ker_var2, GEMMTRSM_L_UKERNEL, GEMM_UKERNEL )
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a10, \
                           ctype* restrict a11, \
-                           ctype* restrict bd01, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b01, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
@@ -56,22 +55,20 @@ void PASTEMAC(ch,varname)( \
 \
 	ctype* restrict minus_one = PASTEMAC(ch,m1); \
 \
-	/* b11 = alpha * b11 - a10 * bd01; */ \
+	/* b11 = alpha * b11 - a10 * b01; */ \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
 	                      a10, \
-	                      bd01, \
+	                      b01, \
 	                      alpha, \
 	                      b11, rs_b, cs_b, \
 	                      a_next, \
 	                      b_next ); \
 \
-	/* b11  = inv(a11) * b11;
-	   bd11 = b11; (skipped if duplication is disabled)
-	   c11  = b11; */ \
+	/* b11 = inv(a11) * b11;
+	   c11 = b11; */ \
 	PASTEMAC(ch,trsmukr)( a11, \
 	                      b11, \
-	                      bd11, \
 	                      c11, rs_c, cs_c ); \
 }

--- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a10, \
                           ctype* restrict a11, \
-                           ctype* restrict bd01, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b01, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a12, \
                           ctype* restrict a11, \
-                           ctype* restrict bd21, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b21, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
@@ -60,18 +59,16 @@ void PASTEMAC(ch,varname)( \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
 	                      a12, \
-	                      bd21, \
+	                      b21, \
 	                      alpha, \
 	                      b11, rs_b, cs_b, \
 	                      a_next, \
 	                      b_next ); \
 \
-	/* b11  = inv(a11) * b11;
-	   bd11 = b11; (skipped if duplication is disabled)
-	   c11  = b11; */ \
+	/* b11 = inv(a11) * b11;
+	   c11 = b11; */ \
 	PASTEMAC(ch,trsmukr)( a11, \
 	                      b11, \
-	                      bd11, \
 	                      c11, rs_c, cs_c ); \
 }

--- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a12, \
                           ctype* restrict a11, \
-                           ctype* restrict bd21, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b21, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c
@@ -41,7 +41,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         ) \
 { \
--- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h
@@ -42,7 +42,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         );

--- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c
+++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c
@@ -41,7 +41,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         ) \
 { \
--- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h
+++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h
@@ -42,7 +42,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c  \
                         );

--- a/frame/include/bli_blas_macro_defs.h
+++ b/frame/include/bli_blas_macro_defs.h
@@ -53,17 +53,17 @@
 	if ( incx_blas < 0 ) \
 	{ \
 		/* The semantics of negative stride in BLAS are that the vector
-		operand be traversed in reverse order. (Another way to think of
-		this is that negative strides effectively reverse the order of
-		the vector, but without any explicit data movements.) This is
-		also how BLIS interprets negative strides. The differences is
-		that with BLAS, the caller *always* passes in the 0th (i.e.,
-		top-most or left-most) element of the vector, even when the
-		stride is negative. By contrast, in BLIS, negative strides are
-		used *relative* to the vector address as it is given. Thus, in
-		BLIS, if this backwards traversal is desired, the caller *must*
-		pass in the address to the (n-1)th (i.e., the bottom-most or
-		right-most) element along with a negative stride. */ \
+		   operand be traversed in reverse order. (Another way to think
+		   of this is that negative strides effectively reverse the order
+		   of the vector, but without any explicit data movements.) This
+		   is also how BLIS interprets negative strides. The difference
+		   is that with BLAS, the caller *always* passes in the 0th (i.e.,
+		   top-most or left-most) element of the vector, even when the
+		   stride is negative. By contrast, in BLIS, negative strides are
+		   used *relative* to the vector address as it is given. Thus, in
+		   BLIS, if this backwards traversal is desired, the caller *must*
+		   pass in the address to the (n-1)th (i.e., the bottom-most or
+		   right-most) element along with a negative stride. */ \
 		x_blis    = (x_blas) + (n-1)*(-incx_blas); \
 		incx_blis = ( inc_t )(incx_blas); \
 	} \
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -234,13 +234,6 @@
 #define bli_zpackkr  BLIS_PACKDIM_KR_Z
 #define bli_zpacknr  BLIS_PACKDIM_NR_Z

-// Duplication factors
-
-#define bli_sndup    BLIS_DEFAULT_NUM_DUPL_S
-#define bli_dndup    BLIS_DEFAULT_NUM_DUPL_D
-#define bli_cndup    BLIS_DEFAULT_NUM_DUPL_C
-#define bli_zndup    BLIS_DEFAULT_NUM_DUPL_Z
-
 // Incremental packing factors

 #define bli_snifac   BLIS_DEFAULT_NI_FAC
--- a/frame/include/blis.h
+++ b/frame/include/blis.h
@@ -189,11 +189,6 @@ extern "C" {
 #include "bli_trsv.h"


-// -- Helper operands for ukernels --
-
-#include "bli_dupl.h"
-
-
 // -- Level-3 operations --

 #include "bli_gemm.h"
--- a/frame/util/dupl/bli_dupl.c
+++ b/frame/util/dupl/bli_dupl.c
@@ -1,73 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-//
-// Define object-based interface.
-//
-#undef  GENFRONT
-#define GENFRONT( opname, varname ) \
-\
-void PASTEMAC0(opname)( \
-                        obj_t* b, \
-                        obj_t* bd \
-                      ) \
-{ \
-    PASTEMAC0(varname)( b, \
-                        bd ); \
-}
-
-GENFRONT( dupl, DUPL_KERNEL )
-
-
-//
-// Define BLAS-like interfaces.
-//
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,opname)( \
-                          dim_t  k, \
-                          ctype* b, \
-                          ctype* bd \
-                        ) \
-{ \
-	PASTEMAC(ch,varname)( k, \
-	                      b, \
-	                      bd ); \
-}
-
-INSERT_GENTFUNC_BASIC( dupl, DUPL_KERNEL )
-
--- a/frame/util/dupl/bli_dupl.h
+++ b/frame/util/dupl/bli_dupl.h
@@ -1,58 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_dupl_unb_var1.h"
-
-
-//
-// Prototype object-based interface.
-//
-void bli_dupl( obj_t* b,
-               obj_t* bd );
- 
-
-//
-// Prototype BLAS-like interfaces.
-//
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, opname ) \
-\
-void PASTEMAC(ch,opname)( \
-                          dim_t  k, \
-                          ctype* b, \
-                          ctype* bd \
-                        );
-
-INSERT_GENTPROT_BASIC( dupl )
-
--- a/frame/util/dupl/bli_dupl_unb_var1.c
+++ b/frame/util/dupl/bli_dupl_unb_var1.c
@@ -1,108 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T dupl_fp
-
-typedef void (*FUNCPTR_T)(
-                           dim_t   k,
-                           void*   b,
-                           void*   bd
-                         );
-
-static FUNCPTR_T GENARRAY(ftypes,dupl_unb_var1);
-
-
-void bli_dupl_unb_var1( obj_t* b,
-                        obj_t* bd )
-{
-	num_t     dt_b      = bli_obj_datatype( *b );
-
-	dim_t     k;
-
-	void*     buf_b     = bli_obj_buffer_at_off( *b );
-
-	void*     buf_bd    = bli_obj_buffer_at_off( *bd );
-
-	FUNCPTR_T f;
-
-	// The k dimension is the one that is "perpendicular" to the
-	// storage dimension. 
-	if ( bli_obj_is_row_stored( *b ) ) k = bli_obj_length( *b );
-	else                               k = bli_obj_width( *b );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_b];
-
-	// Invoke the function.
-	f( k,
-	   buf_b,
-	   buf_bd );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname, kername ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t  n, \
-                           void*  b, \
-                           void*  bd \
-                         ) \
-{ \
-	ctype*      b_cast  = b; \
-	ctype*      bd_cast = bd; \
-\
-	const dim_t NDUP    = PASTEMAC(ch,ndup); \
-	const dim_t NR      = PASTEMAC(ch,nr); \
-	const dim_t PACKNR  = PASTEMAC(ch,packnr); \
-\
-	dim_t       i, j, el, d; \
-\
-	for ( el = 0; el < n; ++el ) \
-	{ \
-		i = el / NR; \
-		j = el % NR; \
-\
-		for ( d = 0; d < NDUP; ++d ) \
-		{ \
-			*(bd_cast + el*NDUP + d) = *(b_cast + i*PACKNR + j); \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC( dupl_unb_var1, dupl_unb_var1 )
-
--- a/frame/util/dupl/bli_dupl_unb_var1.h
+++ b/frame/util/dupl/bli_dupl_unb_var1.h
@@ -1,56 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-//
-// Prototype object-based interface.
-//
-void bli_dupl_unb_var1( obj_t* b,
-                        obj_t* bd );
-
-
-//
-// Prototype BLAS-like interfaces.
-//
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t  n, \
-                           void*  b, \
-                           void*  bd \
-                         );
-
-INSERT_GENTPROT_BASIC( dupl_unb_var1 )
-
--- a/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.c
+++ b/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.c
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict aL, \
                           ctype* restrict a, \
-                           ctype* restrict bdT, \
-                           ctype* restrict bd, \
+                           ctype* restrict bT, \
                           ctype* restrict b, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
@@ -59,7 +58,7 @@ void PASTEMAC(ch,varname)( \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
 	                      aL, \
-	                      bdT, \
+	                      bT, \
 	                      alpha, \
 	                      b, rs_b, cs_b, \
 	                      a_next, \
@@ -67,7 +66,6 @@ void PASTEMAC(ch,varname)( \
 \
 	PASTEMAC(ch,trsmukr)( a, \
 	                      b, \
-	                      bd, \
 	                      c, rs_c, cs_c ); \
 }

--- a/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.h
+++ b/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.h
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict aL, \
                           ctype* restrict a, \
-                           ctype* restrict bdT, \
-                           ctype* restrict bd, \
+                           ctype* restrict bT, \
                           ctype* restrict b, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.c
+++ b/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.c
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict aR, \
                           ctype* restrict a, \
-                           ctype* restrict bdB, \
-                           ctype* restrict bd, \
+                           ctype* restrict bB, \
                           ctype* restrict b, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
@@ -59,14 +58,13 @@ void PASTEMAC(ch,varname)( \
 	PASTEMAC(ch,gemmukr)( k, \
 	                      minus_one, \
 	                      aR, \
-	                      bdB, \
+	                      bB, \
 	                      alpha, \
 	                      b, rs_b, cs_b, \
 	                      a_next, b_next ); \
 \
 	PASTEMAC(ch,trsmukr)( a, \
 	                      b, \
-	                      bd, \
 	                      c, rs_c, cs_c ); \
 }

--- a/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.h
+++ b/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.h
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict aR, \
                           ctype* restrict a, \
-                           ctype* restrict bdB, \
-                           ctype* restrict bd, \
+                           ctype* restrict bB, \
                           ctype* restrict b, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/kernels/c99/3/bli_trsm_l_ref_4x4.c
+++ b/kernels/c99/3/bli_trsm_l_ref_4x4.c
@@ -41,7 +41,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
--- a/kernels/c99/3/bli_trsm_l_ref_4x4.h
+++ b/kernels/c99/3/bli_trsm_l_ref_4x4.h
@@ -39,7 +39,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c \
                         );

--- a/kernels/c99/3/bli_trsm_u_ref_4x4.c
+++ b/kernels/c99/3/bli_trsm_u_ref_4x4.c
@@ -41,7 +41,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
--- a/kernels/c99/3/bli_trsm_u_ref_4x4.h
+++ b/kernels/c99/3/bli_trsm_u_ref_4x4.h
@@ -39,7 +39,6 @@
 void PASTEMAC(ch,varname)( \
                           ctype* restrict a, \
                           ctype* restrict b, \
-                           ctype* restrict bd, \
                           ctype* restrict c, inc_t rs_c, inc_t cs_c \
                         );

--- a/kernels/x86/3/bli_dupl_opt_var1.c
+++ b/kernels/x86/3/bli_dupl_opt_var1.c
@@ -1,152 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_sdupl_opt_var1(
-                         dim_t     n_elem,
-                         float*    b,
-                         float*    bd
-                       )
-{
-	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-}
-
-void bli_ddupl_opt_var1(
-                         dim_t     n_elem,
-                         double*   b,
-                         double*   bd
-                       )
-{
-	dim_t n_iter = n_elem / 8;
-	dim_t n_left = n_elem % 8;
-
-	__asm__ volatile
-	(
-		"                                \n\t"
-		"movl     %2, %%eax              \n\t" // load address of b.
-		"movl     %3, %%ebx              \n\t" // load address of bd.
-		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
-		"movl      %0, %%esi             \n\t" // i = n_iter;
-		"testl  %%esi, %%esi             \n\t" // check n_iter via logical AND.
-		"je     .CONSIDERNLEFT           \n\t" // if i == 0, jump to code that
-		"                                \n\t" // contains the n_left loop.
-		"                                \n\t"
-		"                                \n\t"
-		".LOOPNITER:                     \n\t" // MAIN LOOP
-		"                                \n\t"
-		"movddup  0 * 8(%%eax), %%xmm0   \n\t"
-		"movddup  1 * 8(%%eax), %%xmm1   \n\t"
-		"movddup  2 * 8(%%eax), %%xmm2   \n\t"
-		"movddup  3 * 8(%%eax), %%xmm3   \n\t"
-		"movddup  4 * 8(%%eax), %%xmm4   \n\t"
-		"movddup  5 * 8(%%eax), %%xmm5   \n\t"
-		"movddup  6 * 8(%%eax), %%xmm6   \n\t"
-		"movddup  7 * 8(%%eax), %%xmm7   \n\t"
-		"addl     $64, %%eax             \n\t" // b += 8;
-		"                                \n\t"
-		"movapd   %%xmm0, 0 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm1, 1 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm2, 2 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm3, 3 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm4, 4 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm5, 5 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm6, 6 * 16(%%ebx)  \n\t"
-		"movapd   %%xmm7, 7 * 16(%%ebx)  \n\t"
-		"addl    $128, %%ebx             \n\t" // bd += 16;
-		"                                \n\t"
-		"decl   %%esi                    \n\t" // i -= 1;
-		"jne    .LOOPNITER               \n\t" // iterate again if i != 0.
-		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
-		".CONSIDERNLEFT:                 \n\t"
-		"                                \n\t"
-		"movl      %1, %%esi             \n\t" // i = n_left;
-		"testl  %%esi, %%esi             \n\t" // check n_left via logical AND.
-		"je     .DONE                    \n\t" // if i == 0, we're done; jump to end.
-		"                                \n\t" // else, we prepare to enter n_left loop.
-		"                                \n\t"
-		"                                \n\t"
-		".LOOPNLEFT:                     \n\t" // EDGE LOOP
-		"                                \n\t"
-		"movddup  0 * 8(%%eax), %%xmm0   \n\t"
-		"addl      $8, %%eax             \n\t" // b += 1;
-		"                                \n\t"
-		"movapd   %%xmm0, 0 * 16(%%ebx)  \n\t"
-		"addl     $16, %%ebx             \n\t" // bd += 2;
-		"                                \n\t"
-		"decl   %%esi                    \n\t" // i -= 1;
-		"jne    .LOOPNLEFT               \n\t" // iterate again if i != 0.
-		"                                \n\t"
-		"                                \n\t"
-		"                                \n\t"
-		".DONE:                          \n\t"
-		"                                \n\t"
-
-		: // output operands (none)
-		: // input operands
-		  "r" (n_iter),
-		  "r" (n_left),
-		  "m" (b),
-		  "m" (bd)
-		: // register clobber list
-		  "eax", "ebx", "esi",
-		  "xmm0", "xmm1", "xmm2", "xmm3",
-		  "xmm4", "xmm5", "xmm6", "xmm7",
-		  "memory"
-	);
-	
-}
-
-void bli_cdupl_opt_var1(
-                         dim_t     k,
-                         scomplex* b,
-                         scomplex* bd
-                       )
-{
-	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-}
-
-void bli_zdupl_opt_var1(
-                         dim_t     k,
-                         dcomplex* b,
-                         dcomplex* bd
-                       )
-{
-	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-}
-
--- a/kernels/x86/3/bli_dupl_opt_var1.h
+++ b/kernels/x86/3/bli_dupl_opt_var1.h
@@ -1,46 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2013, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-#undef  GENTPROT
-#define GENTPROT( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname)( \
-                           dim_t   n_elem, \
-                           ctype*  b, \
-                           ctype*  bd \
-                         );
-
-INSERT_GENTPROT_BASIC( dupl_opt_var1 )
-
--- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c
+++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c
@@ -39,8 +39,7 @@ void bli_sgemmtrsm_l_opt_d4x4(
                               float* restrict    alpha,
                               float* restrict    a10,
                               float* restrict    a11,
-                               float* restrict    bd01,
-                               float* restrict    bd11,
+                               float* restrict    b01,
                               float* restrict    b11,
                               float* restrict    c11, inc_t rs_c, inc_t cs_c,
                               float* restrict    a_next,
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
                               double* restrict   alpha,
                               double* restrict   a10,
                               double* restrict   a11,
-                               double* restrict   bd01,
-                               double* restrict   bd11,
+                               double* restrict   b01,
                               double* restrict   b11,
                               double* restrict   c11, inc_t rs_c, inc_t cs_c,
                               double* restrict   a_next,
@@ -73,8 +71,8 @@ void bli_dgemmtrsm_l_opt_d4x4(
 	(
 		"                                \n\t"
 		"movq          %2, %%rax         \n\t" // load address of a10.
-		"movq          %4, %%rbx         \n\t" // load address of bd01.
-		//"movq         %11, %%r9          \n\t" // load address of b_next.
+		"movq          %4, %%rbx         \n\t" // load address of b01.
+		//"movq         %10, %%r9          \n\t" // load address of b_next.
 		"                                \n\t"
 		"subq    $-8 * 16, %%rax         \n\t" // increment pointers to allow byte
 		"subq    $-8 * 16, %%rbx         \n\t" // offsets in the unrolled iterations.
@@ -83,7 +81,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
 		"movaps  -7 * 16(%%rax), %%xmm1  \n\t" // of a and b.
 		"movaps  -8 * 16(%%rbx), %%xmm2  \n\t"
 		"                                \n\t"
-		//"movq          %7, %%rcx         \n\t" // load address of c11
+		//"movq          %6, %%rcx         \n\t" // load address of c11
 		//"movq          %9, %%rdi         \n\t" // load cs_c
 		//"leaq        (,%%rdi,8), %%rdi   \n\t" // cs_c *= sizeof(double)
 		//"leaq   (%%rcx,%%rdi,2), %%rdx   \n\t" // load address of c + 2*cs_c;
@@ -320,7 +318,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		"movq      %6, %%rbx             \n\t" // load address of b11.
+		"movq      %5, %%rbx             \n\t" // load address of b11.
 		"                                \n\t"
 		"                                \n\t" // xmm8:   xmm9:   xmm10:  xmm11:
 		"                                \n\t" // ( ab01  ( ab00  ( ab03  ( ab02
@@ -354,7 +352,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
 		"                                \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
 		"                                \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
 		"                                \n\t"
-		"movq    %10, %%rax              \n\t" // load address of alpha
+		"movq    %9, %%rax               \n\t" // load address of alpha
 		"movddup (%%rax), %%xmm15        \n\t" // load alpha and duplicate
 		"                                \n\t"
 		"movaps  0 * 16(%%rbx), %%xmm8   \n\t" 
@@ -394,10 +392,10 @@ void bli_dgemmtrsm_l_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"movq     %3, %%rax                \n\t" // load address of a11
-		"movq     %7, %%rcx                \n\t" // load address of c11
+		"movq     %6, %%rcx                \n\t" // load address of c11
 		"                                  \n\t"
-		"movq     %8, %%rsi                \n\t" // load rs_c
-		"movq     %9, %%rdi                \n\t" // load cs_c
+		"movq     %7, %%rsi                \n\t" // load rs_c
+		"movq     %8, %%rdi                \n\t" // load cs_c
 		"salq     $3, %%rsi                \n\t" // rs_c *= sizeof( double )
 		"salq     $3, %%rdi                \n\t" // cs_c *= sizeof( double )
 		"                                  \n\t"
@@ -514,18 +512,17 @@ void bli_dgemmtrsm_l_opt_d4x4(

 		: // output operands (none)
 		: // input operands
-		  "m" (k_iter),
-		  "m" (k_left),
-		  "m" (a10),
-		  "m" (a11),
-		  "m" (bd01),
-		  "m" (bd11),
-		  "m" (b11),
-		  "m" (c11),
-		  "m" (rs_c),
-		  "m" (cs_c),
-		  "m" (alpha),
-		  "m" (b_next)
+		  "m" (k_iter), // 0
+		  "m" (k_left), // 1
+		  "m" (a10),    // 2
+		  "m" (a11),    // 3
+		  "m" (b01),    // 4
+		  "m" (b11),    // 5
+		  "m" (c11),    // 6
+		  "m" (rs_c),   // 7
+		  "m" (cs_c),   // 8
+		  "m" (alpha),  // 9
+		  "m" (b_next)  // 10
 		: // register clobber list
 		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
 		  "xmm0", "xmm1", "xmm2", "xmm3",
@@ -542,8 +539,7 @@ void bli_cgemmtrsm_l_opt_d4x4(
                               scomplex* restrict alpha,
                               scomplex* restrict a10,
                               scomplex* restrict a11,
-                               scomplex* restrict bd01,
-                               scomplex* restrict bd11,
+                               scomplex* restrict b01,
                               scomplex* restrict b11,
                               scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               scomplex* restrict a_next,
@@ -558,8 +554,7 @@ void bli_zgemmtrsm_l_opt_d4x4(
                               dcomplex* restrict alpha,
                               dcomplex* restrict a10,
                               dcomplex* restrict a11,
-                               dcomplex* restrict bd01,
-                               dcomplex* restrict bd11,
+                               dcomplex* restrict b01,
                               dcomplex* restrict b11,
                               dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               dcomplex* restrict a_next,
--- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.h
+++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.h
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a10, \
                           ctype* restrict a11, \
-                           ctype* restrict bd01, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b01, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c
+++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c
@@ -39,8 +39,7 @@ void bli_sgemmtrsm_u_opt_d4x4(
                               float* restrict    alpha,
                               float* restrict    a12,
                               float* restrict    a11,
-                               float* restrict    bd21,
-                               float* restrict    bd11,
+                               float* restrict    b21,
                               float* restrict    b11,
                               float* restrict    c11, inc_t rs_c, inc_t cs_c,
                               float* restrict    a_next,
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
                               double* restrict   alpha,
                               double* restrict   a12,
                               double* restrict   a11,
-                               double* restrict   bd21,
-                               double* restrict   bd11,
+                               double* restrict   b21,
                               double* restrict   b11,
                               double* restrict   c11, inc_t rs_c, inc_t cs_c,
                               double* restrict   a_next,
@@ -73,7 +71,8 @@ void bli_dgemmtrsm_u_opt_d4x4(
 	(
 		"                                \n\t"
 		"movq          %2, %%rax         \n\t" // load address of a12.
-		"movq          %4, %%rbx         \n\t" // load address of bd21.
+		"movq          %4, %%rbx         \n\t" // load address of b21.
+		//"movq         %10, %%r9          \n\t" // load address of b_next.
 		"                                \n\t"
 		"addq     $8 * 16, %%rax         \n\t" // increment pointers to allow byte
 		"addq     $8 * 16, %%rbx         \n\t" // offsets in the unrolled iterations.
@@ -302,7 +301,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"                                \n\t"
-		"movq      %6, %%rbx             \n\t" // load address of b11.
+		"movq      %5, %%rbx             \n\t" // load address of b11.
 		"                                \n\t"
 		"                                \n\t" // xmm8:   xmm9:   xmm10:  xmm11:
 		"                                \n\t" // ( ab01  ( ab00  ( ab03  ( ab02
@@ -336,7 +335,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
 		"                                \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
 		"                                \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
 		"                                \n\t"
-		"movq    %10, %%rax              \n\t" // load address of alpha
+		"movq    %9, %%rax               \n\t" // load address of alpha
 		"movddup (%%rax), %%xmm15        \n\t" // load alpha and duplicate
 		"                                \n\t"
 		"movaps  0 * 16(%%rbx), %%xmm8   \n\t"
@@ -376,10 +375,10 @@ void bli_dgemmtrsm_u_opt_d4x4(
 		"                                \n\t"
 		"                                \n\t"
 		"movq     %3, %%rax                \n\t" // load address of a11
-		"movq     %7, %%rcx                \n\t" // load address of c11
+		"movq     %6, %%rcx                \n\t" // load address of c11
 		"                                  \n\t"
-		"movq     %8, %%rsi                \n\t" // load rs_c
-		"movq     %9, %%rdi                \n\t" // load cs_c
+		"movq     %7, %%rsi                \n\t" // load rs_c
+		"movq     %8, %%rdi                \n\t" // load cs_c
 		"salq     $3, %%rsi                \n\t" // rs_c *= sizeof( double )
 		"salq     $3, %%rdi                \n\t" // cs_c *= sizeof( double )
 		"                                  \n\t"
@@ -499,17 +498,17 @@ void bli_dgemmtrsm_u_opt_d4x4(

 		: // output operands (none)
 		: // input operands
-		  "m" (k_iter),
-		  "m" (k_left),
-		  "m" (a12),
-		  "m" (a11),
-		  "m" (bd21),
-		  "m" (bd11),
-		  "m" (b11),
-		  "m" (c11),
-		  "m" (rs_c),
-		  "m" (cs_c),
-		  "m" (alpha)
+		  "m" (k_iter), // 0
+		  "m" (k_left), // 1
+		  "m" (a12),    // 2
+		  "m" (a11),    // 3
+		  "m" (b21),    // 4
+		  "m" (b11),    // 5
+		  "m" (c11),    // 6
+		  "m" (rs_c),   // 7
+		  "m" (cs_c),   // 8
+		  "m" (alpha),  // 9
+		  "m" (b_next)  // 10
 		: // register clobber list
 		  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 		  "xmm0", "xmm1", "xmm2", "xmm3",
@@ -526,8 +525,7 @@ void bli_cgemmtrsm_u_opt_d4x4(
                               scomplex* restrict alpha,
                               scomplex* restrict a12,
                               scomplex* restrict a11,
-                               scomplex* restrict bd21,
-                               scomplex* restrict bd11,
+                               scomplex* restrict b21,
                               scomplex* restrict b11,
                               scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               scomplex* restrict a_next,
@@ -542,8 +540,7 @@ void bli_zgemmtrsm_u_opt_d4x4(
                               dcomplex* restrict alpha,
                               dcomplex* restrict a12,
                               dcomplex* restrict a11,
-                               dcomplex* restrict bd21,
-                               dcomplex* restrict bd11,
+                               dcomplex* restrict b21,
                               dcomplex* restrict b11,
                               dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               dcomplex* restrict a_next,
--- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.h
+++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.h
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
                           ctype* restrict alpha, \
                           ctype* restrict a12, \
                           ctype* restrict a11, \
-                           ctype* restrict bd21, \
-                           ctype* restrict bd11, \
+                           ctype* restrict b21, \
                           ctype* restrict b11, \
                           ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
                           ctype* restrict a_next, \
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -3,7 +3,7 @@ c #rg     # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
 c #rji    # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
 0       # Test all combinations of storage schemes?
 32      # General stride spacing (for cases when testing general stride)
-sdcz  #sdcz    # Datatype(s) to test
+d  #sdcz    # Datatype(s) to test
 100     # Problem size: first to test
 300     # Problem size: maximum to test
 100     # Problem size: increment between experiments
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -64,8 +64,7 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
                                     obj_t*    alpha,
                                     obj_t*    a1x,
                                     obj_t*    a11,
-                                     obj_t*    bdx1,
-                                     obj_t*    bd11,
+                                     obj_t*    bx1,
                                     obj_t*    b11,
                                     obj_t*    c11 );

@@ -82,13 +81,10 @@ void libblis_test_gemmtrsm_ukr_check( side_t  side,
 void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
                                     obj_t* a,
                                     obj_t* b,
-                                     obj_t* bd,
                                     obj_t* a1x,
                                     obj_t* a11,
                                     obj_t* bx1,
-                                     obj_t* b11,
-                                     obj_t* bdx1,
-                                     obj_t* bd11 );
+                                     obj_t* b11 );


 void libblis_test_gemmtrsm_ukr_deps( test_params_t* params, test_op_t* op )
@@ -166,10 +162,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,

 	obj_t        kappa;
 	obj_t        alpha;
-	obj_t        a_big, a, b, bd;
+	obj_t        a_big, a, b;
 	obj_t        b11, c11;
 	obj_t        ap, bp;
-	obj_t        a1xp, a11p, bdx1, bd11, bx1p, b11p;
+	obj_t        a1xp, a11p, bx1p, b11p;
 	obj_t        c11_save;


@@ -201,8 +197,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
 	                          sc_str[0], m,   n,   &c11 );
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m,   n,   &c11_save );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
-	                          sc_b,      k+m, 4*n, &bd );

 	// Set alpha.
 	if ( bli_obj_is_real( b ) )
@@ -264,8 +258,8 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,


 	// Create subpartitions from the a and b panels.
-	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &bd,
-	                                &a1xp, &a11p, &bx1p, &b11p, &bdx1, &bd11 );
+	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
+	                                &a1xp, &a11p, &bx1p, &b11p );


 	// Repeat the experiment n_repeats times and record results. 
@@ -279,7 +273,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
 		time = bli_clock();

 		libblis_test_gemmtrsm_ukr_impl( impl, side, &alpha,
-		                                &a1xp, &a11p, &bdx1, &bd11, &b11p, &c11 );
+		                                &a1xp, &a11p, &bx1p, &b11p, &c11 );

 		time_min = bli_clock_min_diff( time_min, time );
 	}
@@ -304,7 +298,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
 	bli_obj_free( &b );
 	bli_obj_free( &c11 );
 	bli_obj_free( &c11_save );
-	bli_obj_free( &bd );
 }


@@ -314,15 +307,14 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
                                     obj_t*    alpha,
                                     obj_t*    a1x,
                                     obj_t*    a11,
-                                     obj_t*    bdx1,
-                                     obj_t*    bd11,
+                                     obj_t*    bx1,
                                     obj_t*    b11,
                                     obj_t*    c11 )
 {
 	switch ( impl )
 	{
 		case BLIS_TEST_SEQ_UKERNEL:
-		bli_gemmtrsm_ukr( alpha, a1x, a11, bdx1, bd11, b11, c11 );
+		bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
 		break;

 		default:
@@ -431,20 +423,16 @@ void libblis_test_gemmtrsm_ukr_check( side_t  side,
 void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
                                     obj_t* a,
                                     obj_t* b,
-                                     obj_t* bd,
                                     obj_t* a1x,
                                     obj_t* a11,
                                     obj_t* bx1,
-                                     obj_t* b11,
-                                     obj_t* bdx1,
-                                     obj_t* bd11 )
+                                     obj_t* b11 )
 {
 	dim_t mr = bli_obj_length( *a );
 	dim_t nr = bli_obj_width( *b );

 	dim_t off_a1x, off_a11;
 	dim_t off_bx1, off_b11;
-	dim_t off_bdx1, off_bd11;

 	if ( bli_obj_is_lower( *a ) )
 	{
@@ -452,8 +440,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
 		off_a11 = k;
 		off_bx1 = 0;
 		off_b11 = k;
-		off_bdx1 = 0;
-		off_bd11 = k;
 	}
 	else
 	{
@@ -461,8 +447,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
 		off_a11 = 0;
 		off_bx1 = mr;
 		off_b11 = 0;
-		off_bdx1 = mr;
-		off_bd11 = 0;
 	}

 	bli_obj_init_subpart_from( *a, *a1x );
@@ -488,28 +472,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t  k,
 	// Set the diagonal offset of a11 to 0 (which overwrites the diagonal
 	// offset value it inherited from a).
 	bli_obj_set_diag_offset( 0, *a11 );
-
-	// If duplication is disabled, alias bdxx objects to bxx.
-	if ( TRUE )
-	{
-		bli_obj_alias_to( *bx1, *bdx1 );
-		bli_obj_alias_to( *b11, *bd11 );
-	}
-	else // if duplication is enabled
-	{
-		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-
-		bli_obj_init_subpart_from( *b, *bdx1 );
-		bli_obj_set_dims( k, nr, *bdx1 );
-		bli_obj_inc_offs( off_bdx1, 0, *bdx1 );
-
-		bli_obj_init_subpart_from( *b, *bd11 );
-		bli_obj_set_dims( mr, nr, *bd11 );
-		bli_obj_inc_offs( off_bd11, 0, *bd11 );
-
-		// Now update the buffer fields of bdx1, bd11, and then call
-		// bli_dupl().
-	}
 }


@@ -527,8 +489,7 @@ typedef void (*FUNCPTR_T)(
                           void*   alpha,
                           void*   a1x,
                           void*   a11,
-                           void*   bdx1,
-                           void*   bd11,
+                           void*   bx1,
                           void*   b11,
                           void*   c11, inc_t rs_c, inc_t cs_c,
                           void*   a_next,
@@ -542,8 +503,7 @@ static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
 void bli_gemmtrsm_ukr( obj_t*  alpha,
                       obj_t*  a1x,
                       obj_t*  a11,
-                       obj_t*  bdx1,
-                       obj_t*  bd11,
+                       obj_t*  bx1,
                       obj_t*  b11,
                       obj_t*  c11 )
 {
@@ -555,9 +515,7 @@ void bli_gemmtrsm_ukr( obj_t*  alpha,

    void*     buf_a11   = bli_obj_buffer_at_off( *a11 );

-    void*     buf_bdx1  = bli_obj_buffer_at_off( *bdx1 );
-
-    void*     buf_bd11  = bli_obj_buffer_at_off( *bd11 );
+    void*     buf_bx1  = bli_obj_buffer_at_off( *bx1 );

    void*     buf_b11   = bli_obj_buffer_at_off( *b11 );

@@ -579,12 +537,11 @@ void bli_gemmtrsm_ukr( obj_t*  alpha,
 	   buf_alpha,
 	   buf_a1x,
 	   buf_a11,
-	   buf_bdx1,
-	   buf_bd11,
+	   buf_bx1,
       buf_b11,
       buf_c11, rs_c, cs_c,
 	   buf_a1x,
-	   buf_bdx1 );
+	   buf_bx1 );
 }


@@ -596,8 +553,7 @@ void PASTEMAC(ch,varname)( \
                           void*   alpha, \
                           void*   a1x, \
                           void*   a11, \
-                           void*   bdx1, \
-                           void*   bd11, \
+                           void*   bx1, \
                           void*   b11, \
                           void*   c11, inc_t rs_c, inc_t cs_c, \
                           void*   a_next, \
@@ -608,8 +564,7 @@ void PASTEMAC(ch,varname)( \
                          alpha, \
                          a1x, \
                          a11, \
-                          bdx1, \
-                          bd11, \
+                          bx1, \
                          b11, \
                          c11, rs_c, cs_c, \
 	                      a_next, \
--- a/testsuite/src/test_gemmtrsm_ukr.h
+++ b/testsuite/src/test_gemmtrsm_ukr.h
@@ -40,8 +40,7 @@ void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
 void bli_gemmtrsm_ukr( obj_t*  alpha,
                       obj_t*  a1x,
                       obj_t*  a11,
-                       obj_t*  bdx1,
-                       obj_t*  bd11,
+                       obj_t*  bx1,
                       obj_t*  b11,
                       obj_t*  c11 );

@@ -53,8 +52,7 @@ void PASTEMAC(ch,varname)( \
                           void*   alpha, \
                           void*   a1x, \
                           void*   a11, \
-                           void*   bdx1, \
-                           void*   bd11, \
+                           void*   bx1, \
                           void*   b11, \
                           void*   c11, inc_t rs_c, inc_t cs_c, \
                           void*   a_next, \
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -653,18 +653,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
 	                        BLIS_DEFAULT_NI_Z );
 */
 	libblis_test_fprintf_c( os, "\n" );
-	libblis_test_fprintf_c( os, "level-3 packing duplication  s     d     c     z \n" );
-	libblis_test_fprintf_c( os, "  dupl. factors for B    %5u %5u %5u %5u\n",
-	                        BLIS_DEFAULT_NUM_DUPL_S,
-	                        BLIS_DEFAULT_NUM_DUPL_D,
-	                        BLIS_DEFAULT_NUM_DUPL_C,
-	                        BLIS_DEFAULT_NUM_DUPL_Z );
-	libblis_test_fprintf_c( os, "  elements per register  %5u %5u %5u %5u\n",
-	                        BLIS_NUM_ELEM_PER_REG_S,
-	                        BLIS_NUM_ELEM_PER_REG_D,
-	                        BLIS_NUM_ELEM_PER_REG_C,
-	                        BLIS_NUM_ELEM_PER_REG_Z );
-	libblis_test_fprintf_c( os, "\n" );
 	libblis_test_fprintf_c( os, "level-2 cache blocksizes     s     d     c     z \n" );
 	libblis_test_fprintf_c( os, "  m dimension            %5u %5u %5u %5u\n",
 	                        BLIS_DEFAULT_L2_MC_S,
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -63,7 +63,6 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
                                 side_t    side,
                                 obj_t*    a,
                                 obj_t*    b,
-                                 obj_t*    bd,
                                 obj_t*    c );

 void libblis_test_trsm_ukr_check( side_t  side,
@@ -148,7 +147,7 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
 	uplo_t       uploa;

 	obj_t        kappa;
-	obj_t        a, b, bd, c;
+	obj_t        a, b, c;
 	obj_t        ap, bp;
 	obj_t        c_save;

@@ -177,8 +176,6 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
 	                          sc_str[0], m, n, &c );
 	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
 	                          sc_str[0], m, n, &c_save );
-	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
-	                          sc_b,      m, 4*n, &bd );

 	// Set the structure, uplo, and diagonal offset properties of A.
 	bli_obj_set_struc( BLIS_TRIANGULAR, a );
@@ -229,14 +226,11 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
 		// Re-pack the contents of b to bp.
 		bli_packm_blk_var2( &BLIS_ONE, &b, &bp );

-		// Re-duplicate the contents of bp to bd.
-		bli_dupl( &bp, &bd );
-
 		bli_copym( &c_save, &c );

 		time = bli_clock();

-		libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &bd, &c );
+		libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &c );

 		time_min = bli_clock_min_diff( time_min, time );
 	}
@@ -268,13 +262,12 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
                                 side_t    side,
                                 obj_t*    a,
                                 obj_t*    b,
-                                 obj_t*    bd,
                                 obj_t*    c )
 {
 	switch ( impl )
 	{
 		case BLIS_TEST_SEQ_UKERNEL:
-		bli_trsm_ukr( a, b, bd, c );
+		bli_trsm_ukr( a, b, c );
 		break;

 		default:
@@ -386,7 +379,6 @@ void libblis_test_trsm_ukr_check( side_t  side,
 typedef void (*FUNCPTR_T)(
                           void*   a,
                           void*   b,
-                           void*   bd,
                           void*   c, inc_t rs_c, inc_t cs_c
                         );

@@ -396,7 +388,6 @@ static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);

 void bli_trsm_ukr( obj_t*  a,
                   obj_t*  b,
-                   obj_t*  bd,
                   obj_t*  c )
 {
    num_t     dt        = bli_obj_datatype( *c );
@@ -405,8 +396,6 @@ void bli_trsm_ukr( obj_t*  a,

    void*     buf_b     = bli_obj_buffer_at_off( *b );

-    void*     buf_bd    = bli_obj_buffer_at_off( *bd );
-
    void*     buf_c     = bli_obj_buffer_at_off( *c );
    inc_t     rs_c      = bli_obj_row_stride( *c );
    inc_t     cs_c      = bli_obj_col_stride( *c );
@@ -421,7 +410,6 @@ void bli_trsm_ukr( obj_t*  a,
    // Invoke the function.
    f( buf_a,
       buf_b,
-       buf_bd,
       buf_c, rs_c, cs_c );
 }

@@ -432,13 +420,11 @@ void bli_trsm_ukr( obj_t*  a,
 void PASTEMAC(ch,varname)( \
                           void*   a, \
                           void*   b, \
-                           void*   bd, \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         ) \
 { \
    PASTEMAC(ch,ukrname)( a, \
                          b, \
-                          bd, \
                          c, rs_c, cs_c ); \
 }

--- a/testsuite/src/test_trsm_ukr.h
+++ b/testsuite/src/test_trsm_ukr.h
@@ -39,7 +39,6 @@ void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
 //
 void bli_trsm_ukr( obj_t*  a,
                   obj_t*  b,
-                   obj_t*  bd,
                   obj_t*  c );

 #undef  GENTPROT
@@ -48,7 +47,6 @@ void bli_trsm_ukr( obj_t*  a,
 void PASTEMAC(ch,varname)( \
                           void*   a, \
                           void*   b, \
-                           void*   bd, \
                           void*   c, inc_t rs_c, inc_t cs_c \
                         );

--- a/windows/build/bli_kernel.h
+++ b/windows/build/bli_kernel.h
@@ -146,34 +146,6 @@
 #define BLIS_EXTEND_KR_C               0
 #define BLIS_EXTEND_KR_Z               0

-// -- Number of elements per vector register --
-
-// NOTE: These constants are typically only used to determine the amount
-// of duplication needed when configuring level-3 macro-kernels that
-// copy and duplicate elements of B to a temporary duplication buffer
-// (so that element-wise vector multiplication and addition instructions
-// can be used).
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-// -- Default switch for duplication of B --
-
-// NOTE: Setting these values to 1 disables duplication. Any value
-// d > 1 results in a d-1 duplicates created within special macro-kernel
-// buffer of dimension k x NR*d.
-
-//#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-//#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-//#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-//#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-#define BLIS_DEFAULT_NUM_DUPL_S        1
-#define BLIS_DEFAULT_NUM_DUPL_D        1
-#define BLIS_DEFAULT_NUM_DUPL_C        1
-#define BLIS_DEFAULT_NUM_DUPL_Z        1
-
 // -- Default incremental packing blocksizes (n dimension) --

 // NOTE: These incremental packing blocksizes (for the n dimension) are only
@@ -259,10 +231,6 @@

 // -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------

-// -- dupl --
-
-#define DUPL_KERNEL          dupl_unb_var1
-
 // -- gemm --

 #define GEMM_UKERNEL         gemm_ref_mxn