diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h
index c82392b60..d1c4ef828 100644
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -52,8 +52,8 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM	128
-#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK	96
-#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK	128
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT	96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT	128
 
 //This macro will enable  BLIS DGEMM to choose block sizes for a  single instance mode
 #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 	0
diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h
index a0f5b574d..d7adddf3c 100644
--- a/config/zen2/bli_family_zen2.h
+++ b/config/zen2/bli_family_zen2.h
@@ -51,8 +51,8 @@
 
 #define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
 #define BLIS_SMALL_MATRIX_A_THRES_TRSM	128
-#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK	96
-#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK	128
+#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT	96
+#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT	128
 
 #define BLIS_ENABLE_SMALL_MATRIX_ROME
 #define BLIS_SMALL_MATRIX_THRES_ROME       400
diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h
index 94e37fc17..da9348844 100644
--- a/frame/3/bli_l3.h
+++ b/frame/3/bli_l3.h
@@ -84,11 +84,7 @@
 // Operation-specific headers.
 #include "bli_gemm.h"
 #include "bli_hemm.h"
-#include "bli_herk.h"
-#include "bli_her2k.h"
 #include "bli_symm.h"
-#include "bli_syrk.h"
-#include "bli_syr2k.h"
 #include "bli_trmm.h"
 #include "bli_trmm3.h"
 #include "bli_trsm.h"
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c
index 58b658d1d..1986b3b0f 100644
--- a/frame/3/bli_l3_blocksize.c
+++ b/frame/3/bli_l3_blocksize.c
@@ -51,8 +51,8 @@ dim_t bli_l3_determine_kc
 
 	if      ( family == BLIS_GEMM )
 		return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
-	else if ( family == BLIS_HERK )
-		return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx );
+	else if ( family == BLIS_GEMMT )
+		return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx );
 	else if ( family == BLIS_TRMM )
 		return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx );
 	else if ( family == BLIS_TRSM )
@@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \
 }
 
 GENFRONT( gemm_determine_kc, gemm )
-GENFRONT( herk_determine_kc, herk )
+GENFRONT( gemmt_determine_kc, gemmt )
 GENFRONT( trmm_determine_kc, trmm )
 GENFRONT( trsm_determine_kc, trsm )
 
@@ -201,7 +201,7 @@ dim_t PASTEMAC0(opname) \
 	b_alg = bli_blksz_get_def( dt, bsize ); \
 	b_max = bli_blksz_get_max( dt, bsize ); \
 \
-	/* Notice that for herk, we do not need to perform any special handling
+	/* Notice that for gemmt, we do not need to perform any special handling
 	   for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \
 \
 	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
@@ -211,8 +211,8 @@ dim_t PASTEMAC0(opname) \
 	return b_use; \
 }
 
-GENFRONT( herk_determine_kc_f, f )
-GENFRONT( herk_determine_kc_b, b )
+GENFRONT( gemmt_determine_kc_f, f )
+GENFRONT( gemmt_determine_kc_b, b )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h
index c3301ee13..3ea3c5aa0 100644
--- a/frame/3/bli_l3_blocksize.h
+++ b/frame/3/bli_l3_blocksize.h
@@ -60,7 +60,7 @@ dim_t PASTEMAC0(opname) \
       );
 
 GENPROT( gemm_determine_kc )
-GENPROT( herk_determine_kc )
+GENPROT( gemmt_determine_kc )
 GENPROT( trmm_determine_kc )
 GENPROT( trsm_determine_kc )
 
@@ -81,8 +81,8 @@ dim_t PASTEMAC0(opname) \
 GENPROT( gemm_determine_kc_f )
 GENPROT( gemm_determine_kc_b )
 
-GENPROT( herk_determine_kc_f )
-GENPROT( herk_determine_kc_b )
+GENPROT( gemmt_determine_kc_f )
+GENPROT( gemmt_determine_kc_b )
 
 GENPROT( trmm_determine_kc_f )
 GENPROT( trmm_determine_kc_b )
diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c
index 413f6a58d..50da4627c 100644
--- a/frame/3/bli_l3_check.c
+++ b/frame/3/bli_l3_check.c
@@ -597,10 +597,5 @@ void bli_l3_basic_check
 
 	e_val = bli_check_object_buffer( c );
 	bli_check_error_code( e_val );
-
-	// Check for sufficiently sized stack buffers
-
-	e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx );
-	bli_check_error_code( e_val );
 }
 
diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c
index f6bfbedbb..3cdecfbc2 100644
--- a/frame/3/bli_l3_cntl.c
+++ b/frame/3/bli_l3_cntl.c
@@ -54,7 +54,7 @@ void bli_l3_cntl_create_if
 	if ( cntl_orig == NULL )
 	{
 		if ( family == BLIS_GEMM ||
-		     family == BLIS_HERK ||
+		     family == BLIS_GEMMT ||
 		     family == BLIS_TRMM )
 		{
 			*cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b );
@@ -97,7 +97,7 @@ void bli_l3_cntl_free
 	opid_t family = bli_cntl_family( cntl_use );
 
 	if ( family == BLIS_GEMM ||
-	     family == BLIS_HERK ||
+	     family == BLIS_GEMMT ||
 	     family == BLIS_TRMM )
 	{
 		bli_gemm_cntl_free( rntm, cntl_use, thread );
diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c
index 7baf2d6ef..0d0a71921 100644
--- a/frame/3/bli_l3_direct.c
+++ b/frame/3/bli_l3_direct.c
@@ -46,7 +46,7 @@ dir_t bli_l3_direct
 	opid_t family = bli_cntl_family( cntl );
 
 	if      ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c );
-	else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c );
+	else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c );
 	else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c );
 	else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c );
 
@@ -68,14 +68,14 @@ dir_t bli_gemm_direct
 	return BLIS_FWD;
 }
 
-dir_t bli_herk_direct
+dir_t bli_gemmt_direct
      (
        obj_t* a,
        obj_t* b,
        obj_t* c
      )
 {
-	// For herk, movement may be forwards (or backwards).
+	// For gemmt, movement may be forwards (or backwards); forwards is what is returned here.
 
 	return BLIS_FWD;
 }
diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h
index 7383c4a9f..39798407a 100644
--- a/frame/3/bli_l3_direct.h
+++ b/frame/3/bli_l3_direct.h
@@ -53,7 +53,7 @@ dir_t PASTEMAC0(opname) \
       );
 
 GENPROT( gemm_direct )
-GENPROT( herk_direct )
+GENPROT( gemmt_direct )
 GENPROT( trmm_direct )
 GENPROT( trsm_direct )
 
diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c
index 7c30f61af..fbf73be60 100644
--- a/frame/3/bli_l3_ind.c
+++ b/frame/3/bli_l3_ind.c
@@ -55,7 +55,8 @@ static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
 static BLIS_THREAD_LOCAL
 bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
 {
-        /*   gemm  gemmt  hemm  herk  her2k  symm  syrk  syr2k  trmm3  trmm  trsm  */
+        /*   gemm           gemmt          hemm           herk           her2k          symm
+             syrk           syr2k          trmm3          trmm           trsm  */
         /*    c     z    */
 /* 1m   */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
              {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}  },
@@ -80,11 +81,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \
 GENFUNC( gemm, BLIS_GEMM )
 GENFUNC( gemmt, BLIS_GEMMT )
 GENFUNC( hemm, BLIS_HEMM )
-GENFUNC( herk, BLIS_HERK )
-GENFUNC( her2k, BLIS_HER2K )
 GENFUNC( symm, BLIS_SYMM )
-GENFUNC( syrk, BLIS_SYRK )
-GENFUNC( syr2k, BLIS_SYR2K )
 GENFUNC( trmm3, BLIS_TRMM3 )
 GENFUNC( trmm, BLIS_TRMM )
 GENFUNC( trsm, BLIS_TRSM )
diff --git a/frame/3/bli_l3_ind.h b/frame/3/bli_l3_ind.h
index f80757eb0..a14ad783c 100644
--- a/frame/3/bli_l3_ind.h
+++ b/frame/3/bli_l3_ind.h
@@ -47,11 +47,7 @@ ind_t   PASTEMAC(opname,ind_find_avail)( num_t dt );
 GENPROT( gemm )
 GENPROT( gemmt )
 GENPROT( hemm )
-GENPROT( herk )
-GENPROT( her2k )
 GENPROT( symm )
-GENPROT( syrk )
-GENPROT( syr2k )
 GENPROT( trmm3 )
 GENPROT( trmm )
 GENPROT( trsm )
diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c
index f6cfd6640..cd0df7017 100644
--- a/frame/3/bli_l3_oapi_ex.c
+++ b/frame/3/bli_l3_oapi_ex.c
@@ -38,301 +38,508 @@
 // Define object-based interfaces (expert).
 //
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* If the rntm is non-NULL, it may indicate that we should forgo sup
-	   handling altogether. */ \
-	bool enable_sup = TRUE; \
-	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
-\
-	if ( enable_sup ) \
-	{ \
-		/* Execute the small/unpacked oapi handler. If it finds that the problem
-		   does not fall within the thresholds that define "small", or for some
-		   other reason decides not to use the small/unpacked implementation,
-		   the function returns with BLIS_FAILURE, which causes execution to
-		   proceed towards the conventional implementation. */ \
-		err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
-		if ( result == BLIS_SUCCESS ) \
-		{ \
-			return; \
-		} \
-	} \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If each matrix operand has a complex storage datatype, try to get an
-	   induced method (if one is available and enabled). NOTE: Allowing
-	   precisions to vary while using 1m, which is what we do here, is unique
-	   to gemm; other level-3 operations use 1m only if all storage datatypes
-	   are equal (and they ignore the computation precision). */ \
-	if ( bli_obj_is_complex( c ) && \
-	     bli_obj_is_complex( a ) && \
-	     bli_obj_is_complex( b ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
-}
-
 // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be
 // defined in the sandbox environment.
 #ifndef BLIS_ENABLE_SANDBOX
-GENFRONT( gemm )
+
+void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) // Expert object API for gemm: try the sup handler first, then fall back to the conventional front-end.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; global runtime settings are then used.
+     )
+{
+	bli_init_once();
+
+	// If the rntm is non-NULL, it may indicate that we should forgo sup
+	// handling altogether.
+	bool enable_sup = TRUE;
+	if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm );
+
+	if ( enable_sup )
+	{
+		// Execute the small/unpacked oapi handler. If it finds that the problem
+		// does not fall within the thresholds that define "small", or for some
+		// other reason decides not to use the small/unpacked implementation,
+		// the function returns with BLIS_FAILURE, which causes execution to
+		// proceed towards the conventional implementation.
+		err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ); // cntx/rntm are still exactly what the caller passed (possibly NULL).
+		if ( result == BLIS_SUCCESS )
+		{
+			return;
+		}
+	}
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c );
+	ind_t im = BLIS_NAT;
+
+	// If each matrix operand has a complex storage datatype, try to get an
+	// induced method (if one is available and enabled). NOTE: Allowing
+	// precisions to vary while using 1m, which is what we do here, is unique
+	// to gemm; other level-3 operations use 1m only if all storage datatypes
+	// are equal (and they ignore the computation precision).
+	if ( bli_obj_is_complex( c ) &&
+	     bli_obj_is_complex( a ) &&
+	     bli_obj_is_complex( b ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_gemmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemm_check( alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
+}
+
 #endif
 
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_dt( b ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
+void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) // Expert object API for gemmt: resolve rntm/cntx, check operands, call the front-end.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c ); // c supplies the datatype used for the induced-method and context queries.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_gemmtind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_gemmt_check( alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
 }
 
-GENFRONT( gemmt )
-GENFRONT( her2k )
-GENFRONT( syr2k )
 
+void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) // Expert object API for her2k, implemented as two gemmt calls followed by zeroing the diagonal's imaginary parts.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // Forwarded unchanged to the check and to both gemmt calls.
+       rntm_t* rntm  // Forwarded unchanged to both gemmt calls.
+     )
+{
+	bli_init_once();
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_dt( b ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( side, alpha, a, b, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( side, alpha, a, b, beta, c, cntx, rntm, NULL ); \
+	obj_t ah;     // Alias of a with trans and conj toggled (the A' operand).
+	obj_t bh;     // Alias of b with trans and conj toggled (the B' operand).
+	obj_t alphah; // Alias of alpha with conj toggled (the alpha' scalar).
+
+	// Check parameters. cntx is passed exactly as received from the caller.
+	if ( bli_error_checking_is_enabled() )
+		bli_her2k_check( alpha, a, b, beta, c, cntx );
+
+	bli_obj_alias_to( alpha, &alphah );
+	bli_obj_toggle_conj( &alphah );
+
+	bli_obj_alias_to( a, &ah );
+	bli_obj_toggle_trans( &ah );
+	bli_obj_toggle_conj( &ah );
+
+	bli_obj_alias_to( b, &bh );
+	bli_obj_toggle_trans( &bh );
+	bli_obj_toggle_conj( &bh );
+
+	// Invoke gemmt twice, using beta only the first time (the second call passes BLIS_ONE so it accumulates into c).
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)(   alpha, a, &bh,      beta, c, cntx, rntm );
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm );
+
+	// The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for
+	// the diagonal elements. Mathematically, the imaginary components of
+	// diagonal elements of a Hermitian rank-2k product should always be
+	// zero. However, in practice, they sometimes accumulate meaningless
+	// non-zero values. To prevent this, we explicitly set those values
+	// to zero before returning.
+	bli_setid( &BLIS_ZERO, c );
 }
 
-GENFRONT( hemm )
-GENFRONT( symm )
-GENFRONT( trmm3 )
 
+void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) // Expert object API for syr2k, implemented as two gemmt calls on a, b and their transposed aliases.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // Forwarded unchanged to the check and to both gemmt calls.
+       rntm_t* rntm  // Forwarded unchanged to both gemmt calls.
+     )
+{
+	bli_init_once();
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  beta, \
-       obj_t*  c, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( c ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
-	     bli_obj_is_complex( c ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \
+	obj_t at; // Alias of a with trans toggled.
+	obj_t bt; // Alias of b with trans toggled.
+
+	// Check parameters. cntx is passed exactly as received from the caller.
+	if ( bli_error_checking_is_enabled() )
+		bli_syr2k_check( alpha, a, b, beta, c, cntx );
+
+	bli_obj_alias_to( b, &bt );
+	bli_obj_toggle_trans( &bt );
+
+	bli_obj_alias_to( a, &at );
+	bli_obj_toggle_trans( &at );
+
+	// Invoke gemmt twice, using beta only the first time (the second call passes BLIS_ONE so it accumulates into c).
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt,      beta, c, cntx, rntm );
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm );
 }
 
-GENFRONT( herk )
-GENFRONT( syrk )
 
+void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) // Expert object API for hemm: resolve rntm/cntx, check operands, call the front-end.
+     (
+       side_t  side, // Forwarded to the check and front-end routines.
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
 
-#undef  GENFRONT
-#define GENFRONT( opname ) \
-\
-void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
-     ( \
-       side_t  side, \
-       obj_t*  alpha, \
-       obj_t*  a, \
-       obj_t*  b, \
-       cntx_t* cntx, \
-       rntm_t* rntm  \
-     ) \
-{ \
-	bli_init_once(); \
-\
-	/* Initialize a local runtime with global settings if necessary. Note
-	   that in the case that a runtime is passed in, we make a local copy. */ \
-	rntm_t rntm_l; \
-	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
-	else                { rntm_l = *rntm;                       rntm = &rntm_l; } \
-\
-	/* Default to using native execution. */ \
-	num_t dt = bli_obj_dt( b ); \
-	ind_t im = BLIS_NAT; \
-\
-	/* If all matrix operands are complex and of the same storage datatype, try
-	   to get an induced method (if one is available and enabled). */ \
-	if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \
-	     bli_obj_is_complex( b ) ) \
-	{ \
-		/* Find the highest priority induced method that is both enabled and
-		   available for the current operation. (If an induced method is
-		   available but not enabled, or simply unavailable, BLIS_NAT will
-		   be returned here.) */ \
-		im = PASTEMAC(opname,ind_find_avail)( dt ); \
-	} \
-\
-	/* If necessary, obtain a valid context from the gks using the induced
-	   method id determined above. */ \
-	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \
-\
-	/* Check the operands. */ \
-	if ( bli_error_checking_is_enabled() ) \
-		PASTEMAC(opname,_check)( side, alpha, a, b, cntx ); \
-\
-	/* Invoke the operation's front-end and request the default control tree. */ \
-	PASTEMAC(opname,_front)( side, alpha, a, b, cntx, rntm, NULL ); \
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c ); // c supplies the datatype used for the induced-method and context queries.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_hemmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_hemm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
 }
 
-GENFRONT( trmm )
-GENFRONT( trsm )
 
+void PASTEMAC(symm,BLIS_OAPI_EX_SUF) // Expert object API for symm: resolve rntm/cntx, check operands, call the front-end.
+     (
+       side_t  side, // Forwarded to the check and front-end routines.
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c ); // c supplies the datatype used for the induced-method and context queries.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_symmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_symm_check( side, alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
+}
+
+
+void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) // Expert object API for trmm3: resolve rntm/cntx, check operands, call the front-end.
+     (
+       side_t  side, // Forwarded to the check and front-end routines.
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( c ); // c supplies the datatype used for the induced-method and context queries.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( c ) &&
+	     bli_obj_dt( b ) == bli_obj_dt( c ) &&
+	     bli_obj_is_complex( c ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_trmm3ind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
+}
+
+
+void PASTEMAC(herk,BLIS_OAPI_EX_SUF) // Expert object API for herk, implemented as one gemmt call followed by zeroing the diagonal's imaginary parts.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // Forwarded unchanged to the check and to gemmt.
+       rntm_t* rntm  // Forwarded unchanged to gemmt.
+     )
+{
+	bli_init_once();
+
+	obj_t ah; // Alias of a with trans and conj toggled (the A' operand).
+
+	// Check parameters. cntx is passed exactly as received from the caller.
+	if ( bli_error_checking_is_enabled() )
+		bli_herk_check( alpha, a, beta, c, cntx );
+
+	bli_obj_alias_to( a, &ah );
+	bli_obj_toggle_trans( &ah );
+	bli_obj_toggle_conj( &ah );
+
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm ); // a and its toggled alias serve as the two gemmt operands.
+
+	// The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the
+	// diagonal elements. Mathematically, the imaginary components of
+	// diagonal elements of a Hermitian rank-k product should always be
+	// zero. However, in practice, they sometimes accumulate meaningless
+	// non-zero values. To prevent this, we explicitly set those values
+	// to zero before returning.
+	bli_setid( &BLIS_ZERO, c );
+}
+
+
+void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) // Expert object API for syrk, implemented as one gemmt call on a and its transposed alias.
+     (
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  beta,
+       obj_t*  c,
+       cntx_t* cntx, // Forwarded unchanged to the check and to gemmt.
+       rntm_t* rntm  // Forwarded unchanged to gemmt.
+     )
+{
+	bli_init_once();
+
+	obj_t at; // Alias of a with trans toggled.
+
+	// Check parameters. cntx is passed exactly as received from the caller.
+	if ( bli_error_checking_is_enabled() )
+		bli_syrk_check( alpha, a, beta, c, cntx );
+
+	bli_obj_alias_to( a, &at );
+	bli_obj_toggle_trans( &at );
+
+	PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm ); // a and its toggled alias serve as the two gemmt operands.
+}
+
+
+void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) // Expert object API for trmm: resolve rntm/cntx, check operands, call the front-end.
+     (
+       side_t  side, // Forwarded to the check and front-end routines.
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( b ); // b supplies the datatype here; this signature has no c operand.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
+	     bli_obj_is_complex( b ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_trmmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trmm_check( side, alpha, a, b, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
+}
+
+
+void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) // Expert object API for trsm: resolve rntm/cntx, check operands, call the front-end.
+     (
+       side_t  side, // Forwarded to the check and front-end routines.
+       obj_t*  alpha,
+       obj_t*  a,
+       obj_t*  b,
+       cntx_t* cntx, // May be NULL; a context is then queried from the gks below.
+       rntm_t* rntm  // May be NULL; either way a local copy is what gets used.
+     )
+{
+	bli_init_once();
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_l;
+	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
+	else                { rntm_l = *rntm;                       rntm = &rntm_l; }
+
+	// Default to using native execution.
+	num_t dt = bli_obj_dt( b ); // b supplies the datatype here; this signature has no c operand.
+	ind_t im = BLIS_NAT;
+
+	// If all matrix operands are complex and of the same storage datatype, try
+	// to get an induced method (if one is available and enabled).
+	if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
+	     bli_obj_is_complex( b ) )
+	{
+		// Find the highest priority induced method that is both enabled and
+		// available for the current operation. (If an induced method is
+		// available but not enabled, or simply unavailable, BLIS_NAT will
+		// be returned here.)
+		im = bli_trsmind_find_avail( dt );
+	}
+
+	// If necessary, obtain a valid context from the gks using the induced
+	// method id determined above.
+	if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt );
+
+	// Check the operands.
+	if ( bli_error_checking_is_enabled() )
+		bli_trsm_check( side, alpha, a, b, cntx );
+
+	// Invoke the operation's front-end and request the default control tree.
+	bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); // rntm now points at the local copy rntm_l.
+}
diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c
index fa008fd15..6ca8244cb 100644
--- a/frame/3/bli_l3_prune.c
+++ b/frame/3/bli_l3_prune.c
@@ -47,7 +47,7 @@ void bli_l3_prune_unref_mparts_m
 	opid_t family = bli_cntl_family( cntl );
 
 	if      ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
-	else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c );
+	else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c );
 	else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c );
 	else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c );
 }
@@ -68,7 +68,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
 	opid_t family = bli_cntl_family( cntl ); \
 \
 	if      ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
-	else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \
+	else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \
 	else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \
 	else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \
 }
@@ -152,7 +152,7 @@ void PASTEMAC(opname,_prune_unref_mparts_k) \
 	   for the k dimension. */ \
 }
 
-GENFRONT( herk )
+GENFRONT( gemmt )
 
 // -----------------------------------------------------------------------------
 
diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h
index 340ecd4db..ad8f07dc4 100644
--- a/frame/3/bli_l3_prune.h
+++ b/frame/3/bli_l3_prune.h
@@ -64,9 +64,9 @@ GENPROT( gemm, m )
 GENPROT( gemm, n )
 GENPROT( gemm, k )
 
-GENPROT( herk, m )
-GENPROT( herk, n )
-GENPROT( herk, k )
+GENPROT( gemmt, m )
+GENPROT( gemmt, n )
+GENPROT( gemmt, k )
 
 GENPROT( trmm, m )
 GENPROT( trmm, n )
diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index 4726e1042..37a3909fd 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -44,12 +44,12 @@
 #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
-// herk
+// gemmt
 
-// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to
+// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to
 // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
-#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
-#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
+#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
+#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
 
 // trmm
 
diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c
index 94f0af409..7883dfd6d 100644
--- a/frame/3/gemm/bli_gemm_blk_var3.c
+++ b/frame/3/gemm/bli_gemm_blk_var3.c
@@ -93,7 +93,7 @@ void bli_gemm_blk_var3
 		// can simply overwrite the internal beta scalar with BLIS_ONE once
 		// it has been used in the first iteration. However...
 
-		// Unlike variant 3 of gemm and herk, which reset the internal scalar
+		// Unlike variant 3 of gemm and gemmt, which reset the internal scalar
 		// on C at the end of the first iteration so that subsequent iterations
 		// do not erroneously apply beta more than once, it is important that
 		// this behavior not be applied to trmm. That is because the order of
diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c
index d7cd0a92c..27678e0bf 100644
--- a/frame/3/gemm/bli_gemm_cntl.c
+++ b/frame/3/gemm/bli_gemm_cntl.c
@@ -63,7 +63,7 @@ cntl_t* bli_gemmbp_cntl_create
 	// Use the function pointers to the macrokernels that use slab
 	// assignment of micropanels to threads in the jr and ir loops.
 	if      ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
-	else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
+	else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
 	else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
 	else /* should never execute */ macro_kernel_fp = NULL;
 
@@ -167,8 +167,8 @@ cntl_t* bli_gemmpb_cntl_create
 {
 	void_fp macro_kernel_p = bli_gemm_ker_var1;
 
-	// Change the macro-kernel if the operation family is herk or trmm.
-	//if      ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2;
+	// Change the macro-kernel if the operation family is gemmt or trmm.
+	//if      ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2;
 	//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
 
 	// Create two nodes for the macro-kernel.
diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h
index ed522ee13..32ab3865e 100644
--- a/frame/3/gemmt/bli_gemmt.h
+++ b/frame/3/gemmt/bli_gemmt.h
@@ -34,3 +34,5 @@
 
 #include "bli_gemmt_front.h"
 
+#include "bli_gemmt_var.h"
+
diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c
index 84385bf17..9f18a717d 100644
--- a/frame/3/gemmt/bli_gemmt_front.c
+++ b/frame/3/gemmt/bli_gemmt_front.c
@@ -108,7 +108,7 @@ void bli_gemmt_front
 	bli_l3_thread_decorator
 	(
 	  bli_gemm_int,
-	  BLIS_HERK, // operation family id (gemmt uses 'herk' family)
+	  BLIS_GEMMT, // operation family id
 	  alpha,
 	  &a_local,
 	  &b_local,
diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_l_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_l_ker_var2.c
index 5a05672d7..a995e6c52 100644
--- a/frame/3/herk/bli_herk_l_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
 
 
-void bli_herk_l_ker_var2
+void bli_gemmt_l_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -464,11 +464,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -551,5 +551,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_u_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_u_ker_var2.c
index 9e685a944..3115fc67b 100644
--- a/frame/3/herk/bli_herk_u_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
 
 
-void bli_herk_u_ker_var2
+void bli_gemmt_u_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -490,11 +490,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
 			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
 				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
@@ -554,5 +554,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/gemmt/bli_gemmt_var.h
similarity index 90%
rename from frame/3/herk/bli_herk_var.h
rename to frame/3/gemmt/bli_gemmt_var.h
index 00b85fc5c..60c68c9f5 100644
--- a/frame/3/herk/bli_herk_var.h
+++ b/frame/3/gemmt/bli_gemmt_var.h
@@ -52,16 +52,10 @@ void PASTEMAC0(opname) \
        thrinfo_t* thread  \
      );
 
-//GENPROT( herk_blk_var1 )
-//GENPROT( herk_blk_var2 )
-//GENPROT( herk_blk_var3 )
+GENPROT( gemmt_x_ker_var2 )
 
-GENPROT( herk_x_ker_var2 )
-
-GENPROT( herk_l_ker_var2 )
-GENPROT( herk_u_ker_var2 )
-//GENPROT( herk_packa )
-//GENPROT( herk_packb )
+GENPROT( gemmt_l_ker_var2 )
+GENPROT( gemmt_u_ker_var2 )
 
 
 //
@@ -91,6 +85,6 @@ void PASTEMAC(ch,varname) \
        thrinfo_t* thread  \
      );
 
-INSERT_GENTPROT_BASIC0( herk_l_ker_var2 )
-INSERT_GENTPROT_BASIC0( herk_u_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
similarity index 97%
rename from frame/3/herk/bli_herk_x_ker_var2.c
rename to frame/3/gemmt/bli_gemmt_x_ker_var2.c
index b6769d719..6d24ea496 100644
--- a/frame/3/herk/bli_herk_x_ker_var2.c
+++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c
@@ -37,10 +37,10 @@
 
 static gemm_var_oft vars[2] =
 {
-	bli_herk_l_ker_var2, bli_herk_u_ker_var2,
+	bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
 };
 
-void bli_herk_x_ker_var2
+void bli_gemmt_x_ker_var2
      (
        obj_t*  a,
        obj_t*  ah,
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
similarity index 97%
rename from frame/3/herk/other/bli_herk_l_ker_var2.c
rename to frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
index 22439f5b2..0bf4b1a0f 100644
--- a/frame/3/herk/other/bli_herk_l_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
 
 
-void bli_herk_l_ker_var2
+void bli_gemmt_l_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
+			a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
 			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
+				b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
 				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 					b2 = b_cast; \
 			} \
@@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
 
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
similarity index 97%
rename from frame/3/herk/other/bli_herk_u_ker_var2.c
rename to frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
index 1aa3ce12d..1655bea55 100644
--- a/frame/3/herk/other/bli_herk_u_ker_var2.c
+++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c
@@ -35,7 +35,7 @@
 
 #include "blis.h"
 
-#define FUNCPTR_T herk_fp
+#define FUNCPTR_T gemmt_fp
 
 typedef void (*FUNCPTR_T)
      (
@@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T)
        thrinfo_t* thread
      );
 
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
+static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
 
 
-void bli_herk_u_ker_var2
+void bli_gemmt_u_ker_var2
      (
        obj_t*  a,
        obj_t*  b,
@@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \
 			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
+			a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
 			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
 			{ \
 				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
+				b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
 				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 					b2 = b_cast; \
 			} \
@@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \
 	} \
 }
 
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
 
diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h
deleted file mode 100644
index 02975c2b5..000000000
--- a/frame/3/her2k/bli_her2k.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_her2k_front.h"
-
diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c
deleted file mode 100644
index 459ab05c7..000000000
--- a/frame/3/her2k/bli_her2k_front.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_her2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t    alpha_conj;
-	obj_t    c_local;
-	obj_t    a_local;
-	obj_t    bh_local;
-	obj_t    b_local;
-	obj_t    ah_local;
-
-	// If alpha is zero, scale by beta, zero the imaginary components of
-	// the diagonal elements, and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		bli_setid( &BLIS_ZERO, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For her2k, the first and second right-hand "B" operands are simply B'
-	// and A'.
-	bli_obj_alias_to( b, &bh_local );
-	bli_obj_induce_trans( &bh_local );
-	bli_obj_toggle_conj( &bh_local );
-	bli_obj_alias_to( a, &ah_local );
-	bli_obj_induce_trans( &ah_local );
-	bli_obj_toggle_conj( &ah_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_swap( &a_local, &bh_local );
-		bli_obj_swap( &b_local, &ah_local );
-
-		bli_obj_induce_trans( &a_local );
-		bli_obj_induce_trans( &bh_local );
-		bli_obj_induce_trans( &b_local );
-		bli_obj_induce_trans( &ah_local );
-
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx );
-	bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx );
-
-	// Initialize a conjugated copy of alpha.
-	bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
-	                                      BLIS_CONJUGATE,
-	                                      alpha,
-	                                      &alpha_conj );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_HER2K,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke herk twice, using beta only the first time.
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &bh_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  &alpha_conj,
-	  &b_local,
-	  &ah_local,
-	  &BLIS_ONE,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	// The Hermitian rank-2k product was computed as A*B'+B*A', even for
-	// the diagonal elements. Mathematically, the imaginary components of
-	// diagonal elements of a Hermitian rank-2k product should always be
-	// zero. However, in practice, they sometimes accumulate meaningless
-	// non-zero values. To prevent this, we explicitly set those values
-	// to zero before returning.
-	bli_setid( &BLIS_ZERO, &c_local );
-}
-
diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h
deleted file mode 100644
index 0efdb86c2..000000000
--- a/frame/3/her2k/bli_her2k_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_her2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h
deleted file mode 100644
index c43728968..000000000
--- a/frame/3/herk/bli_herk.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_herk_front.h"
-
-#include "bli_herk_var.h"
-
diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c
deleted file mode 100644
index 324e18151..000000000
--- a/frame/3/herk/bli_herk_front.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_herk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   ah_local;
-	obj_t   c_local;
-
-	// If alpha is zero, scale by beta, zero the imaginary components of
-	// the diagonal elements, and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		bli_setid( &BLIS_ZERO, c );
-		return;
-	}
-
-	// Alias A and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For herk, the right-hand "B" operand is simply A'.
-	bli_obj_alias_to( a, &ah_local );
-	bli_obj_induce_trans( &ah_local );
-	bli_obj_toggle_conj( &ah_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_toggle_conj( &a_local );
-		bli_obj_toggle_conj( &ah_local );
-
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_HERK,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &ah_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	// The Hermitian rank-k product was computed as A*A', even for the
-	// diagonal elements. Mathematically, the imaginary components of
-	// diagonal elements of a Hermitian rank-k product should always be
-	// zero. However, in practice, they sometimes accumulate meaningless
-	// non-zero values. To prevent this, we explicitly set those values
-	// to zero before returning.
-	bli_setid( &BLIS_ZERO, &c_local );
-}
-
diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h
deleted file mode 100644
index 44778a450..000000000
--- a/frame/3/herk/bli_herk_front.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_herk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
deleted file mode 100644
index 8a99a2e24..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
-
-
-void bli_herk_l_ker_var2
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd and 1st loops. */ \
-	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
-
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c
deleted file mode 100644
index c78a36b29..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2rr.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr);
-
-//
-// -- Macrokernel functions for round-robin partitioning -----------------------
-//
-
-void bli_herk_l_ker_var2rr
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of C, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. Any remainder from this integer division is discarded, which
-		   is what we want. That is, we want the rectangular region to contain
-		   as many columns of whole microtiles as possible without including any
-		   microtiles that intersect the diagonal. The number of iterations in
-		   the triangular (or trapezoidal) region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffc / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial rectangular region of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and
-	   1st loops for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the triangular region
-	   by the number of iterations used for the rectangular region. */ \
-	jr_start += n_iter_rct; \
-	jr_end   += n_iter_rct; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr )
-
diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c
deleted file mode 100644
index 17e0b0d0e..000000000
--- a/frame/3/herk/other/bli_herk_l_ker_var2sl.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl);
-
-//
-// -- Macrokernel functions for slab partitioning ------------------------------
-//
-
-void bli_herk_l_ker_var2sl
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, ip; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely above the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region above where the diagonal of C intersects
-	   the left edge of the panel, adjust the pointer to C and A and treat
-	   this case as if the diagonal offset were zero. */ \
-	if ( diagoffc < 0 ) \
-	{ \
-		ip       = -diagoffc / MR; \
-		i        = ip * MR; \
-		m        = m - i; \
-		diagoffc = -diagoffc % MR; \
-		c_cast   = c_cast + (i  )*rs_c; \
-		a_cast   = a_cast + (ip )*ps_a; \
-	} \
-\
-	/* If there is a zero region to the right of where the diagonal
-	   of C intersects the bottom of the panel, shrink it to prevent
-	   "no-op" iterations from executing. */ \
-	if ( diagoffc + m < n ) \
-	{ \
-		n = diagoffc + m; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the rectangular
-	   part of C, and the triangular portion. */ \
-	dim_t n_iter_rct; \
-	dim_t n_iter_tri; \
-\
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the second set of
-		   loops. */ \
-		n_iter_rct = n_iter; \
-		n_iter_tri = 0; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the rectangular region by dividing NR into the diagonal
-		   offset. Any remainder from this integer division is discarded, which
-		   is what we want. That is, we want the rectangular region to contain
-		   as many columns of whole microtiles as possible without including any
-		   microtiles that intersect the diagonal. The number of iterations in
-		   the triangular (or trapezoidal) region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_rct = diagoffc / NR; \
-		n_iter_tri = n_iter - n_iter_rct; \
-	} \
-\
-	/* Use slab assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial rectangular region of C (if it exists). */ \
-	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no triangular region, then we're done. */ \
-	if ( n_iter_tri == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd
-	   loop and slab partitioning in the 1st loop for the remaining
-	   triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the triangular region
-	   by the number of iterations used for the rectangular region. */ \
-	jr_start += n_iter_rct; \
-	jr_end   += n_iter_rct; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly below the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly above the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
deleted file mode 100644
index 31d8fab62..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
-
-
-void bli_herk_u_ker_var2
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-	f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero. */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Use interleaved (round robin) assignment of micropanels to threads in
-	   the 2nd and 1st loops. */ \
-	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c
deleted file mode 100644
index 085ef6308..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2rr.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr);
-
-//
-// -- Macrokernel functions for round-robin partitioning -----------------------
-//
-
-void bli_herk_u_ker_var2rr
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-    f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero.
-	   NOTE: It's possible that after this pruning that the diagonal offset
-	   is still positive (though it is guaranteed to be less than NR). */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in C. A non-zero remainder means we need to
-		   add one additional iteration. That is, we want the triangular region
-		   to contain as few columns of whole microtiles as possible while still
-		   including all microtiles that intersect the diagonal. The number of
-		   iterations in the rectangular region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the initial triangular region of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
-	   loops for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr )
-
diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c
deleted file mode 100644
index abc6e5188..000000000
--- a/frame/3/herk/other/bli_herk_u_ker_var2sl.c
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define FUNCPTR_T herk_fp
-
-typedef void (*FUNCPTR_T)
-     (
-       doff_t  diagoffc,
-       pack_t  schema_a,
-       pack_t  schema_b,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       void*   alpha,
-       void*   a, inc_t cs_a, inc_t is_a,
-                  dim_t pd_a, inc_t ps_a,
-       void*   b, inc_t rs_b, inc_t is_b,
-                  dim_t pd_b, inc_t ps_b,
-       void*   beta,
-       void*   c, inc_t rs_c, inc_t cs_c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       thrinfo_t* thread
-     );
-
-static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl);
-
-//
-// -- Macrokernel functions for slab partitioning ------------------------------
-//
-
-void bli_herk_u_ker_var2sl
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl,
-       thrinfo_t* thread
-     )
-{
-	num_t     dt_exec   = bli_obj_exec_dt( c );
-
-	doff_t    diagoffc  = bli_obj_diag_offset( c );
-
-	pack_t    schema_a  = bli_obj_pack_schema( a );
-	pack_t    schema_b  = bli_obj_pack_schema( b );
-
-	dim_t     m         = bli_obj_length( c );
-	dim_t     n         = bli_obj_width( c );
-	dim_t     k         = bli_obj_width( a );
-
-	void*     buf_a     = bli_obj_buffer_at_off( a );
-	inc_t     cs_a      = bli_obj_col_stride( a );
-	inc_t     is_a      = bli_obj_imag_stride( a );
-	dim_t     pd_a      = bli_obj_panel_dim( a );
-	inc_t     ps_a      = bli_obj_panel_stride( a );
-
-	void*     buf_b     = bli_obj_buffer_at_off( b );
-	inc_t     rs_b      = bli_obj_row_stride( b );
-	inc_t     is_b      = bli_obj_imag_stride( b );
-	dim_t     pd_b      = bli_obj_panel_dim( b );
-	inc_t     ps_b      = bli_obj_panel_stride( b );
-
-	void*     buf_c     = bli_obj_buffer_at_off( c );
-	inc_t     rs_c      = bli_obj_row_stride( c );
-	inc_t     cs_c      = bli_obj_col_stride( c );
-
-	obj_t     scalar_a;
-	obj_t     scalar_b;
-
-	void*     buf_alpha;
-	void*     buf_beta;
-
-	FUNCPTR_T f;
-
-	// Detach and multiply the scalars attached to A and B.
-	bli_obj_scalar_detach( a, &scalar_a );
-	bli_obj_scalar_detach( b, &scalar_b );
-	bli_mulsc( &scalar_a, &scalar_b );
-
-	// Grab the addresses of the internal scalar buffers for the scalar
-	// merged above and the scalar attached to C.
-	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
-	buf_beta  = bli_obj_internal_scalar_buffer( c );
-
-	// Index into the type combination array to extract the correct
-	// function pointer.
-    f = ftypes[dt_exec];
-
-	// Invoke the function.
-	f( diagoffc,
-	   schema_a,
-	   schema_b,
-	   m,
-	   n,
-	   k,
-	   buf_alpha,
-	   buf_a, cs_a, is_a,
-	          pd_a, ps_a,
-	   buf_b, rs_b, is_b,
-	          pd_b, ps_b,
-	   buf_beta,
-	   buf_c, rs_c, cs_c,
-	   cntx,
-	   rntm,
-	   thread );
-}
-
-
-#undef  GENTFUNC
-#define GENTFUNC( ctype, ch, varname ) \
-\
-void PASTEMAC(ch,varname) \
-     ( \
-       doff_t  diagoffc, \
-       pack_t  schema_a, \
-       pack_t  schema_b, \
-       dim_t   m, \
-       dim_t   n, \
-       dim_t   k, \
-       void*   alpha, \
-       void*   a, inc_t cs_a, inc_t is_a, \
-                  dim_t pd_a, inc_t ps_a, \
-       void*   b, inc_t rs_b, inc_t is_b, \
-                  dim_t pd_b, inc_t ps_b, \
-       void*   beta, \
-       void*   c, inc_t rs_c, inc_t cs_c, \
-       cntx_t* cntx, \
-       rntm_t* rntm, \
-       thrinfo_t* thread  \
-     ) \
-{ \
-	const num_t     dt         = PASTEMAC(ch,type); \
-\
-	/* Alias some constants to simpler names. */ \
-	const dim_t     MR         = pd_a; \
-	const dim_t     NR         = pd_b; \
-	/*const dim_t     PACKMR     = cs_a;*/ \
-	/*const dim_t     PACKNR     = rs_b;*/ \
-\
-	/* Query the context for the micro-kernel address and cast it to its
-	   function pointer type. */ \
-	PASTECH(ch,gemm_ukr_ft) \
-	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
-\
-	/* Temporary C buffer for edge cases. Note that the strides of this
-	   temporary buffer are set so that they match the storage of the
-	   original C matrix. For example, if C is column-stored, ct will be
-	   column-stored as well. */ \
-	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
-	                    / sizeof( ctype ) ] \
-	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
-	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
-	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
-\
-	ctype* restrict zero       = PASTEMAC(ch,0); \
-	ctype* restrict a_cast     = a; \
-	ctype* restrict b_cast     = b; \
-	ctype* restrict c_cast     = c; \
-	ctype* restrict alpha_cast = alpha; \
-	ctype* restrict beta_cast  = beta; \
-	ctype* restrict b1; \
-	ctype* restrict c1; \
-\
-	doff_t          diagoffc_ij; \
-	dim_t           m_iter, m_left; \
-	dim_t           n_iter, n_left; \
-	dim_t           m_cur; \
-	dim_t           n_cur; \
-	dim_t           i, j, jp; \
-	inc_t           rstep_a; \
-	inc_t           cstep_b; \
-	inc_t           rstep_c, cstep_c; \
-	auxinfo_t       aux; \
-\
-	/*
-	   Assumptions/assertions:
-	     rs_a == 1
-	     cs_a == PACKMR
-	     pd_a == MR
-	     ps_a == stride to next micro-panel of A
-	     rs_b == PACKNR
-	     cs_b == 1
-	     pd_b == NR
-	     ps_b == stride to next micro-panel of B
-	     rs_c == (no assumptions)
-	     cs_c == (no assumptions)
-	*/ \
-\
-	/* If any dimension is zero, return immediately. */ \
-	if ( bli_zero_dim3( m, n, k ) ) return; \
-\
-	/* Safeguard: If the current panel of C is entirely below the diagonal,
-	   it is not stored. So we do nothing. */ \
-	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
-\
-	/* If there is a zero region to the left of where the diagonal of C
-	   intersects the top edge of the panel, adjust the pointer to C and B
-	   and treat this case as if the diagonal offset were zero.
-	   NOTE: It's possible that after this pruning that the diagonal offset
-	   is still positive (though it is guaranteed to be less than NR). */ \
-	if ( diagoffc > 0 ) \
-	{ \
-		jp       = diagoffc / NR; \
-		j        = jp * NR; \
-		n        = n - j; \
-		diagoffc = diagoffc % NR; \
-		c_cast   = c_cast + (j  )*cs_c; \
-		b_cast   = b_cast + (jp )*ps_b; \
-	} \
-\
-	/* If there is a zero region below where the diagonal of C intersects
-	   the right edge of the panel, shrink it to prevent "no-op" iterations
-	   from executing. */ \
-	if ( -diagoffc + n < m ) \
-	{ \
-		m = -diagoffc + n; \
-	} \
-\
-	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
-	PASTEMAC(ch,set0s_mxn)( MR, NR, \
-	                        ct, rs_ct, cs_ct ); \
-\
-	/* Compute number of primary and leftover components of the m and n
-	   dimensions. */ \
-	n_iter = n / NR; \
-	n_left = n % NR; \
-\
-	m_iter = m / MR; \
-	m_left = m % MR; \
-\
-	if ( n_left ) ++n_iter; \
-	if ( m_left ) ++m_iter; \
-\
-	/* Determine some increments used to step through A, B, and C. */ \
-	rstep_a = ps_a; \
-\
-	cstep_b = ps_b; \
-\
-	rstep_c = rs_c * MR; \
-	cstep_c = cs_c * NR; \
-\
-	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_schema_a( schema_a, &aux ); \
-	bli_auxinfo_set_schema_b( schema_b, &aux ); \
-\
-	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
-	bli_auxinfo_set_is_a( is_a, &aux ); \
-	bli_auxinfo_set_is_b( is_b, &aux ); \
-\
-	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
-	   loop around the microkernel. Here we query the thrinfo_t node for the
-	   1st (ir) loop around the microkernel. */ \
-	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
-\
-	/* Query the number of threads and thread ids for each loop. */ \
-	dim_t jr_nt  = bli_thread_n_way( thread ); \
-	dim_t jr_tid = bli_thread_work_id( thread ); \
-	dim_t ir_nt  = bli_thread_n_way( caucus ); \
-	dim_t ir_tid = bli_thread_work_id( caucus ); \
-\
-	dim_t jr_start, jr_end; \
-	dim_t ir_start, ir_end; \
-	dim_t jr_inc,   ir_inc; \
-\
-	/* Note that we partition the 2nd loop into two regions: the triangular
-	   part of C, and the rectangular portion. */ \
-	dim_t n_iter_tri; \
-	dim_t n_iter_rct; \
-\
-	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
-	{ \
-		/* If the entire panel of C does not intersect the diagonal, there is
-		   no triangular region, and therefore we can skip the first set of
-		   loops. */ \
-		n_iter_tri = 0; \
-		n_iter_rct = n_iter; \
-	} \
-	else \
-	{ \
-		/* If the panel of C does intersect the diagonal, compute the number of
-		   iterations in the triangular (or trapezoidal) region by dividing NR
-		   into the number of rows in C. A non-zero remainder means we need to
-		   add one additional iteration. That is, we want the triangular region
-		   to contain as few columns of whole microtiles as possible while still
-		   including all microtiles that intersect the diagonal. The number of
-		   iterations in the rectangular region is computed as the remaining
-		   number of iterations in the n dimension. */ \
-		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
-		n_iter_rct = n_iter - n_iter_tri; \
-	} \
-\
-	/* Use round-robin assignment of micropanels to threads in the 2nd loop
-	   and slab partitioning in the 1st loop for the initial triangular region
-	   of C (if it exists). */ \
-	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir_sl( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* Compute the diagonal offset for the submatrix at (i,j). */ \
-			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Invoke the gemm micro-kernel. */ \
-				gemm_ukr \
-				( \
-				  k, \
-				  alpha_cast, \
-				  a1, \
-				  b1, \
-				  zero, \
-				  ct, rs_ct, cs_ct, \
-				  &aux, \
-				  cntx  \
-				); \
-\
-				/* Scale C and add the result to only the stored part. */ \
-				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
-				                          m_cur, n_cur, \
-				                          ct,  rs_ct, cs_ct, \
-				                          beta_cast, \
-				                          c11, rs_c,  cs_c ); \
-			} \
-			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-\
-	/* If there is no rectangular region, then we're done. */ \
-	if ( n_iter_rct == 0 ) return; \
-\
-	/* Use slab assignment of micropanels to threads in the 2nd and 1st loops
-	   loop for the remaining triangular region of C. */ \
-	bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-\
-	/* Advance the start and end iteration offsets for the rectangular region
-	   by the number of iterations used for the triangular region. */ \
-	jr_start += n_iter_tri; \
-	jr_end   += n_iter_tri; \
-\
-	/* Loop over the n dimension (NR columns at a time). */ \
-	for ( j = jr_start; j < jr_end; j += jr_inc ) \
-	{ \
-		ctype* restrict a1; \
-		ctype* restrict c11; \
-		ctype* restrict b2; \
-\
-		b1 = b_cast + j * cstep_b; \
-		c1 = c_cast + j * cstep_c; \
-\
-		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
-\
-		/* Initialize our next panel of B to be the current panel of B. */ \
-		b2 = b1; \
-\
-		/* Interior loop over the m dimension (MR rows at a time). */ \
-		for ( i = ir_start; i < ir_end; i += ir_inc ) \
-		{ \
-			ctype* restrict a2; \
-\
-			a1  = a_cast + i * rstep_a; \
-			c11 = c1     + i * rstep_c; \
-\
-			/* No need to compute the diagonal offset for the rectangular
-			   region. */ \
-			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
-\
-			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
-			/* Compute the addresses of the next panels of A and B. */ \
-			a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
-			{ \
-				a2 = a_cast; \
-				b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
-					b2 = b_cast; \
-			} \
-\
-			/* Save addresses of next panels of A and B to the auxinfo_t
-			   object. */ \
-			bli_auxinfo_set_next_a( a2, &aux ); \
-			bli_auxinfo_set_next_b( b2, &aux ); \
-\
-			/* If the diagonal intersects the current MR x NR submatrix, we
-			   compute it the temporary buffer and then add in the elements
-			   on or below the diagonal.
-			   Otherwise, if the submatrix is strictly above the diagonal,
-			   we compute and store as we normally would.
-			   And if we're strictly below the diagonal, we do nothing and
-			   continue. */ \
-			{ \
-				/* Handle interior and edge cases separately. */ \
-				if ( m_cur == MR && n_cur == NR ) \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  beta_cast, \
-					  c11, rs_c, cs_c, \
-					  &aux, \
-					  cntx  \
-					); \
-				} \
-				else \
-				{ \
-					/* Invoke the gemm micro-kernel. */ \
-					gemm_ukr \
-					( \
-					  k, \
-					  alpha_cast, \
-					  a1, \
-					  b1, \
-					  zero, \
-					  ct, rs_ct, cs_ct, \
-					  &aux, \
-					  cntx  \
-					); \
-\
-					/* Scale the edge of C and add the result. */ \
-					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
-					                        ct,  rs_ct, cs_ct, \
-					                        beta_cast, \
-					                        c11, rs_c,  cs_c ); \
-				} \
-			} \
-		} \
-	} \
-}
-
-INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl )
-
diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h
deleted file mode 100644
index 680e6e399..000000000
--- a/frame/3/syr2k/bli_syr2k.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_syr2k_front.h"
-
diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c
deleted file mode 100644
index 4f30cc3d5..000000000
--- a/frame/3/syr2k/bli_syr2k_front.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_syr2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t    c_local;
-	obj_t    a_local;
-	obj_t    bt_local;
-	obj_t    b_local;
-	obj_t    at_local;
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// Alias A, B, and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( b, &b_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For syr2k, the first and second right-hand "B" operands are simply B'
-	// and A'.
-	bli_obj_alias_to( b, &bt_local );
-	bli_obj_induce_trans( &bt_local );
-	bli_obj_alias_to( a, &at_local );
-	bli_obj_induce_trans( &at_local );
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx );
-	bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_SYR2K,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke herk twice, using beta only the first time.
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &bt_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &b_local,
-	  &at_local,
-	  &BLIS_ONE,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-}
-
diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h
deleted file mode 100644
index 767bb6ee1..000000000
--- a/frame/3/syr2k/bli_syr2k_front.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_syr2k_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h
deleted file mode 100644
index 4936fe431..000000000
--- a/frame/3/syrk/bli_syrk.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "bli_syrk_front.h"
-
diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c
deleted file mode 100644
index 819941426..000000000
--- a/frame/3/syrk/bli_syrk_front.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-void bli_syrk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     )
-{
-	bli_init_once();
-
-	obj_t   a_local;
-	obj_t   at_local;
-	obj_t   c_local;
-
-	// Alias A and C in case we need to apply transformations.
-	bli_obj_alias_to( a, &a_local );
-	bli_obj_alias_to( c, &c_local );
-	bli_obj_set_as_root( &c_local );
-
-	// For syrk, the right-hand "B" operand is simply A^T.
-	bli_obj_alias_to( a, &at_local );
-	bli_obj_induce_trans( &at_local );
-
-#if 0
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-	gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local,
-	                                cntx, cntl );
-	if ( status == BLIS_SUCCESS ) return;
-#endif
-#endif
-
-	// If alpha is zero, scale by beta and return.
-	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
-	{
-		bli_scalm( beta, c );
-		return;
-	}
-
-	// An optimization: If C is stored by rows and the micro-kernel prefers
-	// contiguous columns, or if C is stored by columns and the micro-kernel
-	// prefers contiguous rows, transpose the entire operation to allow the
-	// micro-kernel to access elements of C in its preferred manner.
-	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
-	{
-		bli_obj_induce_trans( &c_local );
-	}
-
-	// Set the pack schemas within the objects.
-	bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx );
-
-	// Parse and interpret the contents of the rntm_t object to properly
-	// set the ways of parallelism for each loop, and then make any
-	// additional modifications necessary for the current operation.
-	bli_rntm_set_ways_for_op
-	(
-	  BLIS_SYRK,
-	  BLIS_LEFT, // ignored for her[2]k/syr[2]k
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
-	  rntm
-	);
-
-	// Invoke the internal back-end.
-	bli_l3_thread_decorator
-	(
-	  bli_gemm_int,
-	  BLIS_HERK, // operation family id
-	  alpha,
-	  &a_local,
-	  &at_local,
-	  beta,
-	  &c_local,
-	  cntx,
-	  rntm,
-	  cntl
-	);
-}
-
diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h
deleted file mode 100644
index bf8d26a52..000000000
--- a/frame/3/syrk/bli_syrk_front.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_syrk_front
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       rntm_t* rntm,
-       cntl_t* cntl
-     );
-
-#ifdef BLIS_ENABLE_SMALL_MATRIX
-err_t bli_syrk_small
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
-#endif
-
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index 78d139e6b..e76314036 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -819,22 +819,26 @@ err_t bli_check_if_exhausted_pool( pool_t* pool )
 	return e_val;
 }
 
-err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx )
+err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
+	num_t dt;
 
-	dim_t mr      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
-	dim_t nr      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
-	siz_t dt_size = bli_dt_size( dt );
+	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	{
+		dim_t mr      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
+		dim_t nr      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
+		siz_t dt_size = bli_dt_size( dt );
 
-	// NOTE: For induced methods, we use the size of the complex datatypes
-	// (rather than the size of the native micro-kernels' datatype) because
-	// the macro-kernel needs this larger micro-tile footprint, even if the
-	// virtual micro-kernel implementation will only ever be writing to half
-	// of it (real or imaginary part) at a time.
+		// NOTE: For induced methods, we use the size of the complex datatypes
+		// (rather than the size of the native micro-kernels' datatype) because
+		// the macro-kernel needs this larger micro-tile footprint, even if the
+		// virtual micro-kernel implementation will only ever be writing to half
+		// of it (real or imaginary part) at a time.
 
-	if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE )
-		e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE;
+		if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE )
+			e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE;
+	}
 
 	return e_val;
 }
diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h
index 70ec2fd8f..276d27689 100644
--- a/frame/base/bli_check.h
+++ b/frame/base/bli_check.h
@@ -103,7 +103,7 @@ err_t bli_check_valid_malloc_buf( void* ptr );
 
 err_t bli_check_valid_packbuf( packbuf_t buf_type );
 err_t bli_check_if_exhausted_pool( pool_t* pool );
-err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx );
+err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
 err_t bli_check_alignment_is_power_of_two( size_t align_size );
 err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
 
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index c250191fc..0a5bcafd4 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -449,6 +449,11 @@ void bli_gks_register_cntx
 	e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val );
 	e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val );
 #endif
+
+	// Verify that the maximum stack buffer size defined at configure-time is
+	// sufficiently large to hold an MR x NR micro-tile for every datatype.
+	e_val = bli_check_sufficient_stack_buf_size( gks_id_nat );
+	bli_check_error_code( e_val );
 }
 
 // -----------------------------------------------------------------------------
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c
index fa7901583..8a3dcd30a 100644
--- a/frame/base/bli_info.c
+++ b/frame/base/bli_info.c
@@ -180,12 +180,13 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
 // -- BLIS implementation query (level-3) --------------------------------------
 
 char* bli_info_get_gemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM,  dt ); }
+char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_hemm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM,  dt ); }
-char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_HERK,  dt ); }
-char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HER2K, dt ); }
+char* bli_info_get_herk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_symm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM,  dt ); }
-char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_SYRK,  dt ); }
-char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYR2K, dt ); }
+char* bli_info_get_syrk_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
+char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); }
 char* bli_info_get_trmm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM,  dt ); }
 char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); }
 char* bli_info_get_trsm_impl_string( num_t dt )  { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM,  dt ); }
diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h
index d900ca4f5..99c7d000d 100644
--- a/frame/base/bli_info.h
+++ b/frame/base/bli_info.h
@@ -91,6 +91,7 @@ BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t
 // -- BLIS implementation query (level-3) --------------------------------------
 
 BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
+BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
 BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c
index da7643eb6..95587e4a7 100644
--- a/frame/base/bli_part.c
+++ b/frame/base/bli_part.c
@@ -266,7 +266,7 @@ void bli_acquire_mpart_mdim
 	// diagonal, then set the subpartition structure to "general"; otherwise
 	// we let the subpartition inherit the storage structure of its immediate
 	// parent.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	      bli_obj_is_outside_diag( sub_obj ) )
 	{
 		// NOTE: This comment may be out-of-date since we now distinguish
@@ -274,10 +274,10 @@ void bli_acquire_mpart_mdim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
@@ -489,7 +489,7 @@ void bli_acquire_mpart_ndim
 	// diagonal), and the subpartition does not intersect the root matrix's
 	// diagonal, then we might need to modify some of the subpartition's
 	// properties, depending on its structure type.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	      bli_obj_is_outside_diag( sub_obj ) )
 	{
 		// NOTE: This comment may be out-of-date since we now distinguish
@@ -497,10 +497,10 @@ void bli_acquire_mpart_ndim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
@@ -742,7 +742,7 @@ void bli_acquire_mpart_mndim
 	// diagonal, then set the subpartition structure to "general"; otherwise
 	// we let the subpartition inherit the storage structure of its immediate
 	// parent.
-	if ( !bli_obj_root_is_general( sub_obj ) && 
+	if ( !bli_obj_root_is_general( sub_obj ) &&
 	     req_part != BLIS_SUBPART00 &&
 	     req_part != BLIS_SUBPART11 &&
 	     req_part != BLIS_SUBPART22 )
@@ -762,10 +762,10 @@ void bli_acquire_mpart_mndim
 		// Note that we cannot mark the subpartition object as general/dense
 		// here since it makes sense to preserve the existing uplo information
 		// a while longer so that the correct kernels are invoked. (Example:
-		// incremental packing/computing in herk produces subpartitions that
+		// incremental packing/computing in gemmt produces subpartitions that
 		// appear general/dense, but their uplo fields are needed to be either
 		// lower or upper, to determine which macro-kernel gets called in the
-		// herk_int() back-end.)
+		// gemmt_int() back-end.)
 
 		// If the subpartition lies entirely in an "unstored" triangle of the
 		// root matrix, then we need to tweak the subpartition. If the root
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index 9ebd47de1..6dc4f9141 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -678,7 +678,7 @@ siz_t bli_thread_range_mdim
 	// structured matrix, even though they represent part of that matrix
 	// that will be dense and full (after packing).
 	if      ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; }
-	else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE;  }
+	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
 	else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE;  }
 	else    /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; }
 
@@ -737,7 +737,7 @@ siz_t bli_thread_range_ndim
 	// structured matrix, even though they represent part of that matrix
 	// that will be dense and full (after packing).
 	if      ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; }
-	else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE;  }
+	else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE;  }
 	else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE;  }
 	else    /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; }
 
diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_gemmt_small.c
similarity index 99%
rename from kernels/zen/3/bli_syrk_small.c
rename to kernels/zen/3/bli_gemmt_small.c
index 23d47298c..f2fd88de7 100644
--- a/kernels/zen/3/bli_syrk_small.c
+++ b/kernels/zen/3/bli_gemmt_small.c
@@ -52,9 +52,9 @@ static float C_pack[F_SCRATCH_DIM]  __attribute__((aligned(64)));
 #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)
 static double D_A_pack[D_SCRATCH_DIM]  __attribute__((aligned(64)));
 static double D_C_pack[D_SCRATCH_DIM]  __attribute__((aligned(64)));
-#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. 
-#define AT_MR 4 // The kernel dimension of the A transpose SYRK kernel.(AT_MR * NR).
-static err_t bli_ssyrk_small
+#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called.
+#define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel (AT_MR * NR).
+static err_t bli_sgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -65,7 +65,7 @@ static err_t bli_ssyrk_small
        cntl_t* cntl
      );
 
-static err_t bli_dsyrk_small
+static err_t bli_dgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -76,7 +76,7 @@ static err_t bli_dsyrk_small
        cntl_t* cntl
      );
 
-static err_t bli_ssyrk_small_atbn
+static err_t bli_sgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -87,7 +87,7 @@ static err_t bli_ssyrk_small_atbn
        cntl_t* cntl
      );
 
-static err_t bli_dsyrk_small_atbn
+static err_t bli_dgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -98,11 +98,11 @@ static err_t bli_dsyrk_small_atbn
        cntl_t* cntl
      );
 /*
-* The bli_syrk_small function will use the
+* The bli_gemmt_small function will use the
 * custom MRxNR kernels, to perform the computation.
 * The custom kernels are used if the [M * N] < 240 * 240
 */
-err_t bli_syrk_small
+err_t bli_gemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -113,20 +113,20 @@ err_t bli_syrk_small
        cntl_t* cntl
      )
 {
-	// FGVZ: This code was originally in bli_syrk_front(). However, it really
-	// fits more naturally here within the bli_syrk_small() function. This
+	// FGVZ: This code was originally in bli_gemmt_front(). However, it really
+	// fits more naturally here within the bli_gemmt_small() function. This
 	// becomes a bit more obvious now that the code is here, as it contains
-	// cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_SYRK, which are specific
+	// cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific
 	// to this implementation.
 	if ( bli_obj_has_trans( a ) )
 	{
 		// Continue with small implementation.
 		;
 	}
-	else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK &&
-	            bli_obj_width( a )  <  BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) ||
-	          ( bli_obj_length( a ) <  BLIS_SMALL_MATRIX_A_THRES_M_SYRK &&
-	            bli_obj_width( a )  <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) )
+	else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT &&
+	            bli_obj_width( a )  <  BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ||
+	          ( bli_obj_length( a ) <  BLIS_SMALL_MATRIX_A_THRES_M_GEMMT &&
+	            bli_obj_width( a )  <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) )
 	{
 		// Continue with small implementation.
 		;
@@ -162,11 +162,11 @@ err_t bli_syrk_small
         {
             if (dt == BLIS_FLOAT)
             {
-                return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl);
+                return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl);
             }
             else if (dt == BLIS_DOUBLE)
             {
-                return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl);
+                return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl);
             }
         }
 
@@ -175,19 +175,19 @@ err_t bli_syrk_small
 
     if (dt == BLIS_DOUBLE)
     {
-        return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl);
+        return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl);
     }
 
     if (dt == BLIS_FLOAT)
     {
-        return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl);
+        return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl);
     }
 
     return BLIS_NOT_YET_IMPLEMENTED;
 };
 
 
-static err_t bli_ssyrk_small
+static err_t bli_sgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -240,7 +240,7 @@ static err_t bli_ssyrk_small
         beta_cast = (beta->buffer);
         int required_packing_A = 1;
 
-        // when N is equal to 1 call GEMV instead of SYRK
+        // when N is equal to 1 call GEMV instead of GEMMT
         if (N == 1)
         {
             bli_gemv
@@ -1584,7 +1584,7 @@ static err_t bli_ssyrk_small
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C using SIMD
         if ( bli_seq0( *beta_cast ) )
         {//just copy in case of beta = 0
@@ -1673,7 +1673,7 @@ static err_t bli_ssyrk_small
                 _i = 0;
                 for ( _l = 0; _l < k; _l++ )
                 {
-                    ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC));     
+                    ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC));
                     ymm0 = _mm256_loadu_ps((C + _i*rsc));
                     ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                     _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0);
@@ -1703,11 +1703,11 @@ static err_t bli_ssyrk_small
                     _l = 0;
                     while ( _l < k )
                     {
-                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                         _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
-                        
+
                         _i += 8;
                         _l++;
                     }
@@ -1729,8 +1729,8 @@ static err_t bli_ssyrk_small
                     _i = 0;
                     _l = 0;
                     while ( _l < k )
-                    {                                   
-                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                    {
+                        ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0);
                         _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -1747,7 +1747,7 @@ static err_t bli_ssyrk_small
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
@@ -1756,7 +1756,7 @@ static err_t bli_ssyrk_small
 
 };
 
-static err_t bli_dsyrk_small
+static err_t bli_dgemmt_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -1810,7 +1810,7 @@ static err_t bli_dsyrk_small
         beta_cast = (beta->buffer);
         int required_packing_A = 1;
 
-        // when N is equal to 1 call GEMV instead of SYRK
+        // when N is equal to 1 call GEMV instead of GEMMT
         if (N == 1)
         {
             bli_gemv
@@ -3154,7 +3154,7 @@ static err_t bli_dsyrk_small
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C using SIMD
         if ( bli_seq0( *beta_cast ) )
         {//just copy for beta = 0
@@ -3195,7 +3195,7 @@ static err_t bli_dsyrk_small
                     {
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
-                        
+
                         _i += 4;
                         _l++;
                     }
@@ -3243,7 +3243,7 @@ static err_t bli_dsyrk_small
                 _i = 0;
                 for ( _l = 0; _l < k; _l++ )
                 {
-                    ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC));     
+                    ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC));
                     ymm0 = _mm256_loadu_pd((C + _i*rsc));
                     ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                     _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0);
@@ -3273,7 +3273,7 @@ static err_t bli_dsyrk_small
                     _l = 0;
                     while ( _l < k )
                     {
-                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -3299,8 +3299,8 @@ static err_t bli_dsyrk_small
                     _i = 0;
                     _l = 0;
                     while ( _l < k )
-                    {                                   
-                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));       
+                    {
+                        ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC));
                         ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc));
                         ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0);
                         _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0);
@@ -3317,7 +3317,7 @@ static err_t bli_dsyrk_small
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
@@ -3326,7 +3326,7 @@ static err_t bli_dsyrk_small
 
 };
 
-static err_t bli_ssyrk_small_atbn
+static err_t bli_sgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -3364,7 +3364,7 @@ static err_t bli_ssyrk_small_atbn
     alpha_cast = (alpha->buffer);
     beta_cast = (beta->buffer);
 
-    // The non-copy version of the A^T SYRK gives better performance for the small M cases.
+    // The non-copy version of the A^T GEMMT gives better performance for the small M cases.
     // The threshold is controlled by BLIS_ATBN_M_THRES
     if (M <= BLIS_ATBN_M_THRES)
     {
@@ -3715,7 +3715,7 @@ static err_t bli_ssyrk_small_atbn
                 }
             }
         }
-        
+
         //copy/compute sryk values back to C
         if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C
         {
@@ -3774,7 +3774,7 @@ static err_t bli_ssyrk_small_atbn
         return BLIS_NONCONFORMAL_DIMENSIONS;
 }
 
-static err_t bli_dsyrk_small_atbn
+static err_t bli_dgemmt_small_atbn
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -3812,7 +3812,7 @@ static err_t bli_dsyrk_small_atbn
     alpha_cast = (alpha->buffer);
     beta_cast = (beta->buffer);
 
-    // The non-copy version of the A^T SYRK gives better performance for the small M cases.
+    // The non-copy version of the A^T GEMMT gives better performance for the small M cases.
     // The threshold is controlled by BLIS_ATBN_M_THRES
     if (M <= BLIS_ATBN_M_THRES)
     {
@@ -3968,7 +3968,7 @@ static err_t bli_dsyrk_small_atbn
                 result *= (*alpha_cast);
                 tC[3] = result/* + tC[3] * (*beta_cast)*/;
 
-      
+
                 tC += ldc;
                 ymm6 = _mm256_hadd_pd(ymm6, ymm6);
                 _mm256_storeu_pd(scratch, ymm6);
@@ -4199,7 +4199,7 @@ static err_t bli_dsyrk_small_atbn
                 }
             }
         }
-        
+
         return BLIS_SUCCESS;
     }
     else
diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c
index bd6c2647e..369017338 100644
--- a/sandbox/gemmlike/bls_gemm_check.c
+++ b/sandbox/gemmlike/bls_gemm_check.c
@@ -99,11 +99,6 @@ void bls_gemm_check
 	e_val = bli_check_object_buffer( c );
 	bli_check_error_code( e_val );
 
-	// Check for sufficiently sized stack buffers
-
-	e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx );
-	bli_check_error_code( e_val );
-
 	// Check object dimensions.
 
 	e_val = bli_check_level3_dims( a, b, c );